Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_dir.c | 1
-rw-r--r--  fs/9p/vfs_inode.c | 3
-rw-r--r--  fs/Kconfig | 165
-rw-r--r--  fs/Kconfig.binfmt | 6
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/adfs/dir.c | 1
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/dir.c | 1
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/autofs/inode.c | 2
-rw-r--r--  fs/autofs4/inode.c | 2
-rw-r--r--  fs/autofs4/root.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/bfs/dir.c | 2
-rw-r--r--  fs/binfmt_flat.c | 4
-rw-r--r--  fs/binfmt_misc.c | 4
-rw-r--r--  fs/bio-integrity.c | 29
-rw-r--r--  fs/bio.c | 307
-rw-r--r--  fs/block_dev.c | 182
-rw-r--r--  fs/buffer.c | 13
-rw-r--r--  fs/cifs/CHANGES | 10
-rw-r--r--  fs/cifs/README | 44
-rw-r--r--  fs/cifs/asn1.c | 11
-rw-r--r--  fs/cifs/cifs_spnego.c | 39
-rw-r--r--  fs/cifs/cifs_spnego.h | 2
-rw-r--r--  fs/cifs/cifsencrypt.c | 1
-rw-r--r--  fs/cifs/cifsfs.c | 2
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 4
-rw-r--r--  fs/cifs/cifsproto.h | 4
-rw-r--r--  fs/cifs/cifssmb.c | 84
-rw-r--r--  fs/cifs/connect.c | 33
-rw-r--r--  fs/cifs/dns_resolve.c | 77
-rw-r--r--  fs/cifs/file.c | 134
-rw-r--r--  fs/cifs/inode.c | 648
-rw-r--r--  fs/cifs/misc.c | 8
-rw-r--r--  fs/cifs/readdir.c | 128
-rw-r--r--  fs/cifs/sess.c | 17
-rw-r--r--  fs/cifs/transport.c | 3
-rw-r--r--  fs/compat.c | 8
-rw-r--r--  fs/configfs/dir.c | 17
-rw-r--r--  fs/cramfs/inode.c | 84
-rw-r--r--  fs/dcache.c | 12
-rw-r--r--  fs/debugfs/inode.c | 3
-rw-r--r--  fs/devpts/inode.c | 68
-rw-r--r--  fs/dlm/config.c | 240
-rw-r--r--  fs/dlm/dlm_internal.h | 7
-rw-r--r--  fs/dlm/lockspace.c | 158
-rw-r--r--  fs/dlm/lockspace.h | 1
-rw-r--r--  fs/dlm/user.c | 134
-rw-r--r--  fs/dlm/user.h | 4
-rw-r--r--  fs/dquot.c | 6
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/efs/namei.c | 3
-rw-r--r--  fs/efs/super.c | 2
-rw-r--r--  fs/eventpoll.c | 5
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/inode.c | 8
-rw-r--r--  fs/ext2/super.c | 2
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/inode.c | 8
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 10
-rw-r--r--  fs/ext4/acl.h | 12
-rw-r--r--  fs/ext4/balloc.c | 1458
-rw-r--r--  fs/ext4/bitmap.c | 6
-rw-r--r--  fs/ext4/dir.c | 84
-rw-r--r--  fs/ext4/ext4.h | 137
-rw-r--r--  fs/ext4/ext4_extents.h | 19
-rw-r--r--  fs/ext4/ext4_i.h | 39
-rw-r--r--  fs/ext4/ext4_jbd2.h | 8
-rw-r--r--  fs/ext4/ext4_sb.h | 25
-rw-r--r--  fs/ext4/extents.c | 392
-rw-r--r--  fs/ext4/file.c | 10
-rw-r--r--  fs/ext4/fsync.c | 7
-rw-r--r--  fs/ext4/hash.c | 8
-rw-r--r--  fs/ext4/ialloc.c | 73
-rw-r--r--  fs/ext4/inode.c | 1064
-rw-r--r--  fs/ext4/ioctl.c | 96
-rw-r--r--  fs/ext4/mballoc.c | 273
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/migrate.c | 13
-rw-r--r--  fs/ext4/namei.c | 402
-rw-r--r--  fs/ext4/resize.c | 36
-rw-r--r--  fs/ext4/super.c | 318
-rw-r--r--  fs/ext4/symlink.c | 8
-rw-r--r--  fs/ext4/xattr.c | 14
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/fat/fatent.c | 14
-rw-r--r--  fs/fat/inode.c | 16
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/glock.c | 15
-rw-r--r--  fs/gfs2/glock.h | 1
-rw-r--r--  fs/gfs2/incore.h | 38
-rw-r--r--  fs/gfs2/inode.c | 159
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 3
-rw-r--r--  fs/gfs2/log.c | 21
-rw-r--r--  fs/gfs2/mount.c | 9
-rw-r--r--  fs/gfs2/ops_address.c | 18
-rw-r--r--  fs/gfs2/ops_file.c | 16
-rw-r--r--  fs/gfs2/ops_fstype.c | 578
-rw-r--r--  fs/gfs2/ops_inode.c | 127
-rw-r--r--  fs/gfs2/ops_super.c | 108
-rw-r--r--  fs/gfs2/super.c | 340
-rw-r--r--  fs/gfs2/super.h | 6
-rw-r--r--  fs/gfs2/sys.c | 11
-rw-r--r--  fs/hfs/super.c | 2
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 1
-rw-r--r--  fs/inotify_user.c | 27
-rw-r--r--  fs/ioctl.c | 277
-rw-r--r--  fs/ioprio.c | 8
-rw-r--r--  fs/isofs/inode.c | 2
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/checkpoint.c | 71
-rw-r--r--  fs/jbd2/commit.c | 32
-rw-r--r--  fs/jbd2/journal.c | 103
-rw-r--r--  fs/jbd2/recovery.c | 7
-rw-r--r--  fs/jbd2/transaction.c | 4
-rw-r--r--  fs/jffs2/jffs2_fs_i.h | 1
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/lockd/Makefile | 2
-rw-r--r--  fs/lockd/clntlock.c | 13
-rw-r--r--  fs/lockd/grace.c | 59
-rw-r--r--  fs/lockd/host.c | 350
-rw-r--r--  fs/lockd/mon.c | 2
-rw-r--r--  fs/lockd/svc.c | 88
-rw-r--r--  fs/lockd/svc4proc.c | 35
-rw-r--r--  fs/lockd/svclock.c | 18
-rw-r--r--  fs/lockd/svcproc.c | 35
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/lockd/xdr.c | 2
-rw-r--r--  fs/lockd/xdr4.c | 2
-rw-r--r--  fs/nfs/callback.c | 3
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/super.c | 12
-rw-r--r--  fs/nfsd/export.c | 6
-rw-r--r--  fs/nfsd/lockd.c | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs4acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 7
-rw-r--r--  fs/nfsd/nfs4proc.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 34
-rw-r--r--  fs/nfsd/nfs4xdr.c | 171
-rw-r--r--  fs/nfsd/nfsctl.c | 5
-rw-r--r--  fs/nfsd/nfsfh.c | 30
-rw-r--r--  fs/nfsd/nfsproc.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 20
-rw-r--r--  fs/nfsd/vfs.c | 63
-rw-r--r--  fs/ntfs/namei.c | 89
-rw-r--r--  fs/ntfs/usnjrnl.h | 4
-rw-r--r--  fs/ocfs2/Makefile | 3
-rw-r--r--  fs/ocfs2/alloc.c | 922
-rw-r--r--  fs/ocfs2/alloc.h | 95
-rw-r--r--  fs/ocfs2/aops.c | 62
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 134
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 23
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 26
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 44
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 32
-rw-r--r--  fs/ocfs2/dir.c | 120
-rw-r--r--  fs/ocfs2/dlmglue.c | 9
-rw-r--r--  fs/ocfs2/extent_map.c | 386
-rw-r--r--  fs/ocfs2/extent_map.h | 7
-rw-r--r--  fs/ocfs2/file.c | 334
-rw-r--r--  fs/ocfs2/file.h | 32
-rw-r--r--  fs/ocfs2/inode.c | 87
-rw-r--r--  fs/ocfs2/inode.h | 6
-rw-r--r--  fs/ocfs2/ioctl.c | 3
-rw-r--r--  fs/ocfs2/journal.c | 112
-rw-r--r--  fs/ocfs2/journal.h | 52
-rw-r--r--  fs/ocfs2/localalloc.c | 384
-rw-r--r--  fs/ocfs2/localalloc.h | 4
-rw-r--r--  fs/ocfs2/locks.c | 15
-rw-r--r--  fs/ocfs2/locks.h | 1
-rw-r--r--  fs/ocfs2/namei.c | 101
-rw-r--r--  fs/ocfs2/ocfs2.h | 56
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 220
-rw-r--r--  fs/ocfs2/ocfs2_jbd_compat.h | 82
-rw-r--r--  fs/ocfs2/resize.c | 11
-rw-r--r--  fs/ocfs2/slot_map.c | 7
-rw-r--r--  fs/ocfs2/stack_user.c | 33
-rw-r--r--  fs/ocfs2/stackglue.c | 27
-rw-r--r--  fs/ocfs2/stackglue.h | 19
-rw-r--r--  fs/ocfs2/suballoc.c | 248
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 64
-rw-r--r--  fs/ocfs2/symlink.c | 18
-rw-r--r--  fs/ocfs2/uptodate.c | 38
-rw-r--r--  fs/ocfs2/uptodate.h | 3
-rw-r--r--  fs/ocfs2/xattr.c | 4834
-rw-r--r--  fs/ocfs2/xattr.h | 68
-rw-r--r--  fs/omfs/bitmap.c | 5
-rw-r--r--  fs/omfs/file.c | 33
-rw-r--r--  fs/omfs/inode.c | 5
-rw-r--r--  fs/open.c | 3
-rw-r--r--  fs/partitions/check.c | 272
-rw-r--r--  fs/partitions/check.h | 4
-rw-r--r--  fs/proc/Kconfig | 10
-rw-r--r--  fs/proc/array.c | 66
-rw-r--r--  fs/proc/base.c | 21
-rw-r--r--  fs/proc/generic.c | 5
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/nommu.c | 4
-rw-r--r--  fs/proc/proc_misc.c | 12
-rw-r--r--  fs/proc/proc_sysctl.c | 6
-rw-r--r--  fs/proc/task_mmu.c | 20
-rw-r--r--  fs/proc/task_nommu.c | 5
-rw-r--r--  fs/proc/vmcore.c | 6
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/readdir.c | 8
-rw-r--r--  fs/reiserfs/super.c | 1
-rw-r--r--  fs/seq_file.c | 25
-rw-r--r--  fs/splice.c | 3
-rw-r--r--  fs/ubifs/budget.c | 147
-rw-r--r--  fs/ubifs/commit.c | 3
-rw-r--r--  fs/ubifs/debug.c | 29
-rw-r--r--  fs/ubifs/debug.h | 143
-rw-r--r--  fs/ubifs/dir.c | 27
-rw-r--r--  fs/ubifs/file.c | 28
-rw-r--r--  fs/ubifs/find.c | 28
-rw-r--r--  fs/ubifs/gc.c | 20
-rw-r--r--  fs/ubifs/io.c | 14
-rw-r--r--  fs/ubifs/journal.c | 110
-rw-r--r--  fs/ubifs/log.c | 4
-rw-r--r--  fs/ubifs/misc.h | 63
-rw-r--r--  fs/ubifs/orphan.c | 4
-rw-r--r--  fs/ubifs/super.c | 75
-rw-r--r--  fs/ubifs/tnc.c | 116
-rw-r--r--  fs/ubifs/tnc_commit.c | 37
-rw-r--r--  fs/ubifs/ubifs-media.h | 6
-rw-r--r--  fs/ubifs/ubifs.h | 47
-rw-r--r--  fs/ubifs/xattr.c | 54
-rw-r--r--  fs/udf/file.c | 1
-rw-r--r--  fs/udf/ialloc.c | 44
-rw-r--r--  fs/udf/super.c | 2
-rw-r--r--  fs/ufs/super.c | 4
-rw-r--r--  fs/xfs/linux-2.6/sema.h | 52
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 192
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.h | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 211
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.c | 22
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 65
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 38
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 8
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 14
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 7
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 4
-rw-r--r--  fs/xfs/xfs_acl.c | 52
-rw-r--r--  fs/xfs/xfs_acl.h | 14
-rw-r--r--  fs/xfs/xfs_arch.h | 68
-rw-r--r--  fs/xfs/xfs_attr.c | 110
-rw-r--r--  fs/xfs/xfs_attr.h | 1
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 75
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 2
-rw-r--r--  fs/xfs/xfs_bit.c | 103
-rw-r--r--  fs/xfs/xfs_bit.h | 34
-rw-r--r--  fs/xfs/xfs_bmap.c | 34
-rw-r--r--  fs/xfs/xfs_btree.c | 105
-rw-r--r--  fs/xfs/xfs_btree.h | 8
-rw-r--r--  fs/xfs/xfs_buf_item.c | 48
-rw-r--r--  fs/xfs/xfs_dfrag.c | 40
-rw-r--r--  fs/xfs/xfs_dmapi.h | 1
-rw-r--r--  fs/xfs/xfs_error.c | 5
-rw-r--r--  fs/xfs/xfs_error.h | 12
-rw-r--r--  fs/xfs/xfs_filestream.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 30
-rw-r--r--  fs/xfs/xfs_iget.c | 48
-rw-r--r--  fs/xfs/xfs_inode.c | 164
-rw-r--r--  fs/xfs/xfs_inode.h | 46
-rw-r--r--  fs/xfs/xfs_inode_item.c | 11
-rw-r--r--  fs/xfs/xfs_itable.c | 4
-rw-r--r--  fs/xfs/xfs_log.c | 151
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_priv.h | 15
-rw-r--r--  fs/xfs/xfs_log_recover.c | 7
-rw-r--r--  fs/xfs/xfs_mount.c | 82
-rw-r--r--  fs/xfs/xfs_mount.h | 17
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 19
-rw-r--r--  fs/xfs/xfs_rw.c | 2
-rw-r--r--  fs/xfs/xfs_trans.c | 75
-rw-r--r--  fs/xfs/xfs_trans.h | 12
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 12
-rw-r--r--  fs/xfs/xfs_trans_item.c | 66
-rw-r--r--  fs/xfs/xfs_utils.c | 4
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vfsops.c | 13
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 224
310 files changed, 14956 insertions, 8192 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 047c791427aa..c061c3f18e7c 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -55,7 +55,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_debug, "debug=%x"},
 	{Opt_dfltuid, "dfltuid=%u"},
 	{Opt_dfltgid, "dfltgid=%u"},
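For context, match_token() treats its table argument as read-only, which is what allows these tables to become const. A minimal sketch of the option-parsing loop such tables feed, using the <linux/parser.h> helpers (v9fs_parse_one is a hypothetical name, not code from this patch):

	static int v9fs_parse_one(char *p, int *debug_level)
	{
		substring_t args[MAX_OPT_ARGS];
		int option;

		switch (match_token(p, tokens, args)) {
		case Opt_debug:
			/* "debug=%x" captured its hex argument in args[0] */
			if (match_hex(&args[0], &option))
				return -EINVAL;
			*debug_level = option;
			break;
		default:
			break;
		}
		return 0;
	}

The same constification is applied to every match_table_t further down (adfs, affs, afs, autofs, autofs4, befs, ...).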
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea9..e298fe194093 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..e83aa5ebe861 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	return NULL;
 
 error:
-	if (fid)
-		p9_client_clunk(fid);
+	p9_client_clunk(fid);
 
 	return ERR_PTR(result);
 }
diff --git a/fs/Kconfig b/fs/Kconfig
index d3873583360b..9e9d70c02a07 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4DEV_FS
-	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
 	help
-	  Ext4dev is a predecessor filesystem of the next generation
-	  extended fs ext4, based on ext3 filesystem code. It will be
-	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+	  This is the next generation of the ext3 filesystem.
 
 	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4dev is not the same as ext3 any more:
-	  it is based on extent maps and it supports 48-bit physical block
-	  numbers. These combined on-disk format changes will allow
-	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
-	  a hard limit that ext3 cannot overcome without changing the
-	  on-disk format.
-
-	  Other than extent maps and 48-bit block numbers, ext4dev also is
-	  likely to have other new features such as persistent preallocation,
-	  high resolution time stamps, and larger file support etc. These
-	  features will be added to ext4dev gradually.
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers. The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time. For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called ext4dev.
 
 	  If unsure, say N.
 
-config EXT4DEV_FS_XATTR
-	bool "Ext4dev extended attributes"
-	depends on EXT4DEV_FS
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4. Unfortunately there are some
+	  legacy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here. This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
 	default y
 	help
 	  Extended attributes are name:value pairs associated with inodes by
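On the code side, EXT4DEV_COMPAT implies the ext4 module also registers the legacy "ext4dev" filesystem name so old userspace keeps mounting. A hedged sketch of how such an alias is typically wired up; the shared get_sb helper name is an assumption, not taken from this patch:

	#ifdef CONFIG_EXT4DEV_COMPAT
	static struct file_system_type ext4dev_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "ext4dev",	/* legacy name */
		.get_sb		= ext4_get_sb,	/* assumed: same mount path as "ext4" */
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};
	MODULE_ALIAS("ext4dev");
	#endif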
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
 
 	  If unsure, say N.
 
-	  You need this for POSIX ACL support on ext4dev/ext4.
+	  You need this for POSIX ACL support on ext4.
 
-config EXT4DEV_FS_POSIX_ACL
-	bool "Ext4dev POSIX Access Control Lists"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N
 
-config EXT4DEV_FS_SECURITY
-	bool "Ext4dev Security Labels"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux. This option
 	  enables an extended attribute handler for file security
-	  labels in the ext4dev/ext4 filesystem.
+	  labels in the ext4 filesystem.
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
@@ -206,17 +220,16 @@ config JBD
 	tristate
 	help
 	  This is a generic journalling layer for block devices. It is
-	  currently used by the ext3 and OCFS2 file systems, but it could
-	  also be used to add journal support to other file systems or block
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
 	  devices such as RAID or LVM.
 
-	  If you are using the ext3 or OCFS2 file systems, you need to
-	  say Y here. If you are not using ext3 OCFS2 then you will probably
-	  want to say N.
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd. If you are compiling ext3 or OCFS2 into the kernel,
-	  you cannot compile this code as a module.
+	  called jbd. If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -240,22 +253,23 @@ config JBD2
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers. It is currently used by
-	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
 	  journal support to other file systems or block devices such
 	  as RAID or LVM.
 
-	  If you are using ext4dev/ext4, you need to say Y here. If you are not
-	  using ext4dev/ext4 then you will probably want to say N.
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
 
 	  To compile this device as a module, choose M here. The module will be
-	  called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+	  called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
 	  you cannot compile this code as a module.
 
 config JBD2_DEBUG
-	bool "JBD2 (ext4dev/ext4) debugging support"
+	bool "JBD2 (ext4) debugging support"
 	depends on JBD2 && DEBUG_FS
 	help
-	  If you are using the ext4dev/ext4 journaled file system (or
+	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
 	  allows you to enable debugging output while the system is running,
 	  in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
@@ -419,6 +433,14 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+config FILE_LOCKING
+	bool "Enable POSIX file locking API" if EMBEDDED
+	default y
+	help
+	  This option enables standard file locking support, required
+	  for filesystems like NFS and for the flock() system
+	  call. Disabling this option saves about 11k.
+
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 
@@ -426,7 +448,7 @@ config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on NET && SYSFS
 	select CONFIGFS_FS
-	select JBD
+	select JBD2
 	select CRC32
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
@@ -497,6 +519,16 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
+config OCFS2_COMPAT_JBD
+	bool "Use JBD for compatibility"
+	depends on OCFS2_FS
+	default n
+	select JBD
+	help
+	  The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
+	  is backwards compatible with JBD. It is safe to say N here.
+	  However, if you really want to use the original JBD, say Y here.
+
 endif # BLOCK
 
 config DNOTIFY
@@ -1765,6 +1797,28 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
+config SUNRPC_REGISTER_V4
+	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	default n
+	help
+	  Sun added support for registering RPC services at an IPv6
+	  address by creating two new versions of the rpcbind protocol
+	  (RFC 1833).
+
+	  This option enables support in the kernel RPC server for
+	  registering kernel RPC services via version 4 of the rpcbind
+	  protocol. If you enable this option, you must run a portmapper
+	  daemon that supports rpcbind protocol version 4.
+
+	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+	  requires that you enable this option and use a portmapper that
+	  supports rpcbind version 4.
+
+	  If unsure, say N to get traditional behavior (register kernel
+	  RPC services using only rpcbind version 2). Distributions
+	  using the legacy Linux portmapper daemon must say N here.
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -1930,6 +1984,16 @@ config CIFS_WEAK_PW_HASH
 
 	  If unsure, say N.
 
+config CIFS_UPCALL
+	bool "Kerberos/SPNEGO advanced session setup"
+	depends on CIFS && KEYS
+	help
+	  Enables an upcall mechanism for CIFS which accesses
+	  userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	  Kerberos tickets which are needed to mount to certain secure servers
+	  (for which more secure Kerberos authentication is required). If
+	  unsure, say N.
+
 config CIFS_XATTR
 	bool "CIFS extended attributes"
 	depends on CIFS
@@ -1982,17 +2046,6 @@ config CIFS_EXPERIMENTAL
 	  (which is disabled by default). See the file fs/cifs/README
 	  for more details. If unsure, say N.
 
-config CIFS_UPCALL
-	bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
-	depends on CIFS_EXPERIMENTAL
-	depends on KEYS
-	help
-	  Enables an upcall mechanism for CIFS which accesses
-	  userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	  Kerberos tickets which are needed to mount to certain secure servers
-	  (for which more secure Kerberos authentication is required). If
-	  unsure, say N.
-
 config CIFS_DFS_UPCALL
 	bool "DFS feature support (EXPERIMENTAL)"
 	depends on CIFS_EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4a551af6f3fc..17c9c5ec14c5 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT
 	help
 	  Support FLAT shared libraries
 
+config HAVE_AOUT
+	def_bool n
+
 config BINFMT_AOUT
 	tristate "Kernel support for a.out and ECOFF binaries"
-	depends on ARCH_SUPPORTS_AOUT && \
-		(X86_32 || ALPHA || ARM || M68K)
+	depends on HAVE_AOUT
 	---help---
 	  A.out (Assembler.OUTput) is a set of formats for libraries and
 	  executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff15..b6f27dc26b72 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,7 +7,7 @@
 
 obj-y := open.o read_write.o file_table.o super.o \
 	char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
-	ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
+	ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
 obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_FILE_LOCKING) += locks.o
 obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD) := nfsctl.o
@@ -69,7 +70,7 @@ obj-$(CONFIG_DLM) += dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2) += jbd2/
 obj-$(CONFIG_EXT2_FS) += ext2/
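Building locks.o conditionally only works because the locking entry points collapse to inline stubs when CONFIG_FILE_LOCKING is off. A sketch of that header pattern with one representative function (the stub's return value is an assumption):

	#ifdef CONFIG_FILE_LOCKING
	extern int vfs_lock_file(struct file *, unsigned int,
				 struct file_lock *, struct file_lock *);
	#else
	static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
					struct file_lock *fl,
					struct file_lock *conf)
	{
		return -ENOLCK;	/* assumed stub: locking compiled out */
	}
	#endif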
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d78..85a30e929800 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
 
 const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
 	.fsync = file_fsync,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 26f3b43726bb..7f83a46f2b7e 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
 
 enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_ownmask, "ownmask=%o"},
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b0..7b36904dbeac 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = affs_readdir,
 	.fsync = file_fsync,
 };
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3a89094f93d0..8989c93193ed 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -135,7 +135,7 @@ enum {
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 250d8c4d66e4..aee239a048cb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -64,7 +64,7 @@ enum {
 	afs_opt_vol,
 };
 
-static match_table_t afs_options_list = {
+static const match_table_t afs_options_list = {
 	{ afs_opt_cell, "cell=%s" },
 	{ afs_opt_rwpath, "rwpath" },
 	{ afs_opt_vol, "vol=%s" },
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index dda510d31f84..b70eea1e8c59 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = {
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
 
-static match_table_t autofs_tokens = {
+static const match_table_t autofs_tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7bb3e5ba0537..45d55819203d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -213,7 +213,7 @@ static const struct super_operations autofs4_sops = {
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
 	Opt_indirect, Opt_direct, Opt_offset};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index bcfb2dc0a61b..2a41c2a7fc52 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = {
 	.release = dcache_dir_close,
 	.read = generic_read_dir,
 	.readdir = dcache_readdir,
+	.llseek = dcache_dir_lseek,
 	.ioctl = autofs4_root_ioctl,
 };
 
@@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = {
 	.release = dcache_dir_close,
 	.read = generic_read_dir,
 	.readdir = dcache_readdir,
+	.llseek = dcache_dir_lseek,
 };
 
 const struct inode_operations autofs4_indirect_root_inode_operations = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 02c6e62b72f8..9286b2af893a 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 static const struct file_operations befs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = befs_readdir,
+	.llseek = generic_file_llseek,
 };
 
 static const struct inode_operations befs_dir_inode_operations = {
@@ -649,7 +650,7 @@ enum {
 	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
 };
 
-static match_table_t befs_tokens = {
+static const match_table_t befs_tokens = {
 	{Opt_uid, "uid=%d"},
 	{Opt_gid, "gid=%d"},
 	{Opt_charset, "iocharset=%s"},
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee348..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 			inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
-		iput(inode);
 		mutex_unlock(&info->bfs_lock);
+		iput(inode);
 		return err;
 	}
 	mutex_unlock(&info->bfs_lock);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 56372ecf1690..dfc0197905ca 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Stash our initial stack pointer into the mm structure */
 	current->mm->start_stack = (unsigned long )sp;
 
-
+#ifdef FLAT_PLAT_INIT
+	FLAT_PLAT_INIT(regs);
+#endif
 	DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
 		(int)regs, (int)start_addr, (int)current->mm->start_stack);
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 756205314c24..8d7e88e02e0f 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (bprm->misc_bang)
 		goto _ret;
 
-	bprm->misc_bang = 1;
-
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
@@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval < 0)
 		goto _error;
 
+	bprm->misc_bang = 1;
+
 	retval = search_binary_handler (bprm, regs);
 	if (retval < 0)
 		goto _error;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	BUG_ON(bip == NULL);
 
 	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
 	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+	if (bi == NULL)
+		return 0;
+
+	if (rw == READ && bi->verify_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_READ))
+		return 1;
+
+	if (rw == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_WRITE))
+		return 1;
+
+	return 0;
+}
+
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio: bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
 	}
 }
 
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+	if (bi)
+		return bi->tuple_size;
+
+	return 0;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio: bio to prepare
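Both helpers back the submit path's integrity hook. A sketch of that call site, shaped like the check in generic_make_request() (the nonzero-on-failure convention for bio_integrity_prep() is inferred from its callers):

	/* Attach/verify integrity metadata only when the target device
	 * advertises support for this I/O direction. */
	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}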
diff --git a/fs/bio.c b/fs/bio.c
index 8000e2fa16cb..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 static struct kmem_cache *bio_slab __read_mostly;
 
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 
 /*
  * if you change this list, also change bvec_alloc or things will
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	struct bio_vec *bvl;
 
 	/*
-	 * see comment near bvec_array define!
+	 * If 'bs' is given, lookup the pool and do the mempool alloc.
+	 * If not, this is a bio_kmalloc() allocation and just do a
+	 * kzalloc() for the exact number of vecs right away.
 	 */
-	switch (nr) {
-		case   1        : *idx = 0; break;
-		case   2 ...   4: *idx = 1; break;
-		case   5 ...  16: *idx = 2; break;
-		case  17 ...  64: *idx = 3; break;
-		case  65 ... 128: *idx = 4; break;
-		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+	if (bs) {
+		/*
+		 * see comment near bvec_array define!
+		 */
+		switch (nr) {
+		case 1:
+			*idx = 0;
+			break;
+		case 2 ... 4:
+			*idx = 1;
+			break;
+		case 5 ... 16:
+			*idx = 2;
+			break;
+		case 17 ... 64:
+			*idx = 3;
+			break;
+		case 65 ... 128:
+			*idx = 4;
+			break;
+		case 129 ... BIO_MAX_PAGES:
+			*idx = 5;
+			break;
 		default:
 			return NULL;
-	}
-	/*
-	 * idx now points to the pool we want to allocate from
-	 */
+		}
 
-	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl)
-		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+		/*
+		 * idx now points to the pool we want to allocate from
+		 */
+		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+		if (bvl)
+			memset(bvl, 0,
+				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+	} else
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	return bvl;
 }
@@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
 	bio_free(bio, fs_bio_set);
 }
 
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	kfree(bio->bi_io_vec);
+	kfree(bio);
+}
+
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 
@@ -118,19 +146,25 @@ void bio_init(struct bio *bio)
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask: the GFP_ mask given to the slab allocator
  * @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from
+ * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
  *
  * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
  *
  *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	struct bio *bio;
+
+	if (bs)
+		bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	else
+		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
 		struct bio_vec *bvl = NULL;
@@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 		if (unlikely(!bvl)) {
-			mempool_free(bio, bs->bio_pool);
+			if (bs)
+				mempool_free(bio, bs->bio_pool);
+			else
+				kfree(bio);
 			bio = NULL;
 			goto out;
 		}
@@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+	if (bio)
+		bio->bi_destructor = bio_kmalloc_destructor;
+
+	return bio;
+}
+
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
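A usage sketch for the new allocator (setup_bio is a hypothetical caller): unlike bio_alloc(), bio_kmalloc() has no mempool behind it and can return NULL even for blocking gfp masks, so the error check is mandatory; the destructor installed above then frees both the bio and its vec array on the final bio_put():

	static struct bio *setup_bio(struct block_device *bdev, sector_t sector)
	{
		struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);

		if (!bio)
			return NULL;	/* no mempool to fall back on */
		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		return bio;	/* caller adds pages, submits, bio_put()s */
	}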
@@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 	return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_hw_segments;
-}
-
 /**
  * __bio_clone - clone a bio
  * @bio: destination bio
@@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 */
 
 		while (bio->bi_phys_segments >= q->max_phys_segments
-		       || bio->bi_hw_segments >= q->max_hw_segments
-		       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+		       || bio->bi_phys_segments >= q->max_hw_segments) {
 
 			if (retried_segments)
 				return 0;
@@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
-	bio->bi_hw_segments++;
  done:
 	bio->bi_size += len;
 	return len;
@@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	int nr_sgvecs;
 	struct sg_iovec *sgvecs;
+	int nr_sgvecs;
+	int is_our_pages;
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count)
+			     struct sg_iovec *iov, int iov_count,
+			     int is_our_pages)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
 	bmd->nr_sgvecs = iov_count;
+	bmd->is_our_pages = is_our_pages;
 	bio->bi_private = bmd;
 }
 
@@ -469,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+					       gfp_t gfp_mask)
 {
-	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
+	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
 
 	if (!bmd)
 		return NULL;
 
-	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
+	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
 	if (!bmd->iovecs) {
 		kfree(bmd);
 		return NULL;
 	}
 
-	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
 	if (bmd->sgvecs)
 		return bmd;
 
@@ -491,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
 	return NULL;
 }
 
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
-			  int uncopy)
+static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
+			  struct sg_iovec *iov, int iov_count, int uncopy,
+			  int do_free_page)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
@@ -502,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		char *bv_addr = page_address(bvec->bv_page);
-		unsigned int bv_len = bvec->bv_len;
+		unsigned int bv_len = iovecs[i].bv_len;
 
 		while (bv_len && iov_idx < iov_count) {
 			unsigned int bytes;
@@ -535,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 			}
 		}
 
-		if (uncopy)
+		if (do_free_page)
 			__free_page(bvec->bv_page);
 	}
 
@@ -552,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	int ret;
-
-	ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
+	int ret = 0;
 
+	if (!bio_flagged(bio, BIO_NULL_MAPPED))
+		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+				     bmd->nr_sgvecs, 1, bmd->is_our_pages);
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
@@ -564,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
 /**
  * bio_copy_user_iov - copy user data to bio
  * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
  * @iov: the iovec.
  * @iov_count: number of elements in the iovec
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Prepares and returns a bio for indirect user io, bouncing data
  * to/from kernel pages as necessary. Must be paired with
  * call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+			      struct rq_map_data *map_data,
+			      struct sg_iovec *iov, int iov_count,
+			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -596,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		len += iov[i].iov_len;
 	}
 
-	bmd = bio_alloc_map_data(nr_pages, iov_count);
+	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
+	i = 0;
 	while (len) {
-		unsigned int bytes = PAGE_SIZE;
+		unsigned int bytes;
+
+		if (map_data)
+			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+		else
+			bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+		if (map_data) {
+			if (i == map_data->nr_entries) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = map_data->pages[i++];
+		} else
+			page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -633,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	 * success
 	 */
 	if (!write_to_vm) {
-		ret = __bio_copy_iov(bio, iov, iov_count, 0);
+		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count);
+	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
 	return bio;
 cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
+	if (!map_data)
+		bio_for_each_segment(bvec, bio, i)
+			__free_page(bvec->bv_page);
 
 	bio_put(bio);
 out_bmd:
@@ -653,29 +720,32 @@ out_bmd:
 /**
  * bio_copy_user - copy user data to bio
  * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
  * @uaddr: start of user address
  * @len: length in bytes
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Prepares and returns a bio for indirect user io, bouncing data
  * to/from kernel pages as necessary. Must be paired with
  * call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+			  unsigned long uaddr, unsigned int len,
+			  int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
-				      int write_to_vm)
+				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
 	int nr_pages = 0;
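Callers migrate mechanically to the widened signature, mirroring the bio_copy_kern() conversion at the end of this patch: a NULL rq_map_data keeps the old behavior of allocating bounce pages internally, and the gfp mask becomes the caller's choice instead of a hardwired GFP_KERNEL. A sketch (uaddr, len and reading are assumed locals of the caller):

	bio = bio_copy_user(q, NULL, uaddr, len, reading, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);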
@@ -701,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
 	if (!pages)
 		goto out;
 
@@ -785,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
  * @uaddr: start of user address
  * @len: length in bytes
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Map the user space address into a bio suitable for io to a block
  * device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm)
+			 unsigned long uaddr, unsigned int len, int write_to_vm,
+			 gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 
 /**
@@ -807,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
807 * @iov: the iovec. 879 * @iov: the iovec.
808 * @iov_count: number of elements in the iovec 880 * @iov_count: number of elements in the iovec
809 * @write_to_vm: bool indicating writing to pages or not 881 * @write_to_vm: bool indicating writing to pages or not
882 * @gfp_mask: memory allocation flags
810 * 883 *
811 * Map the user space address into a bio suitable for io to a block 884 * Map the user space address into a bio suitable for io to a block
812 * device. Returns an error pointer in case of error. 885 * device. Returns an error pointer in case of error.
813 */ 886 */
814struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 887struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
815 struct sg_iovec *iov, int iov_count, 888 struct sg_iovec *iov, int iov_count,
816 int write_to_vm) 889 int write_to_vm, gfp_t gfp_mask)
817{ 890{
818 struct bio *bio; 891 struct bio *bio;
819 892
820 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); 893 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
821 894 gfp_mask);
822 if (IS_ERR(bio)) 895 if (IS_ERR(bio))
823 return bio; 896 return bio;
824 897
@@ -942,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
942{ 1015{
943 struct bio_vec *bvec; 1016 struct bio_vec *bvec;
944 const int read = bio_data_dir(bio) == READ; 1017 const int read = bio_data_dir(bio) == READ;
945 char *p = bio->bi_private; 1018 struct bio_map_data *bmd = bio->bi_private;
946 int i; 1019 int i;
1020 char *p = bmd->sgvecs[0].iov_base;
947 1021
948 __bio_for_each_segment(bvec, bio, i, 0) { 1022 __bio_for_each_segment(bvec, bio, i, 0) {
949 char *addr = page_address(bvec->bv_page); 1023 char *addr = page_address(bvec->bv_page);
1024 int len = bmd->iovecs[i].bv_len;
950 1025
951 if (read && !err) 1026 if (read && !err)
952 memcpy(p, addr, bvec->bv_len); 1027 memcpy(p, addr, len);
953 1028
954 __free_page(bvec->bv_page); 1029 __free_page(bvec->bv_page);
955 p += bvec->bv_len; 1030 p += len;
956 } 1031 }
957 1032
1033 bio_free_map_data(bmd);
958 bio_put(bio); 1034 bio_put(bio);
959} 1035}
960 1036
@@ -972,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
972struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, 1048struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
973 gfp_t gfp_mask, int reading) 1049 gfp_t gfp_mask, int reading)
974{ 1050{
975 unsigned long kaddr = (unsigned long)data;
976 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
977 unsigned long start = kaddr >> PAGE_SHIFT;
978 const int nr_pages = end - start;
979 struct bio *bio; 1051 struct bio *bio;
980 struct bio_vec *bvec; 1052 struct bio_vec *bvec;
981 int i, ret; 1053 int i;
982
983 bio = bio_alloc(gfp_mask, nr_pages);
984 if (!bio)
985 return ERR_PTR(-ENOMEM);
986
987 while (len) {
988 struct page *page;
989 unsigned int bytes = PAGE_SIZE;
990
991 if (bytes > len)
992 bytes = len;
993
994 page = alloc_page(q->bounce_gfp | gfp_mask);
995 if (!page) {
996 ret = -ENOMEM;
997 goto cleanup;
998 }
999
1000 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
1001 ret = -EINVAL;
1002 goto cleanup;
1003 }
1004 1054
1005 len -= bytes; 1055 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1006 } 1056 if (IS_ERR(bio))
1057 return bio;
1007 1058
1008 if (!reading) { 1059 if (!reading) {
1009 void *p = data; 1060 void *p = data;
@@ -1016,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1016 } 1067 }
1017 } 1068 }
1018 1069
1019 bio->bi_private = data;
1020 bio->bi_end_io = bio_copy_kern_endio; 1070 bio->bi_end_io = bio_copy_kern_endio;
1021 return bio;
1022cleanup:
1023 bio_for_each_segment(bvec, bio, i)
1024 __free_page(bvec->bv_page);
1025
1026 bio_put(bio);
1027 1071
1028 return ERR_PTR(ret); 1072 return bio;
1029} 1073}
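
Since bio_copy_kern() now simply bounces through bio_copy_user() with a NULL map_data, its calling convention is unchanged; a hedged caller fragment (q, buf, len and reading are assumed context):

	struct bio *bio;

	bio = bio_copy_kern(q, buf, len, GFP_KERNEL, reading);
	if (IS_ERR(bio))
		return PTR_ERR(bio);
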
1030 1074
1031/* 1075/*
@@ -1212,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
1212 * split a bio - only worry about a bio with a single page 1256 * split a bio - only worry about a bio with a single page
 1213 * in its iovec 1257 * in its iovec
1214 */ 1258 */
1215struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) 1259struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1216{ 1260{
1217 struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); 1261 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
1218 1262
1219 if (!bp) 1263 if (!bp)
1220 return bp; 1264 return bp;
@@ -1248,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1248 bp->bio2.bi_end_io = bio_pair_end_2; 1292 bp->bio2.bi_end_io = bio_pair_end_2;
1249 1293
1250 bp->bio1.bi_private = bi; 1294 bp->bio1.bi_private = bi;
1251 bp->bio2.bi_private = pool; 1295 bp->bio2.bi_private = bio_split_pool;
1252 1296
1253 if (bio_integrity(bi)) 1297 if (bio_integrity(bi))
1254 bio_integrity_split(bi, bp, first_sectors); 1298 bio_integrity_split(bi, bp, first_sectors);
@@ -1256,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1256 return bp; 1300 return bp;
1257} 1301}
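
With the mempool argument dropped, callers split against the global bio_split_pool; a hedged sketch of the updated call site, loosely following the md/raid0 pattern (first_sectors is an assumed split point):

	struct bio_pair *bp;

	/* bio1 covers the first first_sectors sectors, bio2 the rest */
	bp = bio_split(bio, first_sectors);
	if (bp) {
		generic_make_request(&bp->bio1);
		generic_make_request(&bp->bio2);
		bio_pair_release(bp);
	}
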
1258 1302
1303/**
1304 * bio_sector_offset - Find hardware sector offset in bio
1305 * @bio: bio to inspect
1306 * @index: bio_vec index
1307 * @offset: offset in bv_page
1308 *
 1309 * Return the number of hardware sectors between the beginning of a bio
1310 * and an end point indicated by a bio_vec index and an offset
1311 * within that vector's page.
1312 */
1313sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1314 unsigned int offset)
1315{
1316 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
1317 struct bio_vec *bv;
1318 sector_t sectors;
1319 int i;
1320
1321 sectors = 0;
1322
1323 if (index >= bio->bi_idx)
1324 index = bio->bi_vcnt - 1;
1325
1326 __bio_for_each_segment(bv, bio, i, 0) {
1327 if (i == index) {
1328 if (offset > bv->bv_offset)
1329 sectors += (offset - bv->bv_offset) / sector_sz;
1330 break;
1331 }
1332
1333 sectors += bv->bv_len / sector_sz;
1334 }
1335
1336 return sectors;
1337}
1338EXPORT_SYMBOL(bio_sector_offset);
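
A hedged usage fragment (idx and byte_off are hypothetical coordinates inside the bio):

	/* hardware sectors between the start of the bio and byte_off
	 * within bio_vec idx, e.g. when splitting integrity metadata */
	sector_t off = bio_sector_offset(bio, idx, byte_off);
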
1259 1339
1260/* 1340/*
1261 * create memory pools for biovec's in a bio_set. 1341 * create memory pools for biovec's in a bio_set.
@@ -1358,6 +1438,7 @@ static int __init init_bio(void)
1358subsys_initcall(init_bio); 1438subsys_initcall(init_bio);
1359 1439
1360EXPORT_SYMBOL(bio_alloc); 1440EXPORT_SYMBOL(bio_alloc);
1441EXPORT_SYMBOL(bio_kmalloc);
1361EXPORT_SYMBOL(bio_put); 1442EXPORT_SYMBOL(bio_put);
1362EXPORT_SYMBOL(bio_free); 1443EXPORT_SYMBOL(bio_free);
1363EXPORT_SYMBOL(bio_endio); 1444EXPORT_SYMBOL(bio_endio);
@@ -1365,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
1365EXPORT_SYMBOL(__bio_clone); 1446EXPORT_SYMBOL(__bio_clone);
1366EXPORT_SYMBOL(bio_clone); 1447EXPORT_SYMBOL(bio_clone);
1367EXPORT_SYMBOL(bio_phys_segments); 1448EXPORT_SYMBOL(bio_phys_segments);
1368EXPORT_SYMBOL(bio_hw_segments);
1369EXPORT_SYMBOL(bio_add_page); 1449EXPORT_SYMBOL(bio_add_page);
1370EXPORT_SYMBOL(bio_add_pc_page); 1450EXPORT_SYMBOL(bio_add_pc_page);
1371EXPORT_SYMBOL(bio_get_nr_vecs); 1451EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1375,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
1375EXPORT_SYMBOL(bio_copy_kern); 1455EXPORT_SYMBOL(bio_copy_kern);
1376EXPORT_SYMBOL(bio_pair_release); 1456EXPORT_SYMBOL(bio_pair_release);
1377EXPORT_SYMBOL(bio_split); 1457EXPORT_SYMBOL(bio_split);
1378EXPORT_SYMBOL(bio_split_pool);
1379EXPORT_SYMBOL(bio_copy_user); 1458EXPORT_SYMBOL(bio_copy_user);
1380EXPORT_SYMBOL(bio_uncopy_user); 1459EXPORT_SYMBOL(bio_uncopy_user);
1381EXPORT_SYMBOL(bioset_create); 1460EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..d84f0469a016 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
541 */ 541 */
542 542
543static struct kobject *bdev_get_kobj(struct block_device *bdev)
544{
545 if (bdev->bd_contains != bdev)
546 return kobject_get(&bdev->bd_part->dev.kobj);
547 else
548 return kobject_get(&bdev->bd_disk->dev.kobj);
549}
550
551static struct kobject *bdev_get_holder(struct block_device *bdev)
552{
553 if (bdev->bd_contains != bdev)
554 return kobject_get(bdev->bd_part->holder_dir);
555 else
556 return kobject_get(bdev->bd_disk->holder_dir);
557}
558
559static int add_symlink(struct kobject *from, struct kobject *to) 543static int add_symlink(struct kobject *from, struct kobject *to)
560{ 544{
561 if (!from || !to) 545 if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
604 if (!bo->hdev) 588 if (!bo->hdev)
605 goto fail_put_sdir; 589 goto fail_put_sdir;
606 590
607 bo->sdev = bdev_get_kobj(bdev); 591 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
608 if (!bo->sdev) 592 if (!bo->sdev)
609 goto fail_put_hdev; 593 goto fail_put_hdev;
610 594
611 bo->hdir = bdev_get_holder(bdev); 595 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
612 if (!bo->hdir) 596 if (!bo->hdir)
613 goto fail_put_sdev; 597 goto fail_put_sdev;
614 598
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
868 852
869EXPORT_SYMBOL(open_by_devnum); 853EXPORT_SYMBOL(open_by_devnum);
870 854
855/**
856 * flush_disk - invalidates all buffer-cache entries on a disk
857 *
858 * @bdev: struct block device to be flushed
859 *
860 * Invalidates all buffer-cache entries on a disk. It should be called
861 * when a disk has been changed -- either by a media change or online
862 * resize.
863 */
864static void flush_disk(struct block_device *bdev)
865{
866 if (__invalidate_device(bdev)) {
867 char name[BDEVNAME_SIZE] = "";
868
869 if (bdev->bd_disk)
870 disk_name(bdev->bd_disk, 0, name);
871 printk(KERN_WARNING "VFS: busy inodes on changed media or "
872 "resized disk %s\n", name);
873 }
874
875 if (!bdev->bd_disk)
876 return;
877 if (disk_partitionable(bdev->bd_disk))
878 bdev->bd_invalidated = 1;
879}
880
881/**
882 * check_disk_size_change - checks for disk size change and adjusts bdev size.
883 * @disk: struct gendisk to check
884 * @bdev: struct bdev to adjust.
885 *
 886 * This routine checks whether the bdev size matches the disk size
 887 * and adjusts the bdev size if they differ.
888 */
889void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
890{
891 loff_t disk_size, bdev_size;
892
893 disk_size = (loff_t)get_capacity(disk) << 9;
894 bdev_size = i_size_read(bdev->bd_inode);
895 if (disk_size != bdev_size) {
896 char name[BDEVNAME_SIZE];
897
898 disk_name(disk, 0, name);
899 printk(KERN_INFO
900 "%s: detected capacity change from %lld to %lld\n",
901 name, bdev_size, disk_size);
902 i_size_write(bdev->bd_inode, disk_size);
903 flush_disk(bdev);
904 }
905}
906EXPORT_SYMBOL(check_disk_size_change);
907
908/**
909 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
910 * @disk: struct gendisk to be revalidated
911 *
 912 * This routine is a wrapper for lower-level drivers' revalidate_disk
 913 * call-backs. It is used to do the common pre- and post-operations needed
914 * for all revalidate_disk operations.
915 */
916int revalidate_disk(struct gendisk *disk)
917{
918 struct block_device *bdev;
919 int ret = 0;
920
921 if (disk->fops->revalidate_disk)
922 ret = disk->fops->revalidate_disk(disk);
923
924 bdev = bdget_disk(disk, 0);
925 if (!bdev)
926 return ret;
927
928 mutex_lock(&bdev->bd_mutex);
929 check_disk_size_change(disk, bdev);
930 mutex_unlock(&bdev->bd_mutex);
931 bdput(bdev);
932 return ret;
933}
934EXPORT_SYMBOL(revalidate_disk);
935
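
A hedged driver-side sketch of the intended sequence (new_sectors is an assumed value reported by the device):

	/*
	 * After the device reports a new capacity, update the gendisk;
	 * revalidate_disk() then resyncs the bdev size via
	 * check_disk_size_change().
	 */
	set_capacity(disk, new_sectors);
	revalidate_disk(disk);
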
871/* 936/*
872 * This routine checks whether a removable media has been changed, 937 * This routine checks whether a removable media has been changed,
873 * and invalidates all buffer-cache-entries in that case. This 938 * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
887 if (!bdops->media_changed(bdev->bd_disk)) 952 if (!bdops->media_changed(bdev->bd_disk))
888 return 0; 953 return 0;
889 954
890 if (__invalidate_device(bdev)) 955 flush_disk(bdev);
891 printk("VFS: busy inodes on changed media.\n");
892
893 if (bdops->revalidate_disk) 956 if (bdops->revalidate_disk)
894 bdops->revalidate_disk(bdev->bd_disk); 957 bdops->revalidate_disk(bdev->bd_disk);
895 if (bdev->bd_disk->minors > 1)
896 bdev->bd_invalidated = 1;
897 return 1; 958 return 1;
898} 959}
899 960
@@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
927 988
928static int do_open(struct block_device *bdev, struct file *file, int for_part) 989static int do_open(struct block_device *bdev, struct file *file, int for_part)
929{ 990{
930 struct module *owner = NULL;
931 struct gendisk *disk; 991 struct gendisk *disk;
992 struct hd_struct *part = NULL;
932 int ret; 993 int ret;
933 int part; 994 int partno;
934 int perm = 0; 995 int perm = 0;
935 996
936 if (file->f_mode & FMODE_READ) 997 if (file->f_mode & FMODE_READ)
@@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
948 1009
949 ret = -ENXIO; 1010 ret = -ENXIO;
950 file->f_mapping = bdev->bd_inode->i_mapping; 1011 file->f_mapping = bdev->bd_inode->i_mapping;
1012
951 lock_kernel(); 1013 lock_kernel();
952 disk = get_gendisk(bdev->bd_dev, &part); 1014
953 if (!disk) { 1015 disk = get_gendisk(bdev->bd_dev, &partno);
954 unlock_kernel(); 1016 if (!disk)
955 bdput(bdev); 1017 goto out_unlock_kernel;
956 return ret; 1018 part = disk_get_part(disk, partno);
957 } 1019 if (!part)
958 owner = disk->fops->owner; 1020 goto out_unlock_kernel;
959 1021
960 mutex_lock_nested(&bdev->bd_mutex, for_part); 1022 mutex_lock_nested(&bdev->bd_mutex, for_part);
961 if (!bdev->bd_openers) { 1023 if (!bdev->bd_openers) {
962 bdev->bd_disk = disk; 1024 bdev->bd_disk = disk;
1025 bdev->bd_part = part;
963 bdev->bd_contains = bdev; 1026 bdev->bd_contains = bdev;
964 if (!part) { 1027 if (!partno) {
965 struct backing_dev_info *bdi; 1028 struct backing_dev_info *bdi;
966 if (disk->fops->open) { 1029 if (disk->fops->open) {
967 ret = disk->fops->open(bdev->bd_inode, file); 1030 ret = disk->fops->open(bdev->bd_inode, file);
968 if (ret) 1031 if (ret)
969 goto out_first; 1032 goto out_clear;
970 } 1033 }
971 if (!bdev->bd_openers) { 1034 if (!bdev->bd_openers) {
972 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1035 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
978 if (bdev->bd_invalidated) 1041 if (bdev->bd_invalidated)
979 rescan_partitions(disk, bdev); 1042 rescan_partitions(disk, bdev);
980 } else { 1043 } else {
981 struct hd_struct *p;
982 struct block_device *whole; 1044 struct block_device *whole;
983 whole = bdget_disk(disk, 0); 1045 whole = bdget_disk(disk, 0);
984 ret = -ENOMEM; 1046 ret = -ENOMEM;
985 if (!whole) 1047 if (!whole)
986 goto out_first; 1048 goto out_clear;
987 BUG_ON(for_part); 1049 BUG_ON(for_part);
988 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); 1050 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
989 if (ret) 1051 if (ret)
990 goto out_first; 1052 goto out_clear;
991 bdev->bd_contains = whole; 1053 bdev->bd_contains = whole;
992 p = disk->part[part - 1];
993 bdev->bd_inode->i_data.backing_dev_info = 1054 bdev->bd_inode->i_data.backing_dev_info =
994 whole->bd_inode->i_data.backing_dev_info; 1055 whole->bd_inode->i_data.backing_dev_info;
995 if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { 1056 if (!(disk->flags & GENHD_FL_UP) ||
1057 !part || !part->nr_sects) {
996 ret = -ENXIO; 1058 ret = -ENXIO;
997 goto out_first; 1059 goto out_clear;
998 } 1060 }
999 kobject_get(&p->dev.kobj); 1061 bd_set_size(bdev, (loff_t)part->nr_sects << 9);
1000 bdev->bd_part = p;
1001 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1002 } 1062 }
1003 } else { 1063 } else {
1064 disk_put_part(part);
1004 put_disk(disk); 1065 put_disk(disk);
1005 module_put(owner); 1066 module_put(disk->fops->owner);
1067 part = NULL;
1068 disk = NULL;
1006 if (bdev->bd_contains == bdev) { 1069 if (bdev->bd_contains == bdev) {
1007 if (bdev->bd_disk->fops->open) { 1070 if (bdev->bd_disk->fops->open) {
1008 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); 1071 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
1009 if (ret) 1072 if (ret)
1010 goto out; 1073 goto out_unlock_bdev;
1011 } 1074 }
1012 if (bdev->bd_invalidated) 1075 if (bdev->bd_invalidated)
1013 rescan_partitions(bdev->bd_disk, bdev); 1076 rescan_partitions(bdev->bd_disk, bdev);
@@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1020 unlock_kernel(); 1083 unlock_kernel();
1021 return 0; 1084 return 0;
1022 1085
1023out_first: 1086 out_clear:
1024 bdev->bd_disk = NULL; 1087 bdev->bd_disk = NULL;
1088 bdev->bd_part = NULL;
1025 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1089 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1026 if (bdev != bdev->bd_contains) 1090 if (bdev != bdev->bd_contains)
1027 __blkdev_put(bdev->bd_contains, 1); 1091 __blkdev_put(bdev->bd_contains, 1);
1028 bdev->bd_contains = NULL; 1092 bdev->bd_contains = NULL;
1029 put_disk(disk); 1093 out_unlock_bdev:
1030 module_put(owner);
1031out:
1032 mutex_unlock(&bdev->bd_mutex); 1094 mutex_unlock(&bdev->bd_mutex);
1095 out_unlock_kernel:
1033 unlock_kernel(); 1096 unlock_kernel();
1034 if (ret) 1097
1035 bdput(bdev); 1098 disk_put_part(part);
1099 if (disk)
1100 module_put(disk->fops->owner);
1101 put_disk(disk);
1102 bdput(bdev);
1103
1036 return ret; 1104 return ret;
1037} 1105}
1038 1106
@@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1117 1185
1118 put_disk(disk); 1186 put_disk(disk);
1119 module_put(owner); 1187 module_put(owner);
1120 1188 disk_put_part(bdev->bd_part);
1121 if (bdev->bd_contains != bdev) { 1189 bdev->bd_part = NULL;
1122 kobject_put(&bdev->bd_part->dev.kobj);
1123 bdev->bd_part = NULL;
1124 }
1125 bdev->bd_disk = NULL; 1190 bdev->bd_disk = NULL;
1126 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1191 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1127 if (bdev != bdev->bd_contains) 1192 if (bdev != bdev->bd_contains)
@@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1197 1262
1198/** 1263/**
1199 * lookup_bdev - lookup a struct block_device by name 1264 * lookup_bdev - lookup a struct block_device by name
1265 * @pathname: special file representing the block device
1200 * 1266 *
1201 * @path: special file representing the block device 1267 * Get a reference to the blockdevice at @pathname in the current
1202 *
1203 * Get a reference to the blockdevice at @path in the current
1204 * namespace if possible and return it. Return ERR_PTR(error) 1268 * namespace if possible and return it. Return ERR_PTR(error)
1205 * otherwise. 1269 * otherwise.
1206 */ 1270 */
diff --git a/fs/buffer.c b/fs/buffer.c
index 38653e36e225..ac78d4c19b3b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh)
2926 BUG_ON(!buffer_mapped(bh)); 2926 BUG_ON(!buffer_mapped(bh));
2927 BUG_ON(!bh->b_end_io); 2927 BUG_ON(!bh->b_end_io);
2928 2928
2929 if (buffer_ordered(bh) && (rw == WRITE)) 2929 /*
2930 rw = WRITE_BARRIER; 2930 * Mask in barrier bit for a write (could be either a WRITE or a
 2931 * WRITE_SYNC)
2932 */
2933 if (buffer_ordered(bh) && (rw & WRITE))
2934 rw |= WRITE_BARRIER;
2931 2935
2932 /* 2936 /*
2933 * Only clear out a write error when rewriting, should this 2937 * Only clear out a write error when rewriting
2934 * include WRITE_SYNC as well?
2935 */ 2938 */
2936 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) 2939 if (test_set_buffer_req(bh) && (rw & WRITE))
2937 clear_buffer_write_io_error(bh); 2940 clear_buffer_write_io_error(bh);
2938 2941
2939 /* 2942 /*
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f5d0083e09fa..06e521a945c3 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up
4on EAGAIN from the socket too quickly on large writes). 4on EAGAIN from the socket too quickly on large writes).
5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. 5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
6Fix endian problems in acl (mode from/to cifs acl) on bigendian 6Fix endian problems in acl (mode from/to cifs acl) on bigendian
7architectures. 7architectures. Fix problems with preserving timestamps on copying open
8files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
9on parent directory when server supports Unix Extensions but not POSIX
10create. Update cifs.upcall version to handle new Kerberos sec flags
11(this requires an update of the cifs.upcall program from Samba). Fix memory leak
12on dns_upcall (resolving DFS referrals). Fix plain text password
13authentication (requires setting SecurityFlags to 0x30030 to enable
14lanman and plain text though). Fix writes to be at correct offset when
15file is open with O_APPEND and file is on a directio (forcedirectio) mount.
8 16
9Version 1.53 17Version 1.53
10------------ 18------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 2bd6fe556f88..bd2343d4c6a6 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and
542 hashing mechanisms (as "must use") on the other hand 542 hashing mechanisms (as "must use") on the other hand
543 does not make much sense. Default flags are 543 does not make much sense. Default flags are
544 0x07007 544 0x07007
545 (NTLM, NTLMv2 and packet signing allowed). Maximum 545 (NTLM, NTLMv2 and packet signing allowed). The maximum
546 allowable flags if you want to allow mounts to servers 546 allowable flags if you want to allow mounts to servers
547 using weaker password hashes is 0x37037 (lanman, 547 using weaker password hashes is 0x37037 (lanman,
548 plaintext, ntlm, ntlmv2, signing allowed): 548 plaintext, ntlm, ntlmv2, signing allowed). Some
549 SecurityFlags require the corresponding menuconfig
550 options to be enabled (lanman and plaintext require
551 CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
552          plaintext authentication currently also requires
553          enabling lanman authentication in the security flags
554          because the cifs module only supports sending
555          plaintext passwords using the older lanman dialect
556 form of the session setup SMB. (e.g. for authentication
557 using plain text passwords, set the SecurityFlags
558 to 0x30030):
549 559
550 may use packet signing 0x00001 560 may use packet signing 0x00001
551 must use packet signing 0x01001 561 must use packet signing 0x01001
@@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in
642that they represent all for that share, not just those for which the server 652that they represent all for that share, not just those for which the server
643returned success. 653returned success.
644 654
645Also note that "cat /proc/fs/cifs/DebugData" will display information about 655Also note that "cat /proc/fs/cifs/DebugData" will display information about
646the active sessions and the shares that are mounted. 656the active sessions and the shares that are mounted.
647Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is 657
648on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and 658Enabling Kerberos (extended security) works but requires version 1.2 or later
649LANMAN support do not require this helper. 659of the helper program cifs.upcall to be present and to be configured in the
660/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
661project (http://www.samba.org). NTLM, NTLMv2, and LANMAN support do not
662require this helper. Note that NTLMv2 security (which does not require the
663cifs.upcall helper program), instead of using Kerberos, is sufficient for
664some use cases.
665
666Enabling DFS support (used to access shares transparently in an MS-DFS
667global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
668addition, DFS support for target shares specified as UNC names that
669begin with host names (rather than IP addresses) requires a user space
670helper (such as cifs.upcall) to be present in order to translate host
671names to IP addresses, and the user space helper must also be
672configured in the file /etc/request-key.conf.
673
674To use cifs Kerberos and DFS support, the Linux keyutils package should be
675installed and something like the following lines should be added to the
676/etc/request-key.conf file:
677
678create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
679create dns_resolver * * /usr/local/sbin/cifs.upcall %k
680
681
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 5fabd2caf93c..1b09f1670061 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
476 unsigned int cls, con, tag, oidlen, rc; 476 unsigned int cls, con, tag, oidlen, rc;
477 bool use_ntlmssp = false; 477 bool use_ntlmssp = false;
478 bool use_kerberos = false; 478 bool use_kerberos = false;
479 bool use_mskerberos = false;
479 480
480 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ 481 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
481 482
@@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
574 *(oid + 1), *(oid + 2), *(oid + 3))); 575 *(oid + 1), *(oid + 2), *(oid + 3)));
575 576
576 if (compare_oid(oid, oidlen, MSKRB5_OID, 577 if (compare_oid(oid, oidlen, MSKRB5_OID,
577 MSKRB5_OID_LEN)) 578 MSKRB5_OID_LEN) &&
578 use_kerberos = true; 579 !use_kerberos)
580 use_mskerberos = true;
579 else if (compare_oid(oid, oidlen, KRB5_OID, 581 else if (compare_oid(oid, oidlen, KRB5_OID,
580 KRB5_OID_LEN)) 582 KRB5_OID_LEN) &&
583 !use_mskerberos)
581 use_kerberos = true; 584 use_kerberos = true;
582 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 585 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
583 NTLMSSP_OID_LEN)) 586 NTLMSSP_OID_LEN))
@@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
630 633
631 if (use_kerberos) 634 if (use_kerberos)
632 *secType = Kerberos; 635 *secType = Kerberos;
636 else if (use_mskerberos)
637 *secType = MSKerberos;
633 else if (use_ntlmssp) 638 else if (use_ntlmssp)
634 *secType = NTLMSSP; 639 *secType = NTLMSSP;
635 640
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2434ab0e8791..fcee9298b620 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
66 .describe = user_describe, 66 .describe = user_describe,
67}; 67};
68 68
69#define MAX_VER_STR_LEN 8 /* length of longest version string e.g. 69/* length of longest version string e.g. strlen("ver=0xFF") */
70 strlen("ver=0xFF") */ 70#define MAX_VER_STR_LEN 8
71#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg 71
72 in future could have strlen(";sec=ntlmsspi") */ 72/* length of longest security mechanism name, eg in future could have
73#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ 73 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13
75
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
77#define MAX_IPV6_ADDR_LEN 42
78
79/* strlen of "host=" */
80#define HOST_KEY_LEN 5
81
82/* strlen of ";ip4=" or ";ip6=" */
83#define IP_KEY_LEN 5
84
85/* strlen of ";uid=0x" */
86#define UID_KEY_LEN 7
87
88/* strlen of ";user=" */
89#define USER_KEY_LEN 6
90
74/* get a key struct with a SPNEGO security blob, suitable for session setup */ 91/* get a key struct with a SPNEGO security blob, suitable for session setup */
75struct key * 92struct key *
76cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 93cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
84 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress 101 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress
85 host=hostname sec=mechanism uid=0xFF user=username */ 102 host=hostname sec=mechanism uid=0xFF user=username */
86 desc_len = MAX_VER_STR_LEN + 103 desc_len = MAX_VER_STR_LEN +
87 6 /* len of "host=" */ + strlen(hostname) + 104 HOST_KEY_LEN + strlen(hostname) +
88 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + 105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
89 MAX_MECH_STR_LEN + 106 MAX_MECH_STR_LEN +
90 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + 107 UID_KEY_LEN + (sizeof(uid_t) * 2) +
91 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; 108 USER_KEY_LEN + strlen(sesInfo->userName) + 1;
92 109
93 spnego_key = ERR_PTR(-ENOMEM); 110 spnego_key = ERR_PTR(-ENOMEM);
94 description = kzalloc(desc_len, GFP_KERNEL); 111 description = kzalloc(desc_len, GFP_KERNEL);
@@ -114,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
114 131
115 dp = description + strlen(description); 132 dp = description + strlen(description);
116 133
117 /* for now, only sec=krb5 is valid */ 134 /* for now, only sec=krb5 and sec=mskrb5 are valid */
118 if (server->secType == Kerberos) 135 if (server->secType == Kerberos)
119 sprintf(dp, ";sec=krb5"); 136 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos)
138 sprintf(dp, ";sec=mskrb5");
120 else 139 else
121 goto out; 140 goto out;
122 141
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 05a34b17a1ab..e4041ec4d712 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -23,7 +23,7 @@
23#ifndef _CIFS_SPNEGO_H 23#ifndef _CIFS_SPNEGO_H
24#define _CIFS_SPNEGO_H 24#define _CIFS_SPNEGO_H
25 25
26#define CIFS_SPNEGO_UPCALL_VERSION 1 26#define CIFS_SPNEGO_UPCALL_VERSION 2
27 27
28/* 28/*
29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. 29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 83fd40dc1ef0..bd5f13d38450 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
294 294
295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) 295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
296 if (extended_security & CIFSSEC_MAY_PLNTXT) { 296 if (extended_security & CIFSSEC_MAY_PLNTXT) {
297 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
297 memcpy(lnm_session_key, password_with_pad, 298 memcpy(lnm_session_key, password_with_pad,
298 CIFS_ENCPWD_SIZE); 299 CIFS_ENCPWD_SIZE);
299 return; 300 return;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8da4ee761b5..25ecbd5b0404 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -175,6 +175,8 @@ out_no_root:
175 if (inode) 175 if (inode)
176 iput(inode); 176 iput(inode);
177 177
178 cifs_umount(sb, cifs_sb);
179
178out_mount_failed: 180out_mount_failed:
179 if (cifs_sb) { 181 if (cifs_sb) {
180#ifdef CONFIG_CIFS_DFS_UPCALL 182#ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 135c965c4137..f7b4a5cd837b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 41 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 42extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
43 struct nameidata *); 43 struct nameidata *);
44extern int cifs_unlink(struct inode *, struct dentry *); 44extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); 45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); 46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
47extern int cifs_mkdir(struct inode *, struct dentry *, int); 47extern int cifs_mkdir(struct inode *, struct dentry *, int);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7e1cf262effe..0d22479d99b7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -80,7 +80,8 @@ enum securityEnum {
80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
81 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 81 RawNTLMSSP, /* NTLMSSP without SPNEGO */
82 NTLMSSP, /* NTLMSSP via SPNEGO */ 82 NTLMSSP, /* NTLMSSP via SPNEGO */
83 Kerberos /* Kerberos via SPNEGO */ 83 Kerberos, /* Kerberos via SPNEGO */
84 MSKerberos, /* MS Kerberos via SPNEGO */
84}; 85};
85 86
86enum protocolEnum { 87enum protocolEnum {
@@ -308,6 +309,7 @@ struct cifs_search_info {
308 __u32 resume_key; 309 __u32 resume_key;
309 char *ntwrk_buf_start; 310 char *ntwrk_buf_start;
310 char *srch_entries_start; 311 char *srch_entries_start;
312 char *last_entry;
311 char *presume_name; 313 char *presume_name;
312 unsigned int resume_name_len; 314 unsigned int resume_name_len;
313 bool endOfSearch:1; 315 bool endOfSearch:1;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a729d083e6f4..0cff7fe986e8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
180 const FILE_BASIC_INFO *data, __u16 fid, 180 const FILE_BASIC_INFO *data, __u16 fid,
181 __u32 pid_of_opener); 181 __u32 pid_of_opener);
182extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
183 bool delete_file, __u16 fid, __u32 pid_of_opener);
182#if 0 184#if 0
183extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, 185extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
184 char *fileName, __u16 dos_attributes, 186 char *fileName, __u16 dos_attributes,
@@ -229,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
229 const struct nls_table *nls_codepage, 231 const struct nls_table *nls_codepage,
230 int remap_special_chars); 232 int remap_special_chars);
231extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 233extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
232 int netfid, char *target_name, 234 int netfid, const char *target_name,
233 const struct nls_table *nls_codepage, 235 const struct nls_table *nls_codepage,
234 int remap_special_chars); 236 int remap_special_chars);
235extern int CIFSCreateHardLink(const int xid, 237extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 994de7c90474..6f4ffe15d68d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2017,7 +2017,7 @@ renameRetry:
2017} 2017}
2018 2018
2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2020 int netfid, char *target_name, 2020 int netfid, const char *target_name,
2021 const struct nls_table *nls_codepage, int remap) 2021 const struct nls_table *nls_codepage, int remap)
2022{ 2022{
2023 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2023 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2071 remap); 2071 remap);
2072 } 2072 }
2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str); 2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; 2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
2075 byte_count += count; 2075 byte_count += count;
2076 pSMB->DataCount = cpu_to_le16(count); 2076 pSMB->DataCount = cpu_to_le16(count);
2077 pSMB->TotalDataCount = pSMB->DataCount; 2077 pSMB->TotalDataCount = pSMB->DataCount;
@@ -3614,6 +3614,8 @@ findFirstRetry:
3614 /* BB remember to free buffer if error BB */ 3614 /* BB remember to free buffer if error BB */
3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3616 if (rc == 0) { 3616 if (rc == 0) {
3617 unsigned int lnoff;
3618
3617 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3619 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3618 psrch_inf->unicode = true; 3620 psrch_inf->unicode = true;
3619 else 3621 else
@@ -3636,6 +3638,17 @@ findFirstRetry:
3636 le16_to_cpu(parms->SearchCount); 3638 le16_to_cpu(parms->SearchCount);
3637 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 3639 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
3638 psrch_inf->entries_in_buffer; 3640 psrch_inf->entries_in_buffer;
3641 lnoff = le16_to_cpu(parms->LastNameOffset);
3642 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3643 lnoff) {
3644 cERROR(1, ("ignoring corrupt resume name"));
3645 psrch_inf->last_entry = NULL;
3646 return rc;
3647 }
3648
3649 psrch_inf->last_entry = psrch_inf->srch_entries_start +
3650 lnoff;
3651
3639 *pnetfid = parms->SearchHandle; 3652 *pnetfid = parms->SearchHandle;
3640 } else { 3653 } else {
3641 cifs_buf_release(pSMB); 3654 cifs_buf_release(pSMB);
@@ -3725,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3725 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3738 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3726 3739
3727 if (rc == 0) { 3740 if (rc == 0) {
3741 unsigned int lnoff;
3742
3728 /* BB fixme add lock for file (srch_info) struct here */ 3743 /* BB fixme add lock for file (srch_info) struct here */
3729 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3744 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3730 psrch_inf->unicode = true; 3745 psrch_inf->unicode = true;
@@ -3751,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3751 le16_to_cpu(parms->SearchCount); 3766 le16_to_cpu(parms->SearchCount);
3752 psrch_inf->index_of_last_entry += 3767 psrch_inf->index_of_last_entry +=
3753 psrch_inf->entries_in_buffer; 3768 psrch_inf->entries_in_buffer;
3769 lnoff = le16_to_cpu(parms->LastNameOffset);
3770 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3771 lnoff) {
3772 cERROR(1, ("ignoring corrupt resume name"));
3773 psrch_inf->last_entry = NULL;
3774 return rc;
3775 } else
3776 psrch_inf->last_entry =
3777 psrch_inf->srch_entries_start + lnoff;
3778
3754/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3779/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
3755 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3780 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
3756 3781
@@ -4876,6 +4901,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4876 return rc; 4901 return rc;
4877} 4902}
4878 4903
4904int
4905CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4906 bool delete_file, __u16 fid, __u32 pid_of_opener)
4907{
4908 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4909 char *data_offset;
4910 int rc = 0;
4911 __u16 params, param_offset, offset, byte_count, count;
4912
4913 cFYI(1, ("Set File Disposition (via SetFileInfo)"));
4914 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4915
4916 if (rc)
4917 return rc;
4918
4919 pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
4920 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
4921
4922 params = 6;
4923 pSMB->MaxSetupCount = 0;
4924 pSMB->Reserved = 0;
4925 pSMB->Flags = 0;
4926 pSMB->Timeout = 0;
4927 pSMB->Reserved2 = 0;
4928 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4929 offset = param_offset + params;
4930
4931 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4932
4933 count = 1;
4934 pSMB->MaxParameterCount = cpu_to_le16(2);
4935 /* BB find max SMB PDU from sess */
4936 pSMB->MaxDataCount = cpu_to_le16(1000);
4937 pSMB->SetupCount = 1;
4938 pSMB->Reserved3 = 0;
4939 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
4940 byte_count = 3 /* pad */ + params + count;
4941 pSMB->DataCount = cpu_to_le16(count);
4942 pSMB->ParameterCount = cpu_to_le16(params);
4943 pSMB->TotalDataCount = pSMB->DataCount;
4944 pSMB->TotalParameterCount = pSMB->ParameterCount;
4945 pSMB->ParameterOffset = cpu_to_le16(param_offset);
4946 pSMB->DataOffset = cpu_to_le16(offset);
4947 pSMB->Fid = fid;
4948 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
4949 pSMB->Reserved4 = 0;
4950 pSMB->hdr.smb_buf_length += byte_count;
4951 pSMB->ByteCount = cpu_to_le16(byte_count);
4952 *data_offset = delete_file ? 1 : 0;
4953 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4954 if (rc)
4955 cFYI(1, ("Send error in SetFileDisposition = %d", rc));
4956
4957 return rc;
4958}
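
A hedged caller fragment for the new helper (netfid is an assumed open file handle; tagging the request with the opener's pid follows the other SetFileInfo callers in this file):

	/* mark the open file delete-on-close */
	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
				       current->tgid);
	if (rc)
		cFYI(1, ("SetFileDisposition failed rc = %d", rc));
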
4879 4959
4880int 4960int
4881CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 4961CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0711db65afe8..4c13bcdb92a5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE]; 3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
3599 bool ntlmv2_flag = false; 3599 bool ntlmv2_flag = false;
3600 int first_time = 0; 3600 int first_time = 0;
3601 struct TCP_Server_Info *server = pSesInfo->server;
3601 3602
3602 /* what if server changes its buffer size after dropping the session? */ 3603 /* what if server changes its buffer size after dropping the session? */
3603 if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { 3604 if (server->maxBuf == 0) /* no need to send on reconnect */ {
3604 rc = CIFSSMBNegotiate(xid, pSesInfo); 3605 rc = CIFSSMBNegotiate(xid, pSesInfo);
3605 if (rc == -EAGAIN) /* retry only once on 1st time connection */ { 3606 if (rc == -EAGAIN) {
3607 /* retry only once on 1st time connection */
3606 rc = CIFSSMBNegotiate(xid, pSesInfo); 3608 rc = CIFSSMBNegotiate(xid, pSesInfo);
3607 if (rc == -EAGAIN) 3609 if (rc == -EAGAIN)
3608 rc = -EHOSTDOWN; 3610 rc = -EHOSTDOWN;
3609 } 3611 }
3610 if (rc == 0) { 3612 if (rc == 0) {
3611 spin_lock(&GlobalMid_Lock); 3613 spin_lock(&GlobalMid_Lock);
3612 if (pSesInfo->server->tcpStatus != CifsExiting) 3614 if (server->tcpStatus != CifsExiting)
3613 pSesInfo->server->tcpStatus = CifsGood; 3615 server->tcpStatus = CifsGood;
3614 else 3616 else
3615 rc = -EHOSTDOWN; 3617 rc = -EHOSTDOWN;
3616 spin_unlock(&GlobalMid_Lock); 3618 spin_unlock(&GlobalMid_Lock);
@@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3623 goto ss_err_exit; 3625 goto ss_err_exit;
3624 3626
3625 pSesInfo->flags = 0; 3627 pSesInfo->flags = 0;
3626 pSesInfo->capabilities = pSesInfo->server->capabilities; 3628 pSesInfo->capabilities = server->capabilities;
3627 if (linuxExtEnabled == 0) 3629 if (linuxExtEnabled == 0)
3628 pSesInfo->capabilities &= (~CAP_UNIX); 3630 pSesInfo->capabilities &= (~CAP_UNIX);
3629 /* pSesInfo->sequence_number = 0;*/ 3631 /* pSesInfo->sequence_number = 0;*/
3630 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 3632 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
3631 pSesInfo->server->secMode, 3633 server->secMode, server->capabilities, server->timeAdj));
3632 pSesInfo->server->capabilities, 3634
3633 pSesInfo->server->timeAdj));
3634 if (experimEnabled < 2) 3635 if (experimEnabled < 2)
3635 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 3636 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
3636 else if (extended_security 3637 else if (extended_security
3637 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3638 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3638 && (pSesInfo->server->secType == NTLMSSP)) { 3639 && (server->secType == NTLMSSP)) {
3639 rc = -EOPNOTSUPP; 3640 rc = -EOPNOTSUPP;
3640 } else if (extended_security 3641 } else if (extended_security
3641 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3642 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3642 && (pSesInfo->server->secType == RawNTLMSSP)) { 3643 && (server->secType == RawNTLMSSP)) {
3643 cFYI(1, ("NTLMSSP sesssetup")); 3644 cFYI(1, ("NTLMSSP sesssetup"));
3644 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, 3645 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
3645 nls_info); 3646 nls_info);
@@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3668 3669
3669 } else { 3670 } else {
3670 SMBNTencrypt(pSesInfo->password, 3671 SMBNTencrypt(pSesInfo->password,
3671 pSesInfo->server->cryptKey, 3672 server->cryptKey,
3672 ntlm_session_key); 3673 ntlm_session_key);
3673 3674
3674 if (first_time) 3675 if (first_time)
3675 cifs_calculate_mac_key( 3676 cifs_calculate_mac_key(
3676 &pSesInfo->server->mac_signing_key, 3677 &server->mac_signing_key,
3677 ntlm_session_key, 3678 ntlm_session_key,
3678 pSesInfo->password); 3679 pSesInfo->password);
3679 } 3680 }
@@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3686 nls_info); 3687 nls_info);
3687 } 3688 }
3688 } else { /* old style NTLM 0.12 session setup */ 3689 } else { /* old style NTLM 0.12 session setup */
3689 SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, 3690 SMBNTencrypt(pSesInfo->password, server->cryptKey,
3690 ntlm_session_key); 3691 ntlm_session_key);
3691 3692
3692 if (first_time) 3693 if (first_time)
3693 cifs_calculate_mac_key( 3694 cifs_calculate_mac_key(&server->mac_signing_key,
3694 &pSesInfo->server->mac_signing_key, 3695 ntlm_session_key,
3695 ntlm_session_key, pSesInfo->password); 3696 pSesInfo->password);
3696 3697
3697 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); 3698 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
3698 } 3699 }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index f730ef35499e..1e0c1bd8f2e4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,38 +29,13 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32static int dns_resolver_instantiate(struct key *key, const void *data,
33 size_t datalen)
34{
35 int rc = 0;
36 char *ip;
37
38 ip = kmalloc(datalen+1, GFP_KERNEL);
39 if (!ip)
40 return -ENOMEM;
41
42 memcpy(ip, data, datalen);
43 ip[datalen] = '\0';
44
45 rcu_assign_pointer(key->payload.data, ip);
46
47 return rc;
48}
49
50struct key_type key_type_dns_resolver = {
51 .name = "dns_resolver",
52 .def_datalen = sizeof(struct in_addr),
53 .describe = user_describe,
54 .instantiate = dns_resolver_instantiate,
55 .match = user_match,
56};
57
58/* Checks if supplied name is IP address 32/* Checks if supplied name is IP address
59 * returns: 33 * returns:
60 * 1 - name is IP 34 * 1 - name is IP
61 * 0 - name is not IP 35 * 0 - name is not IP
62 */ 36 */
63static int is_ip(const char *name) 37static int
38is_ip(const char *name)
64{ 39{
65 int rc; 40 int rc;
66 struct sockaddr_in sin_server; 41 struct sockaddr_in sin_server;
@@ -82,6 +57,47 @@ static int is_ip(const char *name)
82 return 0; 57 return 0;
83} 58}
84 59
60static int
61dns_resolver_instantiate(struct key *key, const void *data,
62 size_t datalen)
63{
64 int rc = 0;
65 char *ip;
66
67 ip = kmalloc(datalen + 1, GFP_KERNEL);
68 if (!ip)
69 return -ENOMEM;
70
71 memcpy(ip, data, datalen);
72 ip[datalen] = '\0';
73
74 /* make sure this looks like an address */
75 if (!is_ip((const char *) ip)) {
76 kfree(ip);
77 return -EINVAL;
78 }
79
80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip);
82
83 return rc;
84}
85
86static void
87dns_resolver_destroy(struct key *key)
88{
89 kfree(key->payload.data);
90}
91
92struct key_type key_type_dns_resolver = {
93 .name = "dns_resolver",
94 .def_datalen = sizeof(struct in_addr),
95 .describe = user_describe,
96 .instantiate = dns_resolver_instantiate,
97 .destroy = dns_resolver_destroy,
98 .match = user_match,
99};
100
85/* Resolves server name to ip address. 101/* Resolves server name to ip address.
86 * input: 102 * input:
87 * unc - server UNC 103 * unc - server UNC
@@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
133 149
134 rkey = request_key(&key_type_dns_resolver, name, ""); 150 rkey = request_key(&key_type_dns_resolver, name, "");
135 if (!IS_ERR(rkey)) { 151 if (!IS_ERR(rkey)) {
152 len = rkey->type_data.x[0];
136 data = rkey->payload.data; 153 data = rkey->payload.data;
137 } else { 154 } else {
138 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 155 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
141 158
142skip_upcall: 159skip_upcall:
143 if (data) { 160 if (data) {
144 len = strlen(data); 161 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
145 *ip_addr = kmalloc(len+1, GFP_KERNEL);
146 if (*ip_addr) { 162 if (*ip_addr) {
147 memcpy(*ip_addr, data, len); 163 memcpy(*ip_addr, data, len + 1);
148 (*ip_addr)[len] = '\0';
149 if (!IS_ERR(rkey)) 164 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 165 cFYI(1, ("%s: resolved: %s to %s", __func__,
151 name, 166 name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff14d14903a0..c4a8a0605125 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
107 107
108 /* want handles we can use to read with first 108 /* want handles we can use to read with first
109 in the list so we do not have to walk the 109 in the list so we do not have to walk the
110 list to search for one in prepare_write */ 110 list to search for one in write_begin */
111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) { 111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
112 list_add_tail(&pCifsFile->flist, 112 list_add_tail(&pCifsFile->flist,
113 &pCifsInode->openFileList); 113 &pCifsInode->openFileList);
@@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
833 return -EBADF; 833 return -EBADF;
834 open_file = (struct cifsFileInfo *) file->private_data; 834 open_file = (struct cifsFileInfo *) file->private_data;
835 835
836 rc = generic_write_checks(file, poffset, &write_size, 0);
837 if (rc)
838 return rc;
839
836 xid = GetXid(); 840 xid = GetXid();
837 841
838 if (*poffset > file->f_path.dentry->d_inode->i_size) 842 if (*poffset > file->f_path.dentry->d_inode->i_size)
@@ -911,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
911} 915}
912 916
913static ssize_t cifs_write(struct file *file, const char *write_data, 917static ssize_t cifs_write(struct file *file, const char *write_data,
914 size_t write_size, loff_t *poffset) 918 size_t write_size, loff_t *poffset)
915{ 919{
916 int rc = 0; 920 int rc = 0;
917 unsigned int bytes_written = 0; 921 unsigned int bytes_written = 0;
@@ -1061,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1061struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1065struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1062{ 1066{
1063 struct cifsFileInfo *open_file; 1067 struct cifsFileInfo *open_file;
1068 bool any_available = false;
1064 int rc; 1069 int rc;
1065 1070
1066 /* Having a null inode here (because mapping->host was set to zero by 1071 /* Having a null inode here (because mapping->host was set to zero by
@@ -1076,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1076 read_lock(&GlobalSMBSeslock); 1081 read_lock(&GlobalSMBSeslock);
1077refind_writable: 1082refind_writable:
1078 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1083 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1079 if (open_file->closePend) 1084 if (open_file->closePend ||
1085 (!any_available && open_file->pid != current->tgid))
1080 continue; 1086 continue;
1087
1081 if (open_file->pfile && 1088 if (open_file->pfile &&
1082 ((open_file->pfile->f_flags & O_RDWR) || 1089 ((open_file->pfile->f_flags & O_RDWR) ||
1083 (open_file->pfile->f_flags & O_WRONLY))) { 1090 (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1127,6 +1134,11 @@ refind_writable:
1127 of the loop here. */ 1134 of the loop here. */
1128 } 1135 }
1129 } 1136 }
 1137	/* couldn't find usable FH with same pid, try any available */
1138 if (!any_available) {
1139 any_available = true;
1140 goto refind_writable;
1141 }
1130 read_unlock(&GlobalSMBSeslock); 1142 read_unlock(&GlobalSMBSeslock);
1131 return NULL; 1143 return NULL;
1132} 1144}
@@ -1443,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1443 return rc; 1455 return rc;
1444} 1456}
1445 1457
1446static int cifs_commit_write(struct file *file, struct page *page, 1458static int cifs_write_end(struct file *file, struct address_space *mapping,
1447 unsigned offset, unsigned to) 1459 loff_t pos, unsigned len, unsigned copied,
1460 struct page *page, void *fsdata)
1448{ 1461{
1449 int xid; 1462 int rc;
1450 int rc = 0; 1463 struct inode *inode = mapping->host;
1451 struct inode *inode = page->mapping->host;
1452 loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1453 char *page_data;
1454 1464
1455 xid = GetXid(); 1465 cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
1456 cFYI(1, ("commit write for page %p up to position %lld for %d", 1466 page, pos, copied));
1457 page, position, to)); 1467
1458 spin_lock(&inode->i_lock); 1468 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
1459 if (position > inode->i_size) 1469 SetPageUptodate(page);
1460 i_size_write(inode, position);
1461 1470
1462 spin_unlock(&inode->i_lock);
1463 if (!PageUptodate(page)) { 1471 if (!PageUptodate(page)) {
1464 position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; 1472 char *page_data;
1465 /* can not rely on (or let) writepage write this data */ 1473 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1466 if (to < offset) { 1474 int xid;
1467 cFYI(1, ("Illegal offsets, can not copy from %d to %d", 1475
1468 offset, to)); 1476 xid = GetXid();
1469 FreeXid(xid);
1470 return rc;
1471 }
1472 /* this is probably better than directly calling 1477 /* this is probably better than directly calling
1473 partialpage_write since in this function the file handle is 1478 partialpage_write since in this function the file handle is
1474 known which we might as well leverage */ 1479 known which we might as well leverage */
1475 /* BB check if anything else missing out of ppw 1480 /* BB check if anything else missing out of ppw
1476 such as updating last write time */ 1481 such as updating last write time */
1477 page_data = kmap(page); 1482 page_data = kmap(page);
1478 rc = cifs_write(file, page_data + offset, to-offset, 1483 rc = cifs_write(file, page_data + offset, copied, &pos);
1479 &position); 1484 /* if (rc < 0) should we set writebehind rc? */
1480 if (rc > 0)
1481 rc = 0;
1482 /* else if (rc < 0) should we set writebehind rc? */
1483 kunmap(page); 1485 kunmap(page);
1486
1487 FreeXid(xid);
1484 } else { 1488 } else {
1489 rc = copied;
1490 pos += copied;
1485 set_page_dirty(page); 1491 set_page_dirty(page);
1486 } 1492 }
1487 1493
1488 FreeXid(xid); 1494 if (rc > 0) {
1495 spin_lock(&inode->i_lock);
1496 if (pos > inode->i_size)
1497 i_size_write(inode, pos);
1498 spin_unlock(&inode->i_lock);
1499 }
1500
1501 unlock_page(page);
1502 page_cache_release(page);
1503
1489 return rc; 1504 return rc;
1490} 1505}
1491 1506
@@ -2031,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
 	return true;
 }
 
-static int cifs_prepare_write(struct file *file, struct page *page,
-	unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
 {
-	int rc = 0;
-	loff_t i_size;
-	loff_t offset;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
-	if (PageUptodate(page))
+	*pagep = __grab_cache_page(mapping, index);
+	if (!*pagep)
+		return -ENOMEM;
+
+	if (PageUptodate(*pagep))
 		return 0;
 
 	/* If we are writing a full page it will be up to date,
 	   no need to read from the server */
-	if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
-		SetPageUptodate(page);
+	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
 		return 0;
-	}
 
-	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	i_size = i_size_read(page->mapping->host);
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		int rc;
 
-	if ((offset >= i_size) ||
-	    ((from == 0) && (offset + to) >= i_size)) {
-		/*
-		 * We don't need to read data beyond the end of the file.
-		 * zero it, and set the page uptodate
-		 */
-		simple_prepare_write(file, page, from, to);
-		SetPageUptodate(page);
-	} else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
 		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, page, &offset);
+		rc = cifs_readpage_worker(file, *pagep, &offset);
+
+		/* we do not need to pass errors back
+		   e.g. if we do not have read access to the file
+		   because cifs_write_end will attempt synchronous writes
+		   -- shaggy */
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
-		   this will be written out by commit_write so is fine */
+		   this will be written out by write_end so is fine */
 	}
 
-	/* we do not need to pass errors back
-	   e.g. if we do not have read access to the file
-	   because cifs_commit_write will do the right thing. -- shaggy */
-
 	return 0;
 }
 
@@ -2082,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
@@ -2098,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
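
For reference: in the write_begin/write_end scheme adopted above, the filesystem (not the VFS) grabs and locks the pagecache page, and must unlock and release it again in write_end, which also returns the number of bytes accepted. A minimal sketch of that contract against the same 2.6.27-era calls used in this patch ("myfs" is hypothetical; this is an outline, not a buildable module):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;

	/* write_begin hands back a locked, referenced pagecache page */
	*pagep = __grab_cache_page(mapping, index);
	if (!*pagep)
		return -ENOMEM;
	return 0;
}

static int myfs_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	/* sketch assumes full-page, uptodate writes; cifs_write_end above
	   shows the extra work a partial page needs */
	if (copied == PAGE_CACHE_SIZE)
		SetPageUptodate(page);
	set_page_dirty(page);

	/* i_size is only extended after the copy has succeeded */
	spin_lock(&inode->i_lock);
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);
	spin_unlock(&inode->i_lock);

	/* write_end owns the unlock and the reference drop */
	unlock_page(page);
	page_cache_release(page);
	return copied;
}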
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 28a22092d450..a8c833345fc9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode,
 			if ((inode->i_mode & S_IWUGO) == 0 &&
 			    (attr & ATTR_READONLY) == 0)
 				inode->i_mode |= (S_IWUGO & default_mode);
-			inode->i_mode &= ~S_IFMT;
+
+		inode->i_mode &= ~S_IFMT;
 		}
 		/* clear write bits if ATTR_READONLY is set */
 		if (attr & ATTR_READONLY)
@@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &simple_dir_operations;
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
+	} else if (rc) {
 		_FreeXid(xid);
 		iget_failed(inode);
 		return ERR_PTR(rc);
@@ -663,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
-int cifs_unlink(struct inode *inode, struct dentry *direntry)
+static int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
+		   char *full_path, __u32 dosattr)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid;
+	__u32 netpid;
+	bool set_time = false;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	FILE_BASIC_INFO	info_buf;
+
+	if (attrs->ia_valid & ATTR_ATIME) {
+		set_time = true;
+		info_buf.LastAccessTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+	} else
+		info_buf.LastAccessTime = 0;
+
+	if (attrs->ia_valid & ATTR_MTIME) {
+		set_time = true;
+		info_buf.LastWriteTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+	} else
+		info_buf.LastWriteTime = 0;
+
+	/*
+	 * Samba throws this field away, but windows may actually use it.
+	 * Do not set ctime unless other time stamps are changed explicitly
+	 * (i.e. by utimes()) since we would then have a mix of client and
+	 * server times.
+	 */
+	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+		cFYI(1, ("CIFS - CTIME changed"));
+		info_buf.ChangeTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+	} else
+		info_buf.ChangeTime = 0;
+
+	info_buf.CreationTime = 0;	/* don't change */
+	info_buf.Attributes = cpu_to_le32(dosattr);
+
+	/*
+	 * If the file is already open for write, just use that fileid
+	 */
+	open_file = find_writable_file(cifsInode);
+	if (open_file) {
+		netfid = open_file->netfid;
+		netpid = open_file->pid;
+		goto set_via_filehandle;
+	}
+
+	/*
+	 * NT4 apparently returns success on this call, but it doesn't
+	 * really work.
+	 */
+	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
+		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+				     &info_buf, cifs_sb->local_nls,
+				     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc == 0) {
+			cifsInode->cifsAttrs = dosattr;
+			goto out;
+		} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+			goto out;
+	}
+
+	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server"));
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
+			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR, &netfid, &oplock,
+			 NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc != 0) {
+		if (rc == -EIO)
+			rc = -EINVAL;
+		goto out;
+	}
+
+	netpid = current->tgid;
+
+set_via_filehandle:
+	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+	if (!rc)
+		cifsInode->cifsAttrs = dosattr;
+
+	if (open_file == NULL)
+		CIFSSMBClose(xid, pTcon, netfid);
+	else
+		atomic_dec(&open_file->wrtPending);
+out:
+	return rc;
+}
+
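The function above encodes a fallback ladder that recurs throughout this patch: reuse an already-open writable filehandle if one exists, try the cheaper path-based SetPathInfo next, and only then pay for a temporary open. A self-contained userspace sketch of that ladder (every function here is an illustrative stub, not a CIFS API):

#include <errno.h>
#include <stdio.h>

static int have_open_handle(void) { return 0; }	/* stub: no open handle */
static int set_by_handle(int fid)
{
	printf("set via handle %d\n", fid);
	return 0;
}
static int set_by_path(const char *p)
{
	(void)p;
	return -EOPNOTSUPP;	/* stub: pretend an NT4-style server */
}
static int open_tmp_handle(const char *p) { (void)p; return 7; }
static void close_handle(int fid) { (void)fid; }

static int set_file_info(const char *path)
{
	int rc, fid;

	if (have_open_handle())
		return set_by_handle(3);	/* 3: the existing fid */

	rc = set_by_path(path);
	if (rc != -EOPNOTSUPP)
		return rc;

	/* path-based call unsupported: fall back to a short-lived open */
	fid = open_tmp_handle(path);
	if (fid < 0)
		return fid;
	rc = set_by_handle(fid);
	close_handle(fid);
	return rc;
}

int main(void) { return set_file_info("/share/file") ? 1 : 0; }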
+/*
+ * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+static int
+cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
+{
+	int oplock = 0;
+	int rc;
+	__u16 netfid;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	__u32 dosattr;
+	FILE_BASIC_INFO *info_buf;
+
+	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+			 DELETE|FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+			 &netfid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
+
+	/* set ATTR_HIDDEN and clear ATTR_READONLY */
+	cifsInode = CIFS_I(inode);
+	dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+	if (dosattr == 0)
+		dosattr |= ATTR_NORMAL;
+	dosattr |= ATTR_HIDDEN;
+
+	info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+	if (info_buf == NULL) {
+		rc = -ENOMEM;
+		goto out_close;
+	}
+	info_buf->Attributes = cpu_to_le32(dosattr);
+	rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
+	kfree(info_buf);
+	if (rc != 0)
+		goto out_close;
+	cifsInode->cifsAttrs = dosattr;
+
+	/* silly-rename the file */
+	CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+			      cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/* set DELETE_ON_CLOSE */
+	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
+
+	/*
+	 * some samba versions return -ENOENT when we try to set the file
+	 * disposition here. Likely a samba bug, but work around it for now
+	 */
+	if (rc == -ENOENT)
+		rc = 0;
+
+out_close:
+	CIFSSMBClose(xid, tcon, netfid);
+out:
+	return rc;
+}
+
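The attribute fix-up above reappears in cifs_unlink below and is plain bit arithmetic: clear ATTR_READONLY, substitute ATTR_NORMAL if nothing remains (an all-zero Attributes field would mean "leave attributes unchanged"), then add ATTR_HIDDEN. A self-contained demo using the standard DOS attribute values:

#include <assert.h>

#define ATTR_READONLY	0x0001
#define ATTR_HIDDEN	0x0002
#define ATTR_ARCHIVE	0x0020
#define ATTR_NORMAL	0x0080

static unsigned int silly_rename_attrs(unsigned int cifsAttrs)
{
	unsigned int dosattr = cifsAttrs & ~ATTR_READONLY;

	if (dosattr == 0)
		dosattr |= ATTR_NORMAL;	/* 0 would mean "no change" */
	dosattr |= ATTR_HIDDEN;		/* hide the renamed-away file */
	return dosattr;
}

int main(void)
{
	/* a read-only file ends up hidden and writable, not read-only */
	assert(silly_rename_attrs(ATTR_READONLY) ==
	       (ATTR_NORMAL | ATTR_HIDDEN));
	/* attributes other than read-only are preserved */
	assert(silly_rename_attrs(ATTR_ARCHIVE) ==
	       (ATTR_ARCHIVE | ATTR_HIDDEN));
	return 0;
}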
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
-	struct cifsInodeInfo *cifsInode;
-	FILE_BASIC_INFO *pinfo_buf;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct super_block *sb = dir->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct iattr *attrs = NULL;
+	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, inode = 0x%p", inode));
+	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
 	xid = GetXid();
 
-	if (inode)
-		cifs_sb = CIFS_SB(inode->i_sb);
-	else
-		cifs_sb = CIFS_SB(direntry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	/* Unlink can be called from rename so we can not grab the sem here
-	   since we deadlock otherwise */
-/*	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
-	full_path = build_path_from_dentry(direntry);
-/*	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
+	/* Unlink can be called from rename so we can not take the
+	 * sb->s_vfs_rename_mutex here */
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
 	}
 
-	if ((pTcon->ses->capabilities & CAP_UNIX) &&
+	if ((tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
-			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
-		rc = CIFSPOSIXDelFile(xid, pTcon, full_path,
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 		cFYI(1, ("posix del rc %d", rc));
@@ -704,125 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 			goto psx_del_no_retry;
 	}
 
-	rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls,
+retry_std_delete:
+	rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
 psx_del_no_retry:
 	if (!rc) {
-		if (direntry->d_inode)
-			drop_nlink(direntry->d_inode);
+		if (inode)
+			drop_nlink(inode);
 	} else if (rc == -ENOENT) {
-		d_drop(direntry);
+		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
-		int oplock = 0;
-		__u16 netfid;
-
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE,
-				 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
-				 &netfid, &oplock, NULL, cifs_sb->local_nls,
-				 cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
-					      cifs_sb->local_nls,
-					      cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-			if (direntry->d_inode)
-				drop_nlink(direntry->d_inode);
+		rc = cifs_rename_pending_delete(full_path, inode, xid);
+		if (rc == 0)
+			drop_nlink(inode);
+	} else if (rc == -EACCES && dosattr == 0) {
+		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+		if (attrs == NULL) {
+			rc = -ENOMEM;
+			goto out_reval;
 		}
-	} else if (rc == -EACCES) {
-		/* try only if r/o attribute set in local lookup data? */
-		pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
-		if (pinfo_buf) {
-			/* ATTRS set to normal clears r/o bit */
-			pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
-			if (!(pTcon->ses->flags & CIFS_SES_NT4))
-				rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
-						pinfo_buf,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-			else
-				rc = -EOPNOTSUPP;
 
-			if (rc == -EOPNOTSUPP) {
-				int oplock = 0;
-				__u16 netfid;
-			/*	rc = CIFSSMBSetAttrLegacy(xid, pTcon,
-							  full_path,
-							  (__u16)ATTR_NORMAL,
-							  cifs_sb->local_nls);
-			   For some strange reason it seems that NT4 eats the
-			   old setattr call without actually setting the
-			   attributes so on to the third attempted workaround
-			   */
-
-			/* BB could scan to see if we already have it open
-			   and pass in pid of opener to function */
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, SYNCHRONIZE |
-						 FILE_WRITE_ATTRIBUTES, 0,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					rc = CIFSSMBSetFileInfo(xid, pTcon,
-								pinfo_buf,
-								netfid,
-								current->tgid);
-					CIFSSMBClose(xid, pTcon, netfid);
-				}
-			}
-			kfree(pinfo_buf);
-		}
-		if (rc == 0) {
-			rc = CIFSSMBDelFile(xid, pTcon, full_path,
-					    cifs_sb->local_nls,
-					    cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (!rc) {
-				if (direntry->d_inode)
-					drop_nlink(direntry->d_inode);
-			} else if (rc == -ETXTBSY) {
-				int oplock = 0;
-				__u16 netfid;
-
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, DELETE,
-						 CREATE_NOT_DIR |
-						 CREATE_DELETE_ON_CLOSE,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					CIFSSMBRenameOpenFile(xid, pTcon,
-						netfid, NULL,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-					CIFSSMBClose(xid, pTcon, netfid);
-					if (direntry->d_inode)
-						drop_nlink(direntry->d_inode);
-				}
-			/* BB if rc = -ETXTBUSY goto the rename logic BB */
-			}
-		}
-	}
-	if (direntry->d_inode) {
-		cifsInode = CIFS_I(direntry->d_inode);
-		cifsInode->time = 0;	/* will force revalidate to get info
-					   when needed */
-		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
+		/* try to reset dos attributes */
+		origattr = cifsInode->cifsAttrs;
+		if (origattr == 0)
+			origattr |= ATTR_NORMAL;
+		dosattr = origattr & ~ATTR_READONLY;
+		if (dosattr == 0)
+			dosattr |= ATTR_NORMAL;
+		dosattr |= ATTR_HIDDEN;
+
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		if (rc != 0)
+			goto out_reval;
+
+		goto retry_std_delete;
 	}
+
+	/* undo the setattr if we errored out and it's needed */
+	if (rc != 0 && dosattr != 0)
+		cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
+out_reval:
 	if (inode) {
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
-		cifsInode->time = 0;	/* force revalidate of dir as well */
+		cifsInode->time = 0;	/* will force revalidate to get info
+					   when needed */
+		inode->i_ctime = current_fs_time(sb);
 	}
+	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+	cifsInode = CIFS_I(dir);
+	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
 
 	kfree(full_path);
+	kfree(attrs);
 	FreeXid(xid);
 	return rc;
 }
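
cifs_unlink now retries at most once: on the first -EACCES it strips ATTR_READONLY (via cifs_set_file_info) and jumps back to retry_std_delete, and if the retry still fails it restores the original attributes. The same control flow as a self-contained sketch with stubbed-out operations (delete_file/set_attrs are illustrative stands-ins, and the hex values stand in for ATTR_NORMAL | ATTR_HIDDEN):

#include <errno.h>

/* stub: fail with EACCES on the first attempt only */
static int delete_file(const char *p, int attempt)
{
	(void)p;
	return attempt ? 0 : -EACCES;
}
static int set_attrs(const char *p, unsigned int a)
{
	(void)p; (void)a;
	return 0;
}

static int unlink_with_retry(const char *path, unsigned int origattr)
{
	unsigned int dosattr = 0;
	int attempt = 0;
	int rc;

retry:
	rc = delete_file(path, attempt);
	if (rc == -EACCES && dosattr == 0) {
		/* first failure only: clear read-only and retry once */
		dosattr = 0x80 | 0x02;	/* ATTR_NORMAL | ATTR_HIDDEN */
		rc = set_attrs(path, dosattr);
		if (rc)
			return rc;
		attempt++;
		goto retry;
	}
	if (rc && dosattr)	/* retry failed: put attributes back */
		set_attrs(path, origattr);
	return rc;
}

int main(void) { return unlink_with_retry("file", 0x01); }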
@@ -867,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
 
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
-	int rc = 0;
+	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
@@ -929,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_get_info;
 		}
+
 		/* Is an i_ino of zero legal? */
 		/* Are there sanity checks we can use to ensure that
 		   the server is really filling in that field? */
@@ -1017,12 +1116,20 @@ mkdir_get_info:
 		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
 		    (mode & S_IWUGO) == 0) {
 			FILE_BASIC_INFO pInfo;
+			struct cifsInodeInfo *cifsInode;
+			u32 dosattrs;
+
 			memset(&pInfo, 0, sizeof(pInfo));
-			pInfo.Attributes = cpu_to_le32(ATTR_READONLY);
-			CIFSSMBSetPathInfo(xid, pTcon, full_path,
-					   &pInfo, cifs_sb->local_nls,
+			cifsInode = CIFS_I(newinode);
+			dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+			pInfo.Attributes = cpu_to_le32(dosattrs);
+			tmprc = CIFSSMBSetPathInfo(xid, pTcon,
+					full_path, &pInfo,
+					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (tmprc == 0)
+				cifsInode->cifsAttrs = dosattrs;
 		}
 		if (direntry->d_inode) {
 			if (cifs_sb->mnt_cifs_flags &
@@ -1094,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static int
+cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
+	       struct dentry *to_dentry, const char *toPath)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	__u16 srcfid;
+	int oplock, rc;
+
+	/* try path-based rename first */
+	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
+			   cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/*
+	 * don't bother with rename by filehandle unless file is busy and
+	 * source Note that cross directory moves do not work with
+	 * rename by filehandle to various Windows servers.
+	 */
+	if (rc == 0 || rc != -ETXTBSY)
+		return rc;
+
+	/* open the file to be renamed -- we need DELETE perms */
+	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
+			 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc == 0) {
+		rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+				(const char *) to_dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		CIFSSMBClose(xid, pTcon, srcfid);
+	}
+
+	return rc;
+}
+
 int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 	struct inode *target_inode, struct dentry *target_direntry)
 {
-	char *fromName;
-	char *toName;
+	char *fromName = NULL;
+	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
 	struct cifsTconInfo *pTcon;
+	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid;
-	int rc = 0;
-
-	xid = GetXid();
+	int rc;
 
 	cifs_sb_target = CIFS_SB(target_inode->i_sb);
 	cifs_sb_source = CIFS_SB(source_inode->i_sb);
 	pTcon = cifs_sb_source->tcon;
 
+	xid = GetXid();
+
+	/*
+	 * BB: this might be allowed if same server, but different share.
+	 * Consider adding support for this
+	 */
 	if (pTcon != cifs_sb_target->tcon) {
-		FreeXid(xid);
-		return -EXDEV;	/* BB actually could be allowed if same server,
-				   but different share.
-				   Might eventually add support for this */
+		rc = -EXDEV;
+		goto cifs_rename_exit;
 	}
 
-	/* we already have the rename sem so we do not need to grab it again
-	   here to protect the path integrity */
+	/*
+	 * we already have the rename sem so we do not need to
+	 * grab it again here to protect the path integrity
+	 */
 	fromName = build_path_from_dentry(source_direntry);
+	if (fromName == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
+	}
+
 	toName = build_path_from_dentry(target_direntry);
-	if ((fromName == NULL) || (toName == NULL)) {
+	if (toName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	rc = CIFSSMBRename(xid, pTcon, fromName, toName,
-			   cifs_sb_source->local_nls,
-			   cifs_sb_source->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	rc = cifs_do_rename(xid, source_direntry, fromName,
+			    target_direntry, toName);
+
 	if (rc == -EEXIST) {
-		/* check if they are the same file because rename of hardlinked
-		   files is a noop */
-		FILE_UNIX_BASIC_INFO *info_buf_source;
-		FILE_UNIX_BASIC_INFO *info_buf_target;
-
-		info_buf_source =
-			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-		if (info_buf_source != NULL) {
+		if (pTcon->unix_ext) {
+			/*
+			 * Are src and dst hardlinks of same inode? We can
+			 * only tell with unix extensions enabled
+			 */
+			info_buf_source =
+				kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+					GFP_KERNEL);
+			if (info_buf_source == NULL)
+				goto unlink_target;
+
 			info_buf_target = info_buf_source + 1;
-			if (pTcon->unix_ext)
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
-					info_buf_source,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			/* else rc is still EEXIST so will fall through to
-			   unlink the target and retry rename */
-			if (rc == 0) {
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName,
-						info_buf_target,
-						cifs_sb_target->local_nls,
-						/* remap based on source sb */
-						cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			if ((rc == 0) &&
-			    (info_buf_source->UniqueId ==
-			     info_buf_target->UniqueId)) {
-				/* do not rename since the files are hardlinked
-				   which is a noop */
-			} else {
-				/* we either can not tell the files are
-				   hardlinked (as with Windows servers) or
-				   files are not hardlinked so delete the
-				   target manually before renaming to follow
-				   POSIX rather than Windows semantics */
-				cifs_unlink(target_inode, target_direntry);
-				rc = CIFSSMBRename(xid, pTcon, fromName,
-						   toName,
-						   cifs_sb_source->local_nls,
-						   cifs_sb_source->mnt_cifs_flags
-						   & CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			kfree(info_buf_source);
-		} /* if we can not get memory just leave rc as EEXIST */
-	}
-
-	if (rc)
-		cFYI(1, ("rename rc %d", rc));
-
-	if ((rc == -EIO) || (rc == -EEXIST)) {
-		int oplock = 0;
-		__u16 netfid;
-
-		/* BB FIXME Is Generic Read correct for rename? */
-		/* if renaming directory - we should not say CREATE_NOT_DIR,
-		   need to test renaming open directory, also GENERIC_READ
-		   might not right be right access to request */
-		rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
-				 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-				 cifs_sb_source->local_nls,
-				 cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-		}
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
+					info_buf_source,
+					cifs_sb_source->local_nls,
+					cifs_sb_source->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (rc != 0)
+				goto unlink_target;
+
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon,
+					toName, info_buf_target,
+					cifs_sb_target->local_nls,
+					/* remap based on source sb */
+					cifs_sb_source->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+			if (rc == 0 && (info_buf_source->UniqueId ==
+					info_buf_target->UniqueId))
+				/* same file, POSIX says that this is a noop */
+				goto cifs_rename_exit;
+		} /* else ... BB we could add the same check for Windows by
+		     checking the UniqueId via FILE_INTERNAL_INFO */
+unlink_target:
+		/*
+		 * we either can not tell the files are hardlinked (as with
+		 * Windows servers) or files are not hardlinked. Delete the
+		 * target manually before renaming to follow POSIX rather than
+		 * Windows semantics
+		 */
+		cifs_unlink(target_inode, target_direntry);
+		rc = cifs_do_rename(xid, source_direntry, fromName,
+				    target_direntry, toName);
 	}
 
 cifs_rename_exit:
+	kfree(info_buf_source);
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
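
The UniqueId comparison above exists because POSIX requires rename() of one hardlink over another to be a no-op, and only the unix extensions expose an inode identity the client can check. The userspace equivalent of that test compares (st_dev, st_ino):

#include <stdio.h>
#include <sys/stat.h>

static int same_file(const char *a, const char *b)
{
	struct stat sa, sb;

	if (stat(a, &sa) != 0 || stat(b, &sb) != 0)
		return 0;	/* can't tell; treat as different */
	return sa.st_dev == sb.st_dev && sa.st_ino == sb.st_ino;
}

int main(int argc, char **argv)
{
	if (argc == 3 && same_file(argv[1], argv[2]))
		printf("%s and %s are hardlinks of the same inode; "
		       "rename would be a no-op\n", argv[1], argv[2]);
	return 0;
}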
@@ -1505,101 +1636,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 }
 
 static int
-cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
-		   char *full_path, __u32 dosattr)
-{
-	int rc;
-	int oplock = 0;
-	__u16 netfid;
-	__u32 netpid;
-	bool set_time = false;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
-	FILE_BASIC_INFO	info_buf;
-
-	if (attrs->ia_valid & ATTR_ATIME) {
-		set_time = true;
-		info_buf.LastAccessTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
-	} else
-		info_buf.LastAccessTime = 0;
-
-	if (attrs->ia_valid & ATTR_MTIME) {
-		set_time = true;
-		info_buf.LastWriteTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
-	} else
-		info_buf.LastWriteTime = 0;
-
-	/*
-	 * Samba throws this field away, but windows may actually use it.
-	 * Do not set ctime unless other time stamps are changed explicitly
-	 * (i.e. by utimes()) since we would then have a mix of client and
-	 * server times.
-	 */
-	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
-		info_buf.ChangeTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
-	} else
-		info_buf.ChangeTime = 0;
-
-	info_buf.CreationTime = 0;	/* don't change */
-	info_buf.Attributes = cpu_to_le32(dosattr);
-
-	/*
-	 * If the file is already open for write, just use that fileid
-	 */
-	open_file = find_writable_file(cifsInode);
-	if (open_file) {
-		netfid = open_file->netfid;
-		netpid = open_file->pid;
-		goto set_via_filehandle;
-	}
-
-	/*
-	 * NT4 apparently returns success on this call, but it doesn't
-	 * really work.
-	 */
-	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
-				     &info_buf, cifs_sb->local_nls,
-				     cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != -EOPNOTSUPP && rc != -EINVAL)
-			goto out;
-	}
-
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
-	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
-			 CREATE_NOT_DIR, &netfid, &oplock,
-			 NULL, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-	if (rc != 0) {
-		if (rc == -EIO)
-			rc = -EINVAL;
-		goto out;
-	}
-
-	netpid = current->tgid;
-
-set_via_filehandle:
-	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
-	if (open_file == NULL)
-		CIFSSMBClose(xid, pTcon, netfid);
-	else
-		atomic_dec(&open_file->wrtPending);
-out:
-	return rc;
-}
-
-static int
 cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 {
 	int rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe3157..88786ba02d27 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
 	   but it may be more efficient to always alloc same size
 	   albeit slightly larger than necessary and maxbuffersize
 	   defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
 	   but it may be more efficient to always alloc same size
 	   albeit slightly larger than necessary and maxbuffersize
 	   defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
 	buffer->Pid = cpu_to_le16((__u16)current->tgid);
 	buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
-	spin_lock(&GlobalMid_Lock);
-	spin_unlock(&GlobalMid_Lock);
 	if (treeCon) {
 		buffer->Tid = treeCon->tid;
 		if (treeCon->ses) {
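
The mempool_alloc fixes here (and the identical one in transport.c below) are more than cosmetic: GFP_NOFS is GFP_KERNEL with __GFP_FS removed, so OR-ing the two puts __GFP_FS back and silently re-enables filesystem reclaim from inside the filesystem. A self-contained demo of the flag arithmetic (bit values mirror the 2.6.27-era gfp.h and are shown for illustration):

#include <assert.h>

#define __GFP_WAIT	0x10u
#define __GFP_IO	0x40u
#define __GFP_FS	0x80u

#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)

int main(void)
{
	/* the OR re-adds __GFP_FS, defeating the NOFS restriction */
	assert((GFP_KERNEL | GFP_NOFS) == GFP_KERNEL);
	assert(((GFP_KERNEL | GFP_NOFS) & __GFP_FS) != 0);
	/* GFP_NOFS alone is what the callers actually wanted */
	assert((GFP_NOFS & __GFP_FS) == 0);
	return 0;
}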
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5f40ed3473f5..765adf12d54f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
 
 }
 
+static int cifs_save_resume_key(const char *current_entry,
+	struct cifsFileInfo *cifsFile)
+{
+	int rc = 0;
+	unsigned int len = 0;
+	__u16 level;
+	char *filename;
+
+	if ((cifsFile == NULL) || (current_entry == NULL))
+		return -EINVAL;
+
+	level = cifsFile->srch_inf.info_level;
+
+	if (level == SMB_FIND_FILE_UNIX) {
+		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+
+		filename = &pFindData->FileName[0];
+		if (cifsFile->srch_inf.unicode) {
+			len = cifs_unicode_bytelen(filename);
+		} else {
+			/* BB should we make this strnlen of PATH_MAX? */
+			len = strnlen(filename, PATH_MAX);
+		}
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
+		FILE_DIRECTORY_INFO *pFindData =
+			(FILE_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
+		FILE_FULL_DIRECTORY_INFO *pFindData =
+			(FILE_FULL_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
+		SEARCH_ID_FULL_DIR_INFO *pFindData =
+			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
+		FILE_BOTH_DIRECTORY_INFO *pFindData =
+			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO *pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else {
+		cFYI(1, ("Unknown findfirst level %d", level));
+		return -EINVAL;
+	}
+	cifsFile->srch_inf.resume_name_len = len;
+	cifsFile->srch_inf.presume_name = filename;
+	return rc;
+}
+
 /* find the corresponding entry in the search */
 /* Note that the SMB server returns search entries for . and .. which
    complicates logic here if we choose to parse for them and we do not
@@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	       (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
 		cFYI(1, ("calling findnext2"));
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
 		if (rc)
@@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	return rc;
 }
 
-static int cifs_save_resume_key(const char *current_entry,
-	struct cifsFileInfo *cifsFile)
-{
-	int rc = 0;
-	unsigned int len = 0;
-	__u16 level;
-	char *filename;
-
-	if ((cifsFile == NULL) || (current_entry == NULL))
-		return -EINVAL;
-
-	level = cifsFile->srch_inf.info_level;
-
-	if (level == SMB_FIND_FILE_UNIX) {
-		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
-
-		filename = &pFindData->FileName[0];
-		if (cifsFile->srch_inf.unicode) {
-			len = cifs_unicode_bytelen(filename);
-		} else {
-			/* BB should we make this strnlen of PATH_MAX? */
-			len = strnlen(filename, PATH_MAX);
-		}
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
-		FILE_DIRECTORY_INFO *pFindData =
-			(FILE_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
-		FILE_FULL_DIRECTORY_INFO *pFindData =
-			(FILE_FULL_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
-		SEARCH_ID_FULL_DIR_INFO *pFindData =
-			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
-		FILE_BOTH_DIRECTORY_INFO *pFindData =
-			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
-		FIND_FILE_STANDARD_INFO *pFindData =
-			(FIND_FILE_STANDARD_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		/* one byte length, no name conversion */
-		len = (unsigned int)pFindData->FileNameLength;
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
-		return -EINVAL;
-	}
-	cifsFile->srch_inf.resume_name_len = len;
-	cifsFile->srch_inf.presume_name = filename;
-	return rc;
-}
 
 int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed150efbe27c..2851d5da0c8c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
 
+		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+
 		/* no capabilities flags in old lanman negotiation */
 
 		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
@@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos) {
+	} else if (type == Kerberos || type == MSKerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		}
 
 		msg = spnego_key->payload.data;
+		/* check version field to make sure that cifs.upcall is
+		   sending us a response in an expected form */
+		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+			cERROR(1, ("incorrect version of cifs.upcall (expected"
+				   " %d but got %d)",
+				   CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+			rc = -EKEYREJECTED;
+			goto ssetup_exit;
+		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
@@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 					ses, nls_cp);
 
 ssetup_exit:
-	if (spnego_key)
+	if (spnego_key) {
+		key_revoke(spnego_key);
 		key_put(spnego_key);
+	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
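
The added version check is the standard guard for a fixed-layout message crossing the kernel/userspace boundary: reject anything whose version field is unexpected before trusting the rest of the struct. A self-contained sketch (the struct layout and EXPECTED_VERSION are illustrative, not the real cifs_spnego_msg):

#include <stdio.h>

#define EXPECTED_VERSION 2
#define EKEYREJECTED	 129	/* Linux errno value, shown for illustration */

struct upcall_msg {
	unsigned int version;
	unsigned int sesskey_len;
	unsigned char data[64];
};

static int check_upcall(const struct upcall_msg *msg)
{
	if (msg->version != EXPECTED_VERSION) {
		fprintf(stderr,
			"incorrect upcall version (expected %u, got %u)\n",
			(unsigned)EXPECTED_VERSION, msg->version);
		return -EKEYREJECTED;
	}
	return 0;
}

int main(void)
{
	struct upcall_msg stale = { .version = 1 };
	return check_upcall(&stale) ? 0 : 1;	/* expect rejection */
}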
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e286db9f5ee2..bf0e6d8e382a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 		return NULL;
 	}
 
-	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    GFP_KERNEL | GFP_NOFS);
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
diff --git a/fs/compat.c b/fs/compat.c
index c9d1472e65c5..075d0509970d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
 	if (buf->result)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
@@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	if (reclen > buf->count)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
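
The test in both hunks detects inode numbers that do not survive narrowing into the compat dirent's smaller d_ino; recording -EOVERFLOW in the buffer lets the syscall report the error instead of silently dropping the entry. The idiom itself, in plain C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t ino = 0x100000001ULL;	/* needs more than 32 bits */
	uint32_t d_ino = (uint32_t)ino;	/* what a 32-bit dirent stores */

	/* truncation happened, so the copy no longer compares equal */
	assert(sizeof(d_ino) < sizeof(ino) && d_ino != ino);
	return 0;
}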
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7a8db78a91d2..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	 * Ensure that no racing symlink() will make detach_prep() fail while
 	 * the new link is temporarily attached
 	 */
-	mutex_lock(&configfs_symlink_mutex);
-	spin_lock(&configfs_dirent_lock);
 	do {
 		struct mutex *wait_mutex;
 
+		mutex_lock(&configfs_symlink_mutex);
+		spin_lock(&configfs_dirent_lock);
 		ret = configfs_detach_prep(dentry, &wait_mutex);
-		if (ret) {
+		if (ret)
 			configfs_detach_rollback(dentry);
 		spin_unlock(&configfs_dirent_lock);
 		mutex_unlock(&configfs_symlink_mutex);
+
+		if (ret) {
 			if (ret != -EAGAIN) {
 				config_item_put(parent_item);
 				return ret;
@@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			/* Wait until the racing operation terminates */
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
-
-			mutex_lock(&configfs_symlink_mutex);
-			spin_lock(&configfs_dirent_lock);
 		}
 	} while (ret == -EAGAIN);
-	spin_unlock(&configfs_dirent_lock);
-	mutex_unlock(&configfs_symlink_mutex);
 
 	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
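
The configfs change moves lock acquisition inside the retry loop so that both locks are provably dropped before the task sleeps on wait_mutex. The shape of the corrected pattern, as a self-contained pthread sketch (try_detach is a stub that pretends one racing operation is in flight; build with -lpthread):

#include <pthread.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t other_op   = PTHREAD_MUTEX_INITIALIZER;
static int attempts;

/* stub: "fail" once, handing back the mutex of the racing operation */
static int try_detach(pthread_mutex_t **wait_on)
{
	if (attempts++ < 1) {
		*wait_on = &other_op;
		return -1;	/* stands in for -EAGAIN */
	}
	return 0;
}

static int detach_with_retry(void)
{
	pthread_mutex_t *wait_on;
	int ret;

	do {
		/* take the locks fresh on every pass ... */
		pthread_mutex_lock(&state_lock);
		ret = try_detach(&wait_on);
		/* ... and always drop them before sleeping */
		pthread_mutex_unlock(&state_lock);

		if (ret == -1) {
			/* wait for the racing operation to terminate */
			pthread_mutex_lock(wait_on);
			pthread_mutex_unlock(wait_on);
		}
	} while (ret == -1);
	return ret;
}

int main(void) { return detach_with_retry(); }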
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c3b618c15b3..f40423eb1a14 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);
 static int cramfs_iget5_test(struct inode *inode, void *opaque)
 {
 	struct cramfs_inode *cramfs_inode = opaque;
-
-	if (inode->i_ino != CRAMINO(cramfs_inode))
-		return 0; /* does not match */
-
-	if (inode->i_ino != 1)
-		return 1;
-
-	/* all empty directories, char, block, pipe, and sock, share inode #1 */
-
-	if ((inode->i_mode != cramfs_inode->mode) ||
-	    (inode->i_gid != cramfs_inode->gid) ||
-	    (inode->i_uid != cramfs_inode->uid))
-		return 0; /* does not match */
-
-	if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
-	    (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
-		return 0; /* does not match */
-
-	return 1; /* matches */
+	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
 }
 
 static int cramfs_iget5_set(struct inode *inode, void *opaque)
 {
-	static struct timespec zerotime;
 	struct cramfs_inode *cramfs_inode = opaque;
-	inode->i_mode = cramfs_inode->mode;
-	inode->i_uid = cramfs_inode->uid;
-	inode->i_size = cramfs_inode->size;
-	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-	inode->i_gid = cramfs_inode->gid;
-	/* Struct copy intentional */
-	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
 	inode->i_ino = CRAMINO(cramfs_inode);
-	/* inode->i_nlink is left 1 - arguably wrong for directories,
-	   but it's the best we can do without reading the directory
-	   contents.  1 yields the right result in GNU find, even
-	   without -noleaf option. */
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_fop = &generic_ro_fops;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &cramfs_dir_inode_operations;
-		inode->i_fop = &cramfs_directory_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &page_symlink_inode_operations;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else {
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		init_special_inode(inode, inode->i_mode,
-			old_decode_dev(cramfs_inode->size));
-	}
 	return 0;
 }
 
@@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
 					    cramfs_iget5_test, cramfs_iget5_set,
 					    cramfs_inode);
+	static struct timespec zerotime;
+
 	if (inode && (inode->i_state & I_NEW)) {
+		inode->i_mode = cramfs_inode->mode;
+		inode->i_uid = cramfs_inode->uid;
+		inode->i_size = cramfs_inode->size;
+		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+		inode->i_gid = cramfs_inode->gid;
+		/* Struct copy intentional */
+		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+		/* inode->i_nlink is left 1 - arguably wrong for directories,
+		   but it's the best we can do without reading the directory
+		   contents.  1 yields the right result in GNU find, even
+		   without -noleaf option. */
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_fop = &generic_ro_fops;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_op = &cramfs_dir_inode_operations;
+			inode->i_fop = &cramfs_directory_operations;
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else {
+			inode->i_size = 0;
+			inode->i_blocks = 0;
+			init_special_inode(inode, inode->i_mode,
+				old_decode_dev(cramfs_inode->size));
+		}
 		unlock_new_inode(inode);
 	}
 	return inode;
 }
 
+static void cramfs_drop_inode(struct inode *inode)
+{
+	if (inode->i_ino == 1)
+		generic_delete_inode(inode);
+	else
+		generic_drop_inode(inode);
+}
+
 /*
  * We have our own block cache: don't fill up the buffer cache
  * with the rom-image, because the way the filesystem is set
@@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {
 	.put_super	= cramfs_put_super,
 	.remount_fs	= cramfs_remount,
 	.statfs		= cramfs_statfs,
+	.drop_inode	= cramfs_drop_inode,
 };
 
 static int cramfs_get_sb(struct file_system_type *fs_type,
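
In cramfs every empty directory and special file shares inode number 1, so an icache hit on ino 1 could hand one file's mode/uid/gid to another. The rewritten iget5 test therefore never matches ino 1, and the new drop_inode op deletes such inodes on last put instead of caching them. The predicate, as a self-contained check:

#include <assert.h>

static int cramfs_matches(unsigned long cached_ino, unsigned long wanted_ino)
{
	/* match on inode number, except the shared ino 1, which never matches
	   (forcing a fresh inode to be set up on every lookup) */
	return cached_ino == wanted_ino && cached_ino != 1;
}

int main(void)
{
	assert(cramfs_matches(42, 42));	/* ordinary inode: cache hit is fine */
	assert(!cramfs_matches(1, 1));	/* shared ino 1: never reuse */
	return 0;
}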
diff --git a/fs/dcache.c b/fs/dcache.c
index 101663d15e9f..e7a1a99b7464 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
  * If no entry exists with the exact case name, allocate new dentry with
  * the exact case, and return the spliced entry.
  */
-struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry,
+struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 			struct qstr *name)
 {
 	int error;
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		if (dentry->d_parent != parent)
 			goto next;
 
+		/* non-existing due to RCU? */
+		if (d_unhashed(dentry))
+			goto next;
+
 		/*
 		 * It is safe to compare names since d_move() cannot
 		 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 				goto next;
 		}
 
-		if (!d_unhashed(dentry)) {
-			atomic_inc(&dentry->d_count);
-			found = dentry;
-		}
+		atomic_inc(&dentry->d_count);
+		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
 next:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 08e28c9bb416..3dbe2169cf36 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,8 +26,7 @@
 #include <linux/debugfs.h>
 #include <linux/fsnotify.h>
 #include <linux/string.h>
-
-#define DEBUGFS_MAGIC	0x64626720
+#include <linux/magic.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 488eb424f662..4a714f6c1bed 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,6 +27,7 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30#define PTMX_MINOR 2
30 31
31extern int pty_limit; /* Config limit on Unix98 ptys */ 32extern int pty_limit; /* Config limit on Unix98 ptys */
 static DEFINE_IDA(allocated_ptys);
@@ -48,7 +49,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%o"},
@@ -169,15 +170,7 @@ static struct file_system_type devpts_fs_type = {
  * to the System V naming convention
  */
 
-static struct dentry *get_node(int num)
-{
-	char s[12];
-	struct dentry *root = devpts_root;
-	mutex_lock(&root->d_inode->i_mutex);
-	return lookup_one_len(s, root, sprintf(s, "%d", num));
-}
-
-int devpts_new_index(void)
+int devpts_new_index(struct inode *ptmx_inode)
 {
 	int index;
 	int ida_ret;
@@ -205,20 +198,21 @@ retry:
 	return index;
 }
 
-void devpts_kill_index(int idx)
+void devpts_kill_index(struct inode *ptmx_inode, int idx)
 {
 	mutex_lock(&allocated_ptys_lock);
 	ida_remove(&allocated_ptys, idx);
 	mutex_unlock(&allocated_ptys_lock);
 }
 
-int devpts_pty_new(struct tty_struct *tty)
+int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 {
 	int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
 	struct tty_driver *driver = tty->driver;
 	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
 	struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
 	BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
@@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
 	inode->i_private = tty;
+	tty->driver_data = inode;
 
-	dentry = get_node(number);
-	if (!IS_ERR(dentry) && !dentry->d_inode) {
-		d_instantiate(dentry, inode);
+	sprintf(s, "%d", number);
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_alloc_name(devpts_root, s);
+	if (!IS_ERR(dentry)) {
+		d_add(dentry, inode);
 		fsnotify_create(devpts_root->d_inode, dentry);
 	}
 
@@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty)
 	return 0;
 }
 
-struct tty_struct *devpts_get_tty(int number)
+struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 {
-	struct dentry *dentry = get_node(number);
-	struct tty_struct *tty;
-
-	tty = NULL;
-	if (!IS_ERR(dentry)) {
-		if (dentry->d_inode)
-			tty = dentry->d_inode->i_private;
-		dput(dentry);
-	}
+	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-	mutex_unlock(&devpts_root->d_inode->i_mutex);
-
-	return tty;
+	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return (struct tty_struct *)pts_inode->i_private;
+	return NULL;
 }
 
-void devpts_pty_kill(int number)
+void devpts_pty_kill(struct tty_struct *tty)
 {
-	struct dentry *dentry = get_node(number);
+	struct inode *inode = tty->driver_data;
+	struct dentry *dentry;
 
-	if (!IS_ERR(dentry)) {
-		struct inode *inode = dentry->d_inode;
-		if (inode) {
-			inode->i_nlink--;
-			d_delete(dentry);
-			dput(dentry);
-		}
+	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_find_alias(inode);
+	if (dentry && !IS_ERR(dentry)) {
+		inode->i_nlink--;
+		d_delete(dentry);
 		dput(dentry);
 	}
+
 	mutex_unlock(&devpts_root->d_inode->i_mutex);
 }
 
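The devpts change above drops the per-call directory lookup: the pts inode keeps the tty in i_private and the tty keeps the inode in driver_data, so lookup and teardown become pointer dereferences. A minimal userspace sketch of that back-pointer pattern follows; the toy_inode/toy_tty types are stand-ins invented for illustration, not the kernel structures.

#include <stdio.h>

/* Toy stand-ins for the kernel's inode and tty_struct. */
struct toy_inode {
	void *i_private;        /* points at the owning tty */
};

struct toy_tty {
	int index;
	void *driver_data;      /* points back at the inode */
};

static void pty_new(struct toy_inode *inode, struct toy_tty *tty)
{
	inode->i_private = tty;   /* inode -> tty */
	tty->driver_data = inode; /* tty -> inode */
}

static struct toy_tty *pty_get_tty(struct toy_inode *inode)
{
	return inode->i_private;  /* no directory walk needed */
}

int main(void)
{
	struct toy_inode inode = { 0 };
	struct toy_tty tty = { .index = 3 };

	pty_new(&inode, &tty);
	printf("tty index via inode: %d\n", pty_get_tty(&inode)->index);
	return 0;
}
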
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c4e7d721bd8d..fd9859f92fad 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <net/sock.h>
 
 #include "config.h"
@@ -30,16 +33,16 @@
 
 static struct config_group *space_list;
 static struct config_group *comm_list;
-static struct comm *local_comm;
+static struct dlm_comm *local_comm;
 
-struct clusters;
-struct cluster;
-struct spaces;
-struct space;
-struct comms;
-struct comm;
-struct nodes;
-struct node;
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
 
 static struct config_group *make_cluster(struct config_group *, const char *);
 static void drop_cluster(struct config_group *, struct config_item *);
@@ -68,17 +71,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len);
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_local_read(struct comm *cm, char *buf);
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t node_nodeid_read(struct node *nd, char *buf);
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
-static ssize_t node_weight_read(struct node *nd, char *buf);
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
-
-struct cluster {
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
				 size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
				size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
			       size_t len);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
				 size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
				 size_t len);
+
+struct dlm_cluster {
 	struct config_group group;
 	unsigned int cl_tcp_port;
 	unsigned int cl_buffer_size;
@@ -109,11 +117,11 @@ enum {
 
 struct cluster_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct cluster *, char *);
-	ssize_t (*store)(struct cluster *, const char *, size_t);
+	ssize_t (*show)(struct dlm_cluster *, char *);
+	ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
 };
 
-static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
			   int *info_field, int check_zero,
			   const char *buf, size_t len)
 {
@@ -134,12 +142,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 }
 
 #define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
 { \
	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
			   check_zero, buf, len); \
 } \
-static ssize_t name##_read(struct cluster *cl, char *buf) \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
 { \
	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
 } \
@@ -181,8 +189,8 @@ enum {
 
 struct comm_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct comm *, char *);
-	ssize_t (*store)(struct comm *, const char *, size_t);
+	ssize_t (*show)(struct dlm_comm *, char *);
+	ssize_t (*store)(struct dlm_comm *, const char *, size_t);
 };
 
 static struct comm_attribute comm_attr_nodeid = {
@@ -222,8 +230,8 @@ enum {
 
 struct node_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct node *, char *);
-	ssize_t (*store)(struct node *, const char *, size_t);
+	ssize_t (*show)(struct dlm_node *, char *);
+	ssize_t (*store)(struct dlm_node *, const char *, size_t);
 };
 
 static struct node_attribute node_attr_nodeid = {
@@ -248,26 +256,26 @@ static struct configfs_attribute *node_attrs[] = {
 	NULL,
 };
 
-struct clusters {
+struct dlm_clusters {
 	struct configfs_subsystem subsys;
 };
 
-struct spaces {
+struct dlm_spaces {
 	struct config_group ss_group;
 };
 
-struct space {
+struct dlm_space {
 	struct config_group group;
 	struct list_head members;
 	struct mutex members_lock;
 	int members_count;
 };
 
-struct comms {
+struct dlm_comms {
 	struct config_group cs_group;
 };
 
-struct comm {
+struct dlm_comm {
 	struct config_item item;
 	int nodeid;
 	int local;
@@ -275,11 +283,11 @@ struct comm {
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
-struct nodes {
+struct dlm_nodes {
 	struct config_group ns_group;
 };
 
-struct node {
+struct dlm_node {
 	struct config_item item;
 	struct list_head list; /* space->members */
 	int nodeid;
@@ -372,38 +380,40 @@ static struct config_item_type node_type = {
 	.ct_owner = THIS_MODULE,
 };
 
-static struct cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
		   NULL;
 }
 
-static struct space *to_space(struct config_item *i)
+static struct dlm_space *config_item_to_space(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+	return i ? container_of(to_config_group(i), struct dlm_space, group) :
		   NULL;
 }
 
-static struct comm *to_comm(struct config_item *i)
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
 {
-	return i ? container_of(i, struct comm, item) : NULL;
+	return i ? container_of(i, struct dlm_comm, item) : NULL;
 }
 
-static struct node *to_node(struct config_item *i)
+static struct dlm_node *config_item_to_node(struct config_item *i)
 {
-	return i ? container_of(i, struct node, item) : NULL;
+	return i ? container_of(i, struct dlm_node, item) : NULL;
 }
 
 static struct config_group *make_cluster(struct config_group *g,
					 const char *name)
 {
-	struct cluster *cl = NULL;
-	struct spaces *sps = NULL;
-	struct comms *cms = NULL;
+	struct dlm_cluster *cl = NULL;
+	struct dlm_spaces *sps = NULL;
+	struct dlm_comms *cms = NULL;
 	void *gps = NULL;
 
-	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
 	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
-	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
 
 	if (!cl || !gps || !sps || !cms)
		goto fail;
@@ -443,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g,
 
 static void drop_cluster(struct config_group *g, struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct config_item *tmp;
 	int j;
 
@@ -461,20 +471,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
 
 static void release_cluster(struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	kfree(cl->group.default_groups);
 	kfree(cl);
 }
 
 static struct config_group *make_space(struct config_group *g, const char *name)
 {
-	struct space *sp = NULL;
-	struct nodes *nds = NULL;
+	struct dlm_space *sp = NULL;
+	struct dlm_nodes *nds = NULL;
 	void *gps = NULL;
 
-	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
 	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
 
 	if (!sp || !gps || !nds)
		goto fail;
@@ -500,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 
 static void drop_space(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	struct config_item *tmp;
 	int j;
 
@@ -517,16 +527,16 @@ static void drop_space(struct config_group *g, struct config_item *i)
 
 static void release_space(struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	kfree(sp->group.default_groups);
 	kfree(sp);
 }
 
 static struct config_item *make_comm(struct config_group *g, const char *name)
 {
-	struct comm *cm;
+	struct dlm_comm *cm;
 
-	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
 	if (!cm)
		return ERR_PTR(-ENOMEM);
 
@@ -539,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 
 static void drop_comm(struct config_group *g, struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	if (local_comm == cm)
		local_comm = NULL;
	dlm_lowcomms_close(cm->nodeid);
@@ -550,16 +560,16 @@ static void drop_comm(struct config_group *g, struct config_item *i)
 
 static void release_comm(struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	kfree(cm);
 }
 
 static struct config_item *make_node(struct config_group *g, const char *name)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd;
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd;
 
-	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
 	if (!nd)
		return ERR_PTR(-ENOMEM);
 
@@ -578,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 
 static void drop_node(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd = to_node(i);
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd = config_item_to_node(i);
 
	mutex_lock(&sp->members_lock);
	list_del(&nd->list);
@@ -591,11 +601,11 @@ static void drop_node(struct config_group *g, struct config_item *i)
 
 static void release_node(struct config_item *i)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	kfree(nd);
 }
 
-static struct clusters clusters_root = {
+static struct dlm_clusters clusters_root = {
	.subsys = {
		.su_group = {
			.cg_item = {
@@ -625,7 +635,7 @@ void dlm_config_exit(void)
 static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
			    char *buf)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
	struct cluster_attribute *cla =
		container_of(a, struct cluster_attribute, attr);
	return cla->show ? cla->show(cl, buf) : 0;
@@ -635,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
			     struct configfs_attribute *a,
			     const char *buf, size_t len)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
	struct cluster_attribute *cla =
		container_of(a, struct cluster_attribute, attr);
	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -644,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
	struct comm_attribute *cma =
		container_of(a, struct comm_attribute, attr);
	return cma->show ? cma->show(cm, buf) : 0;
@@ -653,29 +663,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
	struct comm_attribute *cma =
		container_of(a, struct comm_attribute, attr);
	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
 }
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
 {
	return sprintf(buf, "%d\n", cm->nodeid);
 }
 
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				 size_t len)
 {
	cm->nodeid = simple_strtol(buf, NULL, 0);
	return len;
 }
 
-static ssize_t comm_local_read(struct comm *cm, char *buf)
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
 {
	return sprintf(buf, "%d\n", cm->local);
 }
 
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len)
 {
	cm->local= simple_strtol(buf, NULL, 0);
	if (cm->local && !local_comm)
@@ -683,7 +695,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
	return len;
 }
 
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 {
	struct sockaddr_storage *addr;
 
@@ -705,7 +717,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
 static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	struct node_attribute *nda =
		container_of(a, struct node_attribute, attr);
	return nda->show ? nda->show(nd, buf) : 0;
@@ -714,29 +726,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	struct node_attribute *nda =
		container_of(a, struct node_attribute, attr);
	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
 }
 
-static ssize_t node_nodeid_read(struct node *nd, char *buf)
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
 {
	return sprintf(buf, "%d\n", nd->nodeid);
 }
 
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
 {
	nd->nodeid = simple_strtol(buf, NULL, 0);
	return len;
 }
 
-static ssize_t node_weight_read(struct node *nd, char *buf)
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
 {
	return sprintf(buf, "%d\n", nd->weight);
 }
 
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
				 size_t len)
 {
	nd->weight = simple_strtol(buf, NULL, 0);
	return len;
@@ -746,7 +760,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
  * Functions for the dlm to get the info that's been configured
  */
 
-static struct space *get_space(char *name)
+static struct dlm_space *get_space(char *name)
 {
	struct config_item *i;
 
@@ -757,18 +771,45 @@ static struct space *get_space(char *name)
	i = config_group_find_item(space_list, name);
	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
-	return to_space(i);
+	return config_item_to_space(i);
 }
 
-static void put_space(struct space *sp)
+static void put_space(struct dlm_space *sp)
 {
	config_item_put(&sp->group.cg_item);
 }
 
-static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+	switch (x->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+		struct sockaddr_in *siny = (struct sockaddr_in *)y;
+		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+			return 0;
+		if (sinx->sin_port != siny->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+			return 0;
+		if (sinx->sin6_port != siny->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 {
	struct config_item *i;
-	struct comm *cm = NULL;
+	struct dlm_comm *cm = NULL;
	int found = 0;
 
	if (!comm_list)
@@ -777,7 +818,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
	mutex_lock(&clusters_root.subsys.su_mutex);
 
	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
-		cm = to_comm(i);
+		cm = config_item_to_comm(i);
 
		if (nodeid) {
			if (cm->nodeid != nodeid)
@@ -786,8 +827,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
			config_item_get(i);
			break;
		} else {
-			if (!cm->addr_count ||
-			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+			if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
				continue;
			found = 1;
			config_item_get(i);
@@ -801,7 +841,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
	return cm;
 }
 
-static void put_comm(struct comm *cm)
+static void put_comm(struct dlm_comm *cm)
 {
	config_item_put(&cm->item);
 }
@@ -810,8 +850,8 @@ static void put_comm(struct comm *cm)
 int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
		    int **new_out, int *new_count_out)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
	int i = 0, rv = 0, ids_count = 0, new_count = 0;
	int *ids, *new;
 
@@ -874,8 +914,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 
 int dlm_node_weight(char *lsname, int nodeid)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
	int w = -EEXIST;
 
	sp = get_space(lsname);
@@ -897,7 +937,7 @@ int dlm_node_weight(char *lsname, int nodeid)
 
 int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 {
-	struct comm *cm = get_comm(nodeid, NULL);
+	struct dlm_comm *cm = get_comm(nodeid, NULL);
	if (!cm)
		return -EEXIST;
	if (!cm->addr_count)
@@ -909,7 +949,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 
 int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
 {
-	struct comm *cm = get_comm(0, addr);
+	struct dlm_comm *cm = get_comm(0, addr);
	if (!cm)
		return -EEXIST;
	*nodeid = cm->nodeid;
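The addr_compare() helper added above replaces a memcmp() over whole sockaddr_storage values, which can report a mismatch for identical endpoints because the trailing bytes of the storage are unspecified padding. A runnable userspace demonstration of the failure mode, in the spirit of addr_compare() but not the kernel code (same_endpoint is an invented name; the IPv6 arm is omitted for brevity):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>

/* Field-wise comparison: only family, address and port matter. */
static int same_endpoint(const struct sockaddr_storage *x,
			 const struct sockaddr_storage *y)
{
	if (x->ss_family != y->ss_family)
		return 0;
	if (x->ss_family == AF_INET) {
		const struct sockaddr_in *a = (const struct sockaddr_in *)x;
		const struct sockaddr_in *b = (const struct sockaddr_in *)y;
		return a->sin_addr.s_addr == b->sin_addr.s_addr &&
		       a->sin_port == b->sin_port;
	}
	return 0; /* AF_INET6 would use IN6_ARE_ADDR_EQUAL analogously */
}

int main(void)
{
	struct sockaddr_storage x, y;
	struct sockaddr_in *a = (struct sockaddr_in *)&x;
	struct sockaddr_in *b = (struct sockaddr_in *)&y;

	memset(&x, 0x00, sizeof(x));
	memset(&y, 0xff, sizeof(y));    /* different garbage in the padding */

	a->sin_family = b->sin_family = AF_INET;
	a->sin_port   = b->sin_port   = htons(21064);
	a->sin_addr.s_addr = b->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	printf("memcmp says equal:     %d\n", memcmp(&x, &y, sizeof(x)) == 0);
	printf("field-wise says equal: %d\n", same_endpoint(&x, &y));
	return 0;
}
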
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629c..868e4c9ef127 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
	uint32_t		ls_global_id;	/* global unique lockspace ID */
	uint32_t		ls_exflags;
	int			ls_lvblen;
-	int			ls_count;	/* reference count */
+	int			ls_count;	/* refcount of processes in
						   the dlm using this ls */
+	int			ls_create_count; /* create/release refcount */
	unsigned long		ls_flags;	/* LSFL_ */
+	unsigned long		ls_scan_time;
	struct kobject		ls_kobj;
 
	struct dlm_rsbtable	*ls_rsbtbl;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e96..d910501de6d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
 #include "lock.h"
 #include "recover.h"
 #include "requestqueue.h"
+#include "user.h"
 
 static int ls_count;
 static struct mutex ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
	kset_unregister(dlm_kset);
 }
 
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (time_after_eq(jiffies, ls->ls_scan_time +
					   dlm_config.ci_scan_secs * HZ)) {
+			spin_unlock(&lslist_lock);
+			return ls;
+		}
+	}
+	spin_unlock(&lslist_lock);
+	return NULL;
+}
+
 static int dlm_scand(void *data)
 {
	struct dlm_ls *ls;
+	int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
	while (!kthread_should_stop()) {
-		list_for_each_entry(ls, &lslist, ls_list) {
+		ls = find_ls_to_scan();
+		if (ls) {
			if (dlm_lock_recovery_try(ls)) {
+				ls->ls_scan_time = jiffies;
				dlm_scan_rsbs(ls);
				dlm_scan_timeout(ls);
				dlm_unlock_recovery(ls);
+			} else {
+				ls->ls_scan_time += HZ;
			}
+		} else {
+			schedule_timeout_interruptible(timeout_jiffies);
		}
-		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
	}
	return 0;
 }
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
	kthread_stop(scand_task);
 }
 
-static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
-{
-	struct dlm_ls *ls;
-
-	spin_lock(&lslist_lock);
-
-	list_for_each_entry(ls, &lslist, ls_list) {
-		if (ls->ls_namelen == namelen &&
-		    memcmp(ls->ls_name, name, namelen) == 0)
-			goto out;
-	}
-	ls = NULL;
- out:
-	spin_unlock(&lslist_lock);
-	return ls;
-}
-
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
 {
	struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
	for (;;) {
		spin_lock(&lslist_lock);
		if (ls->ls_count == 0) {
+			WARN_ON(ls->ls_create_count != 0);
			list_del(&ls->ls_list);
			spin_unlock(&lslist_lock);
			return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
			 uint32_t flags, int lvblen)
 {
	struct dlm_ls *ls;
-	int i, size, error = -ENOMEM;
+	int i, size, error;
	int do_unreg = 0;
 
	if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	if (!try_module_get(THIS_MODULE))
		return -EINVAL;
 
-	ls = dlm_find_lockspace_name(name, namelen);
-	if (ls) {
-		*lockspace = ls;
+	if (!dlm_user_daemon_available()) {
+		module_put(THIS_MODULE);
+		return -EUNATCH;
+	}
+
+	error = 0;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		WARN_ON(ls->ls_create_count <= 0);
+		if (ls->ls_namelen != namelen)
+			continue;
+		if (memcmp(ls->ls_name, name, namelen))
+			continue;
+		if (flags & DLM_LSFL_NEWEXCL) {
+			error = -EEXIST;
+			break;
+		}
+		ls->ls_create_count++;
		module_put(THIS_MODULE);
-		return -EEXIST;
+		error = 1; /* not an error, return 0 */
+		break;
	}
+	spin_unlock(&lslist_lock);
+
+	if (error < 0)
+		goto out;
+	if (error)
+		goto ret_zero;
+
+	error = -ENOMEM;
 
	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
	if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	ls->ls_lvblen = lvblen;
	ls->ls_count = 0;
	ls->ls_flags = 0;
+	ls->ls_scan_time = jiffies;
 
	if (flags & DLM_LSFL_TIMEWARN)
		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	ls->ls_allocation = GFP_KERNEL;
 
	/* ls_exflags are forced to match among nodes, and we don't
-	   need to require all nodes to have TIMEWARN or FS set */
-	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
+	   need to require all nodes to have some flags set */
+	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+				    DLM_LSFL_NEWEXCL));
 
	size = dlm_config.ci_rsbtbl_size;
	ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	down_write(&ls->ls_in_recovery);
 
	spin_lock(&lslist_lock);
+	ls->ls_create_count = 1;
	list_add(&ls->ls_list, &lslist);
	spin_unlock(&lslist_lock);
 
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	dlm_create_debug_file(ls);
 
	log_debug(ls, "join complete");
-
+ ret_zero:
	*lockspace = ls;
	return 0;
 
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
	struct dlm_lkb *lkb;
	struct dlm_rsb *rsb;
	struct list_head *head;
-	int i;
-	int busy = lockspace_busy(ls);
+	int i, busy, rv;
+
+	busy = lockspace_busy(ls);
+
+	spin_lock(&lslist_lock);
+	if (ls->ls_create_count == 1) {
+		if (busy > force)
+			rv = -EBUSY;
+		else {
+			/* remove_lockspace takes ls off lslist */
+			ls->ls_create_count = 0;
+			rv = 0;
+		}
+	} else if (ls->ls_create_count > 1) {
+		rv = --ls->ls_create_count;
+	} else {
+		rv = -EINVAL;
+	}
+	spin_unlock(&lslist_lock);
 
-	if (busy > force)
-		return -EBUSY;
+	if (rv) {
+		log_debug(ls, "release_lockspace no remove %d", rv);
+		return rv;
+	}
+
+	dlm_device_deregister(ls);
 
-	if (force < 3)
+	if (force < 3 && dlm_user_daemon_available())
		do_uevent(ls, 0);
 
	dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
	dlm_clear_members(ls);
	dlm_clear_members_gone(ls);
	kfree(ls->ls_node_array);
+	log_debug(ls, "release_lockspace final free");
	kobject_put(&ls->ls_kobj);
	/* The ls structure will be freed when the kobject is done with */
 
-	mutex_lock(&ls_lock);
-	ls_count--;
-	if (!ls_count)
-		threads_stop();
-	mutex_unlock(&ls_lock);
-
	module_put(THIS_MODULE);
	return 0;
 }
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 int dlm_release_lockspace(void *lockspace, int force)
 {
	struct dlm_ls *ls;
+	int error;
 
	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;
	dlm_put_lockspace(ls);
-	return release_lockspace(ls, force);
+
+	mutex_lock(&ls_lock);
+	error = release_lockspace(ls, force);
+	if (!error)
+		ls_count--;
+	else if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	return error;
+}
+
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+
+ restart:
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+			continue;
+		spin_unlock(&lslist_lock);
+		log_error(ls, "no userland control daemon, stopping lockspace");
+		dlm_ls_stop(ls);
+		goto restart;
+	}
+	spin_unlock(&lslist_lock);
 }
 
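The lockspace change above lets the same named lockspace be "created" several times: a second create just bumps ls_create_count, and only the release that drops the count to zero actually tears the lockspace down. A small userspace sketch of that lookup-or-create counting pattern follows; the toy_ls type, list handling, and names are invented for illustration, and the kernel version additionally handles busy and force cases under lslist_lock.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>

struct toy_ls {
	char name[32];
	int create_count;
	struct toy_ls *next;
};

static struct toy_ls *lslist;
static pthread_mutex_t lslist_lock = PTHREAD_MUTEX_INITIALIZER;

static struct toy_ls *ls_create(const char *name)
{
	struct toy_ls *ls;

	pthread_mutex_lock(&lslist_lock);
	for (ls = lslist; ls; ls = ls->next) {
		if (!strcmp(ls->name, name)) {
			ls->create_count++;       /* existing: just count */
			pthread_mutex_unlock(&lslist_lock);
			return ls;
		}
	}
	ls = calloc(1, sizeof(*ls));
	snprintf(ls->name, sizeof(ls->name), "%s", name);
	ls->create_count = 1;
	ls->next = lslist;
	lslist = ls;
	pthread_mutex_unlock(&lslist_lock);
	return ls;
}

static int ls_release(struct toy_ls *ls)
{
	int remaining;

	pthread_mutex_lock(&lslist_lock);
	remaining = --ls->create_count;
	if (!remaining) {
		struct toy_ls **p;
		for (p = &lslist; *p; p = &(*p)->next)
			if (*p == ls) { *p = ls->next; break; }
		free(ls);                         /* final release frees */
	}
	pthread_mutex_unlock(&lslist_lock);
	return remaining;
}

int main(void)
{
	struct toy_ls *a = ls_create("clvmd");
	struct toy_ls *b = ls_create("clvmd");   /* same object, count = 2 */

	printf("same object: %d\n", a == b);
	printf("after first release, remaining: %d\n", ls_release(b));
	printf("after final release, remaining: %d\n", ls_release(a));
	return 0;
}
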
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd021..f879f87901f8 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
 struct dlm_ls *dlm_find_lockspace_local(void *id);
 struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
 
 #endif				/* __LOCKSPACE_DOT_H__ */
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 929e48ae7591..b3832c67194a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -15,7 +15,6 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
-#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
 
@@ -27,6 +26,8 @@
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
 
 #ifdef CONFIG_COMPAT
 
@@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
	return error;
 }
 
-static int create_misc_device(struct dlm_ls *ls, char *name)
+static int dlm_device_register(struct dlm_ls *ls, char *name)
 {
	int error, len;
 
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
	error = -ENOMEM;
	len = strlen(name) + strlen(name_prefix) + 2;
	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +369,22 @@ fail:
	return error;
 }
 
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	int error;
+
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	error = misc_deregister(&ls->ls_device);
+	if (!error)
+		kfree(ls->ls_device.name);
+	return error;
+}
+
 static int device_user_purge(struct dlm_user_proc *proc,
			     struct dlm_purge_params *params)
 {
@@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
	if (!ls)
		return -ENOENT;
 
-	error = create_misc_device(ls, params->name);
+	error = dlm_device_register(ls, params->name);
	dlm_put_lockspace(ls);
 
	if (error)
@@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
	if (!ls)
		return -ENOENT;
 
-	/* Deregister the misc device first, so we don't have
-	 * a device that's not attached to a lockspace. If
-	 * dlm_release_lockspace fails then we can recreate it
-	 */
-	error = misc_deregister(&ls->ls_device);
-	if (error) {
-		dlm_put_lockspace(ls);
-		goto out;
-	}
-	kfree(ls->ls_device.name);
-
	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
		force = 2;
 
	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
 
-	/* dlm_release_lockspace waits for references to go to zero,
-	   so all processes will need to close their device for the ls
-	   before the release will procede */
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
 
-	dlm_put_lockspace(ls);
	error = dlm_release_lockspace(lockspace, force);
-	if (error)
-		create_misc_device(ls, ls->ls_name);
- out:
+	if (error > 0)
+		error = 0;
	return error;
 }
 
@@ -527,8 +540,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
		k32buf = (struct dlm_write_request32 *)kbuf;
		kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
-		if (!kbuf)
+		if (!kbuf) {
+			kfree(k32buf);
			return -ENOMEM;
+		}
 
		if (proc)
			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
@@ -539,8 +554,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
	/* do we really need this? can a write happen after a close? */
	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
-	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)))
-		return -EINVAL;
+	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+		error = -EINVAL;
+		goto out_free;
+	}
 
	sigfillset(&allsigs);
	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
@@ -619,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
	struct dlm_user_proc *proc;
	struct dlm_ls *ls;
 
-	lock_kernel();
	ls = dlm_find_lockspace_device(iminor(inode));
-	if (!ls) {
-		unlock_kernel();
+	if (!ls)
		return -ENOENT;
-	}
 
	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
	if (!proc) {
		dlm_put_lockspace(ls);
-		unlock_kernel();
		return -ENOMEM;
	}
 
@@ -641,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
	spin_lock_init(&proc->locks_spin);
	init_waitqueue_head(&proc->wait);
	file->private_data = proc;
-	unlock_kernel();
 
	return 0;
 }
@@ -874,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
	return 0;
 }
 
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+
+	if (dlm_monitor_unused)
+		return 1;
+
+	return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
-	cycle_kernel_lock();
	file->private_data = NULL;
	return 0;
 }
@@ -886,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
	return 0;
 }
 
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&dlm_monitor_opened);
+	dlm_monitor_unused = 0;
+	return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&dlm_monitor_opened))
+		dlm_stop_lockspaces();
+	return 0;
+}
+
 static const struct file_operations device_fops = {
	.open    = device_open,
	.release = device_close,
@@ -909,19 +954,42 @@ static struct miscdevice ctl_device = {
	.minor = MISC_DYNAMIC_MINOR,
 };
 
+static const struct file_operations monitor_device_fops = {
+	.open    = monitor_device_open,
+	.release = monitor_device_close,
+	.owner   = THIS_MODULE,
+};
+
+static struct miscdevice monitor_device = {
+	.name  = "dlm-monitor",
+	.fops  = &monitor_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
 int __init dlm_user_init(void)
 {
	int error;
 
+	atomic_set(&dlm_monitor_opened, 0);
+
	error = misc_register(&ctl_device);
-	if (error)
+	if (error) {
		log_print("misc_register failed for control device");
+		goto out;
+	}
 
+	error = misc_register(&monitor_device);
+	if (error) {
+		log_print("misc_register failed for monitor device");
+		misc_deregister(&ctl_device);
+	}
+ out:
	return error;
 }
 
 void dlm_user_exit(void)
 {
	misc_deregister(&ctl_device);
+	misc_deregister(&monitor_device);
 }
 
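The new dlm-monitor device above gives the kernel a liveness signal for dlm_controld: the daemon opens the device and holds it; when the last opener closes it, dlm_stop_lockspaces() runs. A userspace sketch of that last-close pattern using C11 atomics; stop_everything() is a placeholder standing in for dlm_stop_lockspaces(), not a real API.

#include <stdio.h>
#include <stdatomic.h>

static atomic_int monitor_opened;

static void stop_everything(void)
{
	printf("last close: stopping lockspaces\n");
}

static void monitor_open(void)
{
	atomic_fetch_add(&monitor_opened, 1);
}

static void monitor_close(void)
{
	/* fetch_sub returns the old value; 1 means we were the last opener */
	if (atomic_fetch_sub(&monitor_opened, 1) == 1)
		stop_everything();
}

int main(void)
{
	monitor_open();          /* daemon starts and opens the device */
	monitor_open();          /* e.g. a second instance */
	monitor_close();         /* nothing happens yet */
	monitor_close();         /* count hits zero -> shutdown path */
	return 0;
}
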
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e4151..35eb6a13d616 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
 int dlm_user_init(void);
 void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
 
 #endif
diff --git a/fs/dquot.c b/fs/dquot.c
index 8ec4d6cc7633..ad7e59003e04 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -895,10 +895,9 @@ static void print_warning(struct dquot *dquot, const int warntype)
	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
		return;
 
-	mutex_lock(&tty_mutex);
	tty = get_current_tty();
	if (!tty)
-		goto out_lock;
+		return;
	tty_write_message(tty, dquot->dq_sb->s_id);
	if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
		tty_write_message(tty, ": warning, ");
@@ -926,8 +925,7 @@ static void print_warning(struct dquot *dquot, const int warntype)
		break;
	}
	tty_write_message(tty, msg);
-out_lock:
-	mutex_unlock(&tty_mutex);
+	tty_kref_put(tty);
 }
 #endif
 
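The dquot change above swaps a global tty_mutex held across the whole warning for reference counting: get_current_tty() now returns a counted reference that the caller pairs with tty_kref_put(). A userspace sketch of that get/put discipline; toy_tty and its helpers are invented stand-ins, and the kernel's kref_put frees through a release callback rather than printing.

#include <stdio.h>
#include <stdatomic.h>

struct toy_tty {
	atomic_int refcount;
	const char *name;
};

static struct toy_tty *tty_get(struct toy_tty *tty)
{
	if (tty)
		atomic_fetch_add(&tty->refcount, 1);
	return tty;
}

static void tty_put(struct toy_tty *tty)
{
	/* old value 1 means this put dropped the last reference */
	if (tty && atomic_fetch_sub(&tty->refcount, 1) == 1)
		printf("%s: released\n", tty->name);
}

static void print_warning(struct toy_tty *current_tty)
{
	struct toy_tty *tty = tty_get(current_tty);  /* get_current_tty() */

	if (!tty)
		return;
	printf("%s: quota exceeded\n", tty->name);
	tty_put(tty);                                /* tty_kref_put() */
}

int main(void)
{
	struct toy_tty tty = { .refcount = 1, .name = "pts/0" };

	print_warning(&tty);
	tty_put(&tty);   /* drop the original reference */
	return 0;
}
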
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 448dfd597b5f..8ebe9a5d1d99 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -211,7 +211,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
       ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
	{ecryptfs_opt_sig, "sig=%s"},
	{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
	{ecryptfs_opt_cipher, "cipher=%s"},
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 3a404e7fad53..291abb11e20e 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
	}
	unlock_kernel();
 
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 567b134fa1f1..73b19cfc91fc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
		   sb->inode_blocks *
		   (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
	buf->f_ffree   = sb->inode_free;	/* free inodes */
-	buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff;	/* fs ID */
-	buf->f_fsid.val[1] =  sb->fs_magic        & 0xffff;	/* fs ID */
	buf->f_namelen = EFS_MAXNAMELEN;	/* max filename length */
 
	return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0c87474f7917..7cc0eb756b55 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1041,10 +1041,7 @@ retry:
 }
 
 /*
- * It opens an eventpoll file descriptor. The "size" parameter is there
- * for historical reasons, when epoll was using an hash instead of an
- * RB tree. With the current implementation, the "size" parameter is ignored
- * (besides sanity checks).
+ * Open an eventpoll file descriptor.
 */
 asmlinkage long sys_epoll_create1(int flags)
 {
diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..cecee501ce78 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
	tsk->active_mm = mm;
	activate_mm(active_mm, mm);
	task_unlock(tsk);
-	mm_update_next_owner(old_mm);
	arch_pick_mmap_layout(mm);
	if (old_mm) {
		up_read(&old_mm->mmap_sem);
		BUG_ON(active_mm != old_mm);
+		mm_update_next_owner(old_mm);
		mmput(old_mm);
		return 0;
	}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33b..bae998c1e44e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 int __ext2_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c36293..45ed07122182 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.permission	= ext2_permission,
+	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51f..7658b33e2653 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/fiemap.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
 
 }
 
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
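
Note: generic_block_fiemap() walks the file with the filesystem's get_block callback and fills in the caller's extent array, so wiring up ->fiemap amounts to the few lines above. A userspace sketch of calling the resulting ioctl; error handling is minimal and the extent count of 16 is arbitrary:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd = open(argv[1], O_RDONLY);

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 16;	/* room for 16 extents */
	if (fd >= 0 && fm && ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);
	return 0;
}
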
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fd88c7b43e66..647cd888ac87 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -393,7 +393,7 @@ enum {
 	Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d3019..3be1e0689c9a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
+	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b111..ebfec4d0148e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/fiemap.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -981,6 +982,13 @@ out:
 	return ret;
 }
 
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f38a5afc39a1..399a96a6c556 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -760,7 +760,7 @@ enum {
 	Opt_grpquota
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2f..a8ff003a00f7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux ext4-filesystem routines.
 #
 
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		   ext4_jbd2.o migrate.o mballoc.o
 
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d6..cb45257a246e 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
 	}
 }
 
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
    if the ACL has not been cached */
 #define EXT4_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext4_permission (struct inode *, int);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
-#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_permission NULL
 
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
-#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ae5004e93fc..bd2ece228827 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	}
 	return used_blocks;
 }
+
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 	 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * @bh:			pointer to the buffer head to store the block
  *			group descriptor
  */
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
-					     struct buffer_head ** bh)
+					     struct buffer_head **bh)
 {
 	unsigned long group_desc;
 	unsigned long offset;
-	struct ext4_group_desc * desc;
+	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
-		ext4_error (sb, "ext4_get_group_desc",
+		ext4_error(sb, "ext4_get_group_desc",
 			    "block_group >= groups_count - "
 			    "block_group = %lu, groups_count = %lu",
 			    block_group, sbi->s_groups_count);
 
 		return NULL;
 	}
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error (sb, "ext4_get_group_desc",
+		ext4_error(sb, "ext4_get_group_desc",
 			    "Group descriptor not loaded - "
 			    "block_group = %lu, group_desc = %lu, desc = %lu",
 			     block_group, group_desc, offset);
 		return NULL;
 	}
 
@@ -302,8 +303,8 @@ err_out:
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-	struct ext4_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
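
Note: the dropped bh_uptodate_or_lock() tests only the buffer's uptodate bit, which is no longer sufficient once a group can carry BLOCK_UNINIT: an uptodate buffer may still need its bitmap synthesized in memory. For reference, the helper as it reads in fs/buffer.c around this kernel version (quoted from memory, so treat as approximate):

int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;	/* caller must read and unlock */
		unlock_buffer(bh);
	}
	return 1;			/* already valid, not locked */
}
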
@@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
  */
 	return bh;
 }
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end). Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext4_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-		if (verbose)
-			printk("reservation window 0x%p "
-			       "start: %llu, end: %llu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk("Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			ext4_group_t group, struct super_block *sb)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext4_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
-		    struct ext4_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext4_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
-	struct ext4_reserve_window_node *this;
-
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext4_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the	first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext4_release_file(): last writer close the file
- *	ext4_clear_inode(): last iput(), when nobody link to this file.
- *	ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext4_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
-	ext4_mb_discard_inode_preallocations(inode);
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window))
-			rsv_window_remove(inode->i_sb, rsv);
-		spin_unlock(rsv_lock);
-	}
-}
 
 /**
  * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
  * @block:		start physcial block to free
  * @count:		number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant.  There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code.  The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
  */
 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
-	struct ext4_group_desc * desc;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *desc;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
 	ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > ext4_blocks_count(es)) {
-		ext4_error (sb, "ext4_free_blocks",
+		ext4_error(sb, "ext4_free_blocks",
 			    "Freeing blocks not in datazone - "
 			    "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
 
 do_more:
 	overflow = 0;
@@ -694,7 +409,7 @@ do_more:
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
-	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
 	if (!desc)
 		goto error_return;
 
@@ -703,10 +418,10 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error (sb, "ext4_free_blocks",
+		ext4_error(sb, "ext4_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %llu, count = %lu",
 			    block, count);
 		goto error_return;
 	}
 
@@ -848,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		      ext4_fsblk_t block, unsigned long count,
 		      int metadata)
 {
-	struct super_block * sb;
+	struct super_block *sb;
 	unsigned long dquot_freed_blocks;
 
 	/* this isn't the right place to decide whether block is metadata
@@ -859,748 +574,52 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 	sb = inode->i_sb;
 
-	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-		ext4_free_blocks_sb(handle, sb, block, count,
-					&dquot_freed_blocks);
-	else
-		ext4_mb_free_blocks(handle, inode, block, count,
-				    metadata, &dquot_freed_blocks);
+	ext4_mb_free_blocks(handle, inode, block, count,
+			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
 }
 
-/**
- * ext4_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
-{
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
-
-	if (ext4_test_bit(nr, bh->b_data))
-		return 0;
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext4_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext4_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext4_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext4_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext4_test_allocatable(here, bh))
-			return here;
-		ext4_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
-
-	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext4_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
-		ext4_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal, unsigned long *count,
-			struct ext4_reserve_window *my_rsv)
-{
-	ext4_fsblk_t group_first_block;
-	ext4_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext4_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT4_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT4_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT4_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-				ext4_test_allocatable(grp_goal - 1,
-							bitmap_bh);
-				i++, grp_goal--)
-				;
-		}
-	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext4_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- * find_next_reservable_window():
- *	find a reservable space within the given range.
- *	It does not allocate the reservation window for now:
- *	alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@size: the target new reservation window size
- *
- *	@group_first_block: the first block we consider to start
- *	the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext4_reserve_window_node *search_head,
-				struct ext4_reserve_window_node *my_rsv,
-				struct super_block * sb,
-				ext4_fsblk_t start_block,
-				ext4_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext4_reserve_window_node *rsv, *prev;
-	ext4_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
-
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
-		}
-	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
-	 */
-
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole avaliable window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext4_rsv_window_add(sb, my_rsv);
-
-	return 0;
-}
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+			   s64 nblocks)
+{
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
+
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
+
+	if (!capable(CAP_SYS_RESOURCE) &&
+	    sbi->s_resuid != current->fsuid &&
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
+		}
+	}
+	/* Check whether we have space after
+	 *  accounting for current dirty blocks
+	 */
+	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+		/* we don't have free space */
+		return -ENOSPC;
+
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
+	return 0;
+}
 
 /**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
- *
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
- *
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
- *
- * failed: we failed to find a reservation window in this group
- *
- * @rsv: the reservation
- *
- * @grp_goal: The goal (group-relative).  It is where the search for a
- *	free reservable space should start from.
- *	if we have a grp_goal(grp_goal >0 ), then start from there,
- *	no grp_goal(grp_goal = -1), we start from the first block
- *	of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
-		ext4_grpblk_t grp_goal, struct super_block *sb,
-		ext4_group_t group, struct buffer_head *bitmap_bh)
-{
-	struct ext4_reserve_window_node *search_head;
-	ext4_fsblk_t group_first_block, group_end_block, start_block;
-	ext4_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT4_MAX_RESERVE_BLOCKS)
-				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
-		return 0;		/* success */
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext4_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-	ext4_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL ) {
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-				   (grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							  *count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-		    (my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;			/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext4_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of neeed blocks
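
Note: ext4_claim_free_blocks(), added in the hunk above, is the delayed-allocation claim step; callers reserve nblocks into s_dirtyblocks_counter up front and only convert them to real allocations later. A condensed sketch of that pattern with invented names (watermark plays the role of EXT4_FREEBLOCKS_WATERMARK); illustrative only:

#include <linux/percpu_counter.h>
#include <linux/errno.h>

/* illustrative only */
static int sketch_claim(struct percpu_counter *free_ctr,
			struct percpu_counter *dirty_ctr,
			s64 wanted, s64 watermark)
{
	s64 avail = percpu_counter_read_positive(free_ctr) -
		    percpu_counter_read_positive(dirty_ctr);

	if (avail - wanted < watermark)	/* near the edge: pay for exact sums */
		avail = percpu_counter_sum(free_ctr) -
			percpu_counter_sum(dirty_ctr);
	if (avail < wanted)
		return -ENOSPC;
	percpu_counter_add(dirty_ctr, wanted);	/* claim the blocks */
	return 0;
}
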
@@ -1610,26 +629,34 @@ out:
  * On success, return nblocks
  */
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-			ext4_fsblk_t nblocks)
+			s64 nblocks)
 {
-	ext4_fsblk_t free_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 	    sbi->s_resuid != current->fsuid &&
 	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
-#endif
-	if (free_blocks - root_blocks < nblocks)
-		return free_blocks - root_blocks;
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
+		/* we don't have free space */
+		return 0;
+
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+		return free_blocks - (root_blocks + dirty_blocks);
 	return nblocks;
- }
+}
 
 
 /**
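
Note on the hunk above: an approximate per-CPU counter read can be stale, so the rewrite falls back to the exact (and expensive) percpu_counter_sum() only when the cheap answer is within EXT4_FREEBLOCKS_WATERMARK of exhaustion; the old code did the same dance with FBC_BATCH under CONFIG_SMP. A back-of-envelope sketch of the drift bound the watermark has to cover (illustrative, not from this commit):

#include <linux/percpu_counter.h>

/* With a per-CPU batch of B and N online CPUs, a plain counter read
 * can be off by up to about N * B; any safety margin must cover that. */
static inline s64 sketch_worst_case_drift(unsigned int nr_cpus, s64 batch)
{
	return (s64)nr_cpus * batch;
}
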
@@ -1654,303 +681,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1654 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 681 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1655} 682}
1656 683
1657/**
1658 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1659 *
1660 * @handle: handle to this transaction
1661 * @inode: file inode
1662 * @goal: given target block(filesystem wide)
1663 * @count: target number of blocks to allocate
1664 * @errp: error code
1665 *
1666 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1667 * the block bitmap directly to do block allocation. It tries to
1668 * allocate block(s) from the block group contains the goal block first. If
1669 * that fails, it will try to allocate block(s) from other block groups
1670 * without any specific goal block.
1671 *
1672 * This function is called when -o nomballoc mount option is enabled
1673 *
1674 */
1675ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1676 ext4_fsblk_t goal, unsigned long *count, int *errp)
1677{
1678 struct buffer_head *bitmap_bh = NULL;
1679 struct buffer_head *gdp_bh;
1680 ext4_group_t group_no;
1681 ext4_group_t goal_group;
1682 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1683 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1684 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1685 ext4_group_t bgi; /* blockgroup iteration index */
1686 int fatal = 0, err;
1687 int performed_allocation = 0;
1688 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1689 struct super_block *sb;
1690 struct ext4_group_desc *gdp;
1691 struct ext4_super_block *es;
1692 struct ext4_sb_info *sbi;
1693 struct ext4_reserve_window_node *my_rsv = NULL;
1694 struct ext4_block_alloc_info *block_i;
1695 unsigned short windowsz = 0;
1696 ext4_group_t ngroups;
1697 unsigned long num = *count;
1698
1699 sb = inode->i_sb;
1700 if (!sb) {
1701 *errp = -ENODEV;
1702 printk("ext4_new_block: nonexistent device");
1703 return 0;
1704 }
1705
1706 sbi = EXT4_SB(sb);
1707 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1708 /*
1709 * With delalloc we already reserved the blocks
1710 */
1711 *count = ext4_has_free_blocks(sbi, *count);
1712 }
1713 if (*count == 0) {
1714 *errp = -ENOSPC;
1715 return 0; /*return with ENOSPC error */
1716 }
1717 num = *count;
1718
1719 /*
1720 * Check quota for allocation of this block.
1721 */
1722 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1723 *errp = -EDQUOT;
1724 return 0;
1725 }
1726
1727 sbi = EXT4_SB(sb);
1728 es = EXT4_SB(sb)->s_es;
1729 ext4_debug("goal=%llu.\n", goal);
1730 /*
1731 * Allocate a block from reservation only when
1732 * filesystem is mounted with reservation(default,-o reservation), and
1733 * it's a regular file, and
1734 * the desired window size is greater than 0 (One could use ioctl
1735 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1736 * reservation on that particular file)
1737 */
1738 block_i = EXT4_I(inode)->i_block_alloc_info;
1739 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1740 my_rsv = &block_i->rsv_window_node;
1741
1742 /*
1743 * First, test whether the goal block is free.
1744 */
1745 if (goal < le32_to_cpu(es->s_first_data_block) ||
1746 goal >= ext4_blocks_count(es))
1747 goal = le32_to_cpu(es->s_first_data_block);
1748 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1749 goal_group = group_no;
1750retry_alloc:
1751 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1752 if (!gdp)
1753 goto io_error;
1754
1755 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1756 /*
1757 * if there are not enough free blocks to make a new reservation,
1758 * turn off reservation for this allocation
1759 */
1760 if (my_rsv && (free_blocks < windowsz)
1761 && (rsv_is_empty(&my_rsv->rsv_window)))
1762 my_rsv = NULL;
1763
1764 if (free_blocks > 0) {
1765 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1766 if (!bitmap_bh)
1767 goto io_error;
1768 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1769 group_no, bitmap_bh, grp_target_blk,
1770 my_rsv, &num, &fatal);
1771 if (fatal)
1772 goto out;
1773 if (grp_alloc_blk >= 0)
1774 goto allocated;
1775 }
1776
1777 ngroups = EXT4_SB(sb)->s_groups_count;
1778 smp_rmb();
1779
1780 /*
1781 * Now search the rest of the groups. We assume that
1782 * group_no and gdp correctly point to the last group visited.
1783 */
1784 for (bgi = 0; bgi < ngroups; bgi++) {
1785 group_no++;
1786 if (group_no >= ngroups)
1787 group_no = 0;
1788 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1789 if (!gdp)
1790 goto io_error;
1791 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1792 /*
1793 * skip this group if the number of
1794 * free blocks is less than half of the reservation
1795 * window size.
1796 */
1797 if (free_blocks <= (windowsz/2))
1798 continue;
1799
1800 brelse(bitmap_bh);
1801 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1802 if (!bitmap_bh)
1803 goto io_error;
1804 /*
1805 * try to allocate block(s) from this group, without a goal (-1).
1806 */
1807 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1808 group_no, bitmap_bh, -1, my_rsv,
1809 &num, &fatal);
1810 if (fatal)
1811 goto out;
1812 if (grp_alloc_blk >= 0)
1813 goto allocated;
1814 }
1815 /*
1816 * We may end up with a bogus earlier ENOSPC error because the
1817 * filesystem is "full" of reservations, while
1818 * there may indeed be free blocks available on disk.
1819 * In this case, we just forget about the reservations and
1820 * do block allocation as if there were no reservations.
1821 */
1822 if (my_rsv) {
1823 my_rsv = NULL;
1824 windowsz = 0;
1825 group_no = goal_group;
1826 goto retry_alloc;
1827 }
1828 /* No space left on the device */
1829 *errp = -ENOSPC;
1830 goto out;
1831
1832allocated:
1833
1834 ext4_debug("using block group %lu(%d)\n",
1835 group_no, gdp->bg_free_blocks_count);
1836
1837 BUFFER_TRACE(gdp_bh, "get_write_access");
1838 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1839 if (fatal)
1840 goto out;
1841
1842 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1843
1844 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1845 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1846 in_range(ret_block, ext4_inode_table(sb, gdp),
1847 EXT4_SB(sb)->s_itb_per_group) ||
1848 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1849 EXT4_SB(sb)->s_itb_per_group)) {
1850 ext4_error(sb, "ext4_new_block",
1851 "Allocating block in system zone - "
1852 "blocks from %llu, length %lu",
1853 ret_block, num);
1854 /*
1855 * claim_block marked the blocks we allocated
1856 * as in use. So we may want to selectively
1857 * mark some of the blocks as free
1858 */
1859 goto retry_alloc;
1860 }
1861
1862 performed_allocation = 1;
1863
1864#ifdef CONFIG_JBD2_DEBUG
1865 {
1866 struct buffer_head *debug_bh;
1867
1868 /* Record bitmap buffer state in the newly allocated block */
1869 debug_bh = sb_find_get_block(sb, ret_block);
1870 if (debug_bh) {
1871 BUFFER_TRACE(debug_bh, "state when allocated");
1872 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1873 brelse(debug_bh);
1874 }
1875 }
1876 jbd_lock_bh_state(bitmap_bh);
1877 spin_lock(sb_bgl_lock(sbi, group_no));
1878 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1879 int i;
1880
1881 for (i = 0; i < num; i++) {
1882 if (ext4_test_bit(grp_alloc_blk+i,
1883 bh2jh(bitmap_bh)->b_committed_data)) {
1884 printk("%s: block was unexpectedly set in "
1885 "b_committed_data\n", __func__);
1886 }
1887 }
1888 }
1889 ext4_debug("found bit %d\n", grp_alloc_blk);
1890 spin_unlock(sb_bgl_lock(sbi, group_no));
1891 jbd_unlock_bh_state(bitmap_bh);
1892#endif
1893
1894 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1895 ext4_error(sb, "ext4_new_block",
1896 "block(%llu) >= blocks count(%llu) - "
1897 "block_group = %lu, es == %p ", ret_block,
1898 ext4_blocks_count(es), group_no, es);
1899 goto out;
1900 }
1901
1902 /*
1903 * It is up to the caller to add the new buffer to a journal
1904 * list of some description. We don't know in advance whether
1905 * the caller wants to use it as metadata or data.
1906 */
1907 spin_lock(sb_bgl_lock(sbi, group_no));
1908 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1909 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1910 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1911 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1912 spin_unlock(sb_bgl_lock(sbi, group_no));
1913 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1914 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1915
1916 if (sbi->s_log_groups_per_flex) {
1917 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1918 spin_lock(sb_bgl_lock(sbi, flex_group));
1919 sbi->s_flex_groups[flex_group].free_blocks -= num;
1920 spin_unlock(sb_bgl_lock(sbi, flex_group));
1921 }
1922
1923 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1924 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1925 if (!fatal)
1926 fatal = err;
1927
1928 sb->s_dirt = 1;
1929 if (fatal)
1930 goto out;
1931
1932 *errp = 0;
1933 brelse(bitmap_bh);
1934 DQUOT_FREE_BLOCK(inode, *count-num);
1935 *count = num;
1936 return ret_block;
1937
1938io_error:
1939 *errp = -EIO;
1940out:
1941 if (fatal) {
1942 *errp = fatal;
1943 ext4_std_error(sb, fatal);
1944 }
1945 /*
1946 * Undo the block allocation
1947 */
1948 if (!performed_allocation)
1949 DQUOT_FREE_BLOCK(inode, *count);
1950 brelse(bitmap_bh);
1951 return 0;
1952}
1953
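For reference, the group-scan order the removed allocator used -- goal group first, then every other group with wrap-around -- can be sketched in isolation. try_group() below is a hypothetical stand-in for the bitmap search, not a kernel call:

	/* Standalone sketch of the removed allocator's group scan order. */
	#include <stdio.h>

	#define NGROUPS 8

	static int try_group(unsigned g)
	{
		return g == 5;	/* pretend only group 5 has free blocks */
	}

	int main(void)
	{
		unsigned goal_group = 6, group = goal_group, i;

		for (i = 0; i < NGROUPS; i++) {
			if (try_group(group)) {
				printf("allocated from group %u\n", group);
				return 0;
			}
			if (++group >= NGROUPS)
				group = 0;	/* wrap, like the bgi loop above */
		}
		printf("ENOSPC\n");
		return 1;
	}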
1954#define EXT4_META_BLOCK 0x1 684#define EXT4_META_BLOCK 0x1
1955 685
1956static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, 686static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1960,10 +690,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1960 struct ext4_allocation_request ar; 690 struct ext4_allocation_request ar;
1961 ext4_fsblk_t ret; 691 ext4_fsblk_t ret;
1962 692
1963 if (!test_opt(inode->i_sb, MBALLOC)) {
1964 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1965 }
1966
1967 memset(&ar, 0, sizeof(ar)); 693 memset(&ar, 0, sizeof(ar));
1968 /* Fill with neighbour allocated blocks */ 694 /* Fill with neighbour allocated blocks */
1969 695
@@ -2005,7 +731,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2005 /* 731 /*
2006 * Account for the allocated meta blocks 732 * Account for the allocated meta blocks
2007 */ 733 */
2008 if (!(*errp)) { 734 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
2009 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 735 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2010 EXT4_I(inode)->i_allocated_meta_blocks += *count; 736 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2011 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 737 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2090,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2090 bitmap_count += x; 816 bitmap_count += x;
2091 } 817 }
2092 brelse(bitmap_bh); 818 brelse(bitmap_bh);
2093 printk("ext4_count_free_blocks: stored = %llu" 819 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
2094 ", computed = %llu, %llu\n", 820 ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
2095 ext4_free_blocks_count(es), 821 desc_count, bitmap_count);
2096 desc_count, bitmap_count);
2097 return bitmap_count; 822 return bitmap_count;
2098#else 823#else
2099 desc_count = 0; 824 desc_count = 0;
@@ -2180,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2180 905
2181 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || 906 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2182 metagroup < first_meta_bg) 907 metagroup < first_meta_bg)
2183 return ext4_bg_num_gdb_nometa(sb,group); 908 return ext4_bg_num_gdb_nometa(sb, group);
2184 909
2185 return ext4_bg_num_gdb_meta(sb,group); 910 return ext4_bg_num_gdb_meta(sb,group);
2186 911
2187} 912}
913
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea6750454..0a7a6663c190 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return 0;
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum); 28 return sum;
29} 29}
30 30
31#endif /* EXT4FS_DEBUG */ 31#endif /* EXT4FS_DEBUG */
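The nibblemap trick above counts free (zero) bits two nibbles at a time: nibblemap[n] is the number of zero bits in the 4-bit value n. A self-contained userspace version of the same technique, for illustration:

	/* Standalone nibble-table count of ZERO bits, as used above. */
	#include <stdio.h>

	static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};

	static unsigned long count_free(const unsigned char *map, unsigned int numchars)
	{
		unsigned long sum = 0;
		unsigned int i;

		for (i = 0; i < numchars; i++)
			sum += nibblemap[map[i] & 0xf] + nibblemap[(map[i] >> 4) & 0xf];
		return sum;
	}

	int main(void)
	{
		unsigned char bitmap[] = { 0xff, 0x0f, 0x00 };	/* 0 + 4 + 8 free bits */
		printf("%lu free bits\n", count_free(bitmap, sizeof(bitmap)));	/* 12 */
		return 0;
	}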
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d3d23d73c08b..3ca6a2b7632d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
33}; 33};
34 34
35static int ext4_readdir(struct file *, void *, filldir_t); 35static int ext4_readdir(struct file *, void *, filldir_t);
36static int ext4_dx_readdir(struct file * filp, 36static int ext4_dx_readdir(struct file *filp,
37 void * dirent, filldir_t filldir); 37 void *dirent, filldir_t filldir);
38static int ext4_release_dir (struct inode * inode, 38static int ext4_release_dir(struct inode *inode,
39 struct file * filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry (const char * function, struct inode * dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 * de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head * bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error (dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
93 93
94static int ext4_readdir(struct file * filp, 94static int ext4_readdir(struct file *filp,
95 void * dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
102 int err; 102 int err;
103 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0;
105 106
106 sb = inode->i_sb; 107 sb = inode->i_sb;
107 108
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
148 * of recovering data when there's a bad sector 149 * of recovering data when there's a bad sector
149 */ 150 */
150 if (!bh) { 151 if (!bh) {
151 ext4_error (sb, "ext4_readdir", 152 if (!dir_has_error) {
152 "directory #%lu contains a hole at offset %lu", 153 ext4_error(sb, __func__, "directory #%lu "
153 inode->i_ino, (unsigned long)filp->f_pos); 154 "contains a hole at offset %Lu",
155 inode->i_ino,
156 (unsigned long long) filp->f_pos);
157 dir_has_error = 1;
158 }
154 /* corrupt size? Maybe no more blocks to read */ 159 /* corrupt size? Maybe no more blocks to read */
155 if (filp->f_pos > inode->i_blocks << 9) 160 if (filp->f_pos > inode->i_blocks << 9)
156 break; 161 break;
@@ -187,14 +192,14 @@ revalidate:
187 while (!error && filp->f_pos < inode->i_size 192 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) { 193 && offset < sb->s_blocksize) {
189 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 194 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext4_check_dir_entry ("ext4_readdir", inode, de, 195 if (!ext4_check_dir_entry("ext4_readdir", inode, de,
191 bh, offset)) { 196 bh, offset)) {
192 /* 197 /*
193 * On error, skip the f_pos to the next block 198 * On error, skip the f_pos to the next block
194 */ 199 */
195 filp->f_pos = (filp->f_pos | 200 filp->f_pos = (filp->f_pos |
196 (sb->s_blocksize - 1)) + 1; 201 (sb->s_blocksize - 1)) + 1;
197 brelse (bh); 202 brelse(bh);
198 ret = stored; 203 ret = stored;
199 goto out; 204 goto out;
200 } 205 }
@@ -218,12 +223,12 @@ revalidate:
218 break; 223 break;
219 if (version != filp->f_version) 224 if (version != filp->f_version)
220 goto revalidate; 225 goto revalidate;
221 stored ++; 226 stored++;
222 } 227 }
223 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
224 } 229 }
225 offset = 0; 230 offset = 0;
226 brelse (bh); 231 brelse(bh);
227 } 232 }
228out: 233out:
229 return ret; 234 return ret;
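The recovery path above advances f_pos to the start of the next block with (f_pos | (blocksize - 1)) + 1, which works for power-of-two block sizes: ORing in blocksize - 1 lands on the last byte of the current block, and adding one crosses into the next. For blocksize 4096 and f_pos 5000, 5000 | 4095 = 8191, so the new position is 8192. A tiny sketch of the trick:

	/* Power-of-two round-up to the NEXT block, as in the readdir code. */
	#include <stdio.h>

	static unsigned long next_block(unsigned long pos, unsigned long bs)
	{
		return (pos | (bs - 1)) + 1;	/* bs must be a power of two */
	}

	int main(void)
	{
		printf("%lu\n", next_block(5000, 4096));	/* 8192 */
		printf("%lu\n", next_block(8192, 4096));	/* 12288: aligned input still advances */
		return 0;
	}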
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
290 parent = rb_parent(n); 295 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash); 296 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) { 297 while (fname) {
293 struct fname * old = fname; 298 struct fname *old = fname;
294 fname = fname->next; 299 fname = fname->next;
295 kfree (old); 300 kfree(old);
296 } 301 }
297 if (!parent) 302 if (!parent)
298 root->rb_node = NULL; 303 root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
331 struct ext4_dir_entry_2 *dirent) 336 struct ext4_dir_entry_2 *dirent)
332{ 337{
333 struct rb_node **p, *parent = NULL; 338 struct rb_node **p, *parent = NULL;
334 struct fname * fname, *new_fn; 339 struct fname *fname, *new_fn;
335 struct dir_private_info *info; 340 struct dir_private_info *info;
336 int len; 341 int len;
337 342
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
388 * for all entries on the fname linked list. (Normally there is only 393 * for all entries on the fname linked list. (Normally there is only
389 * one entry on the linked list, unless there are 62-bit hash collisions.) 394 * one entry on the linked list, unless there are 62-bit hash collisions.)
390 */ 395 */
391static int call_filldir(struct file * filp, void * dirent, 396static int call_filldir(struct file *filp, void *dirent,
392 filldir_t filldir, struct fname *fname) 397 filldir_t filldir, struct fname *fname)
393{ 398{
394 struct dir_private_info *info = filp->private_data; 399 struct dir_private_info *info = filp->private_data;
395 loff_t curr_pos; 400 loff_t curr_pos;
396 struct inode *inode = filp->f_path.dentry->d_inode; 401 struct inode *inode = filp->f_path.dentry->d_inode;
397 struct super_block * sb; 402 struct super_block *sb;
398 int error; 403 int error;
399 404
400 sb = inode->i_sb; 405 sb = inode->i_sb;
401 406
402 if (!fname) { 407 if (!fname) {
403 printk("call_filldir: called with null fname?!?\n"); 408 printk(KERN_ERR "ext4: call_filldir: called with "
409 "null fname?!?\n");
404 return 0; 410 return 0;
405 } 411 }
406 curr_pos = hash2pos(fname->hash, fname->minor_hash); 412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -411,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent,
411 get_dtype(sb, fname->file_type)); 417 get_dtype(sb, fname->file_type));
412 if (error) { 418 if (error) {
413 filp->f_pos = curr_pos; 419 filp->f_pos = curr_pos;
414 info->extra_fname = fname->next; 420 info->extra_fname = fname;
415 return error; 421 return error;
416 } 422 }
417 fname = fname->next; 423 fname = fname->next;
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
419 return 0; 425 return 0;
420} 426}
421 427
422static int ext4_dx_readdir(struct file * filp, 428static int ext4_dx_readdir(struct file *filp,
423 void * dirent, filldir_t filldir) 429 void *dirent, filldir_t filldir)
424{ 430{
425 struct dir_private_info *info = filp->private_data; 431 struct dir_private_info *info = filp->private_data;
426 struct inode *inode = filp->f_path.dentry->d_inode; 432 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -450,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp,
450 * If there are any leftover names on the hash collision 456 * If there are any leftover names on the hash collision
451 * chain, return them first. 457 * chain, return them first.
452 */ 458 */
453 if (info->extra_fname && 459 if (info->extra_fname) {
454 call_filldir(filp, dirent, filldir, info->extra_fname)) 460 if (call_filldir(filp, dirent, filldir, info->extra_fname))
455 goto finished; 461 goto finished;
456 462
457 if (!info->curr_node) 463 info->extra_fname = NULL;
464 info->curr_node = rb_next(info->curr_node);
465 if (!info->curr_node) {
466 if (info->next_hash == ~0) {
467 filp->f_pos = EXT4_HTREE_EOF;
468 goto finished;
469 }
470 info->curr_hash = info->next_hash;
471 info->curr_minor_hash = 0;
472 }
473 } else if (!info->curr_node)
458 info->curr_node = rb_first(&info->root); 474 info->curr_node = rb_first(&info->root);
459 475
460 while (1) { 476 while (1) {
@@ -501,7 +517,7 @@ finished:
501 return 0; 517 return 0;
502} 518}
503 519
504static int ext4_release_dir (struct inode * inode, struct file * filp) 520static int ext4_release_dir(struct inode *inode, struct file *filp)
505{ 521{
506 if (filp->private_data) 522 if (filp->private_data)
507 ext4_htree_free_dir_info(filp->private_data); 523 ext4_htree_free_dir_info(filp->private_data);
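The one-line call_filldir change (saving fname rather than fname->next in info->extra_fname) matters because the entry the filldir callback rejected was never delivered; resuming from its successor would silently drop it. A minimal userspace model of that resume rule, with all names illustrative:

	/* Sketch of the resume rule behind the call_filldir fix: on rejection,
	 * remember THAT entry, not its successor, or it is lost on retry. */
	#include <stdio.h>
	#include <stddef.h>

	struct fname { int val; struct fname *next; };

	static int budget = 2;		/* consumer accepts two entries per pass */

	static int filldir(int val)
	{
		if (budget-- <= 0)
			return -1;	/* "buffer full", caller must retry later */
		printf("emitted %d\n", val);
		return 0;
	}

	static struct fname *emit_chain(struct fname *fname)
	{
		while (fname) {
			if (filldir(fname->val))
				return fname;	/* resume from the rejected entry */
			fname = fname->next;
		}
		return NULL;
	}

	int main(void)
	{
		struct fname c = {3, NULL}, b = {2, &c}, a = {1, &b};
		struct fname *extra = emit_chain(&a);	/* emits 1, 2; returns &c */

		budget = 2;
		emit_chain(extra);			/* second pass emits 3, loses nothing */
		return 0;
	}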
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c7924d9e358..6690a41cdd9f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
44#ifdef EXT4FS_DEBUG 44#ifdef EXT4FS_DEBUG
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __func__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk(KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
52#define ext4_debug(f, a...) do {} while (0) 52#define ext4_debug(f, a...) do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
128#else 128#else
129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
130#endif 130#endif
131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) 131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
134#else 134#else
@@ -245,7 +245,7 @@ struct flex_groups {
245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
246 246
247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
248#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 248#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
249 249
250/* 250/*
251 * Inode dynamic state flags 251 * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS 291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
292#define EXT4_IOC_GETVERSION _IOR('f', 3, long) 292#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
293#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 293#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
294#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
295#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
296#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 294#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
297#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 295#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
298#ifdef CONFIG_JBD2_DEBUG 296#ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
300#endif 298#endif
301#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 299#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
302#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 300#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
303#define EXT4_IOC_MIGRATE _IO('f', 7) 301#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
302#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
303#define EXT4_IOC_MIGRATE _IO('f', 9)
304 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
304 305
305/* 306/*
306 * ioctl commands in 32 bit emulation 307 * ioctl commands in 32 bit emulation
@@ -538,8 +539,9 @@ do { \
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
543#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
544
543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 545/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
544#ifndef _LINUX_EXT2_FS_H 546#ifndef _LINUX_EXT2_FS_H
545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 547#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -667,7 +669,7 @@ struct ext4_super_block {
667}; 669};
668 670
669#ifdef __KERNEL__ 671#ifdef __KERNEL__
670static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) 672static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
671{ 673{
672 return sb->s_fs_info; 674 return sb->s_fs_info;
673} 675}
@@ -725,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
725 */ 727 */
726 728
727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
728 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) 730 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
730 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) 732 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
732 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) 734 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -789,6 +791,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
789#define EXT4_DEF_RESUID 0 791#define EXT4_DEF_RESUID 0
790#define EXT4_DEF_RESGID 0 792#define EXT4_DEF_RESGID 0
791 793
794#define EXT4_DEF_INODE_READAHEAD_BLKS 32
795
792/* 796/*
793 * Default mount options 797 * Default mount options
794 */ 798 */
@@ -954,6 +958,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
954void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 958void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
955 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 959 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
956 960
961extern struct proc_dir_entry *ext4_proc_root;
962
963#ifdef CONFIG_PROC_FS
964extern const struct file_operations ext4_ui_proc_fops;
965
966#define EXT4_PROC_HANDLER(name, var) \
967do { \
968 proc = proc_create_data(name, mode, sbi->s_proc, \
969 &ext4_ui_proc_fops, &sbi->s_##var); \
970 if (proc == NULL) { \
971 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
972 goto err_out; \
973 } \
974} while (0)
975#else
976#define EXT4_PROC_HANDLER(name, var)
977#endif
978
957/* 979/*
958 * Function prototypes 980 * Function prototypes
959 */ 981 */
@@ -981,23 +1003,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1003extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal, 1004 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp); 1005 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, 1006extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 1007extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks); 1008 s64 nblocks);
988extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 1009extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
989 ext4_fsblk_t block, unsigned long count, int metadata); 1010 ext4_fsblk_t block, unsigned long count, int metadata);
990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 1011extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
991 ext4_fsblk_t block, unsigned long count, 1012 ext4_fsblk_t block, unsigned long count,
992 unsigned long *pdquot_freed_blocks); 1013 unsigned long *pdquot_freed_blocks);
993extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); 1014extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
994extern void ext4_check_blocks_bitmap (struct super_block *); 1015extern void ext4_check_blocks_bitmap(struct super_block *);
995extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1016extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
996 ext4_group_t block_group, 1017 ext4_group_t block_group,
997 struct buffer_head ** bh); 1018 struct buffer_head ** bh);
998extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1019extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
999extern void ext4_init_block_alloc_info(struct inode *);
1000extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
1001 1020
1002/* dir.c */ 1021/* dir.c */
1003extern int ext4_check_dir_entry(const char *, struct inode *, 1022extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1028,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1009extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1028extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1010 1029
1011/* fsync.c */ 1030/* fsync.c */
1012extern int ext4_sync_file (struct file *, struct dentry *, int); 1031extern int ext4_sync_file(struct file *, struct dentry *, int);
1013 1032
1014/* hash.c */ 1033/* hash.c */
1015extern int ext4fs_dirhash(const char *name, int len, struct 1034extern int ext4fs_dirhash(const char *name, int len, struct
1016 dx_hash_info *hinfo); 1035 dx_hash_info *hinfo);
1017 1036
1018/* ialloc.c */ 1037/* ialloc.c */
1019extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); 1038extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
1020extern void ext4_free_inode (handle_t *, struct inode *); 1039extern void ext4_free_inode(handle_t *, struct inode *);
1021extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); 1040extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1022extern unsigned long ext4_count_free_inodes (struct super_block *); 1041extern unsigned long ext4_count_free_inodes(struct super_block *);
1023extern unsigned long ext4_count_dirs (struct super_block *); 1042extern unsigned long ext4_count_dirs(struct super_block *);
1024extern void ext4_check_inodes_bitmap (struct super_block *); 1043extern void ext4_check_inodes_bitmap(struct super_block *);
1025extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 1044extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1026 1045
1027/* mballoc.c */ 1046/* mballoc.c */
1028extern long ext4_mb_stats; 1047extern long ext4_mb_stats;
@@ -1032,7 +1051,7 @@ extern int ext4_mb_release(struct super_block *);
1032extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1051extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1033 struct ext4_allocation_request *, int *); 1052 struct ext4_allocation_request *, int *);
1034extern int ext4_mb_reserve_blocks(struct super_block *, int); 1053extern int ext4_mb_reserve_blocks(struct super_block *, int);
1035extern void ext4_mb_discard_inode_preallocations(struct inode *); 1054extern void ext4_discard_preallocations(struct inode *);
1036extern int __init init_ext4_mballoc(void); 1055extern int __init init_ext4_mballoc(void);
1037extern void exit_ext4_mballoc(void); 1056extern void exit_ext4_mballoc(void);
1038extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1057extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,39 +1069,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1050 ext4_lblk_t, int, int *); 1069 ext4_lblk_t, int, int *);
1051struct buffer_head *ext4_bread(handle_t *, struct inode *, 1070struct buffer_head *ext4_bread(handle_t *, struct inode *,
1052 ext4_lblk_t, int, int *); 1071 ext4_lblk_t, int, int *);
1072int ext4_get_block(struct inode *inode, sector_t iblock,
1073 struct buffer_head *bh_result, int create);
1053int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 1074int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1054 ext4_lblk_t iblock, unsigned long maxblocks, 1075 ext4_lblk_t iblock, unsigned long maxblocks,
1055 struct buffer_head *bh_result, 1076 struct buffer_head *bh_result,
1056 int create, int extend_disksize); 1077 int create, int extend_disksize);
1057 1078
1058extern struct inode *ext4_iget(struct super_block *, unsigned long); 1079extern struct inode *ext4_iget(struct super_block *, unsigned long);
1059extern int ext4_write_inode (struct inode *, int); 1080extern int ext4_write_inode(struct inode *, int);
1060extern int ext4_setattr (struct dentry *, struct iattr *); 1081extern int ext4_setattr(struct dentry *, struct iattr *);
1061extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1082extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1062 struct kstat *stat); 1083 struct kstat *stat);
1063extern void ext4_delete_inode (struct inode *); 1084extern void ext4_delete_inode(struct inode *);
1064extern int ext4_sync_inode (handle_t *, struct inode *); 1085extern int ext4_sync_inode(handle_t *, struct inode *);
1065extern void ext4_discard_reservation (struct inode *);
1066extern void ext4_dirty_inode(struct inode *); 1086extern void ext4_dirty_inode(struct inode *);
1067extern int ext4_change_inode_journal_flag(struct inode *, int); 1087extern int ext4_change_inode_journal_flag(struct inode *, int);
1068extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1088extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1069extern int ext4_can_truncate(struct inode *inode); 1089extern int ext4_can_truncate(struct inode *inode);
1070extern void ext4_truncate (struct inode *); 1090extern void ext4_truncate(struct inode *);
1071extern void ext4_set_inode_flags(struct inode *); 1091extern void ext4_set_inode_flags(struct inode *);
1072extern void ext4_get_inode_flags(struct ext4_inode_info *); 1092extern void ext4_get_inode_flags(struct ext4_inode_info *);
1073extern void ext4_set_aops(struct inode *inode); 1093extern void ext4_set_aops(struct inode *inode);
1074extern int ext4_writepage_trans_blocks(struct inode *); 1094extern int ext4_writepage_trans_blocks(struct inode *);
1095extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1096extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1075extern int ext4_block_truncate_page(handle_t *handle, 1097extern int ext4_block_truncate_page(handle_t *handle,
1076 struct address_space *mapping, loff_t from); 1098 struct address_space *mapping, loff_t from);
1077extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); 1099extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1078 1100
1079/* ioctl.c */ 1101/* ioctl.c */
1080extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1102extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1081extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); 1103extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1082 1104
1083/* migrate.c */ 1105/* migrate.c */
1084extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, 1106extern int ext4_ext_migrate(struct inode *);
1085 unsigned long);
1086/* namei.c */ 1107/* namei.c */
1087extern int ext4_orphan_add(handle_t *, struct inode *); 1108extern int ext4_orphan_add(handle_t *, struct inode *);
1088extern int ext4_orphan_del(handle_t *, struct inode *); 1109extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1097,14 +1118,14 @@ extern int ext4_group_extend(struct super_block *sb,
1097 ext4_fsblk_t n_blocks_count); 1118 ext4_fsblk_t n_blocks_count);
1098 1119
1099/* super.c */ 1120/* super.c */
1100extern void ext4_error (struct super_block *, const char *, const char *, ...) 1121extern void ext4_error(struct super_block *, const char *, const char *, ...)
1101 __attribute__ ((format (printf, 3, 4))); 1122 __attribute__ ((format (printf, 3, 4)));
1102extern void __ext4_std_error (struct super_block *, const char *, int); 1123extern void __ext4_std_error(struct super_block *, const char *, int);
1103extern void ext4_abort (struct super_block *, const char *, const char *, ...) 1124extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1104 __attribute__ ((format (printf, 3, 4))); 1125 __attribute__ ((format (printf, 3, 4)));
1105extern void ext4_warning (struct super_block *, const char *, const char *, ...) 1126extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1106 __attribute__ ((format (printf, 3, 4))); 1127 __attribute__ ((format (printf, 3, 4)));
1107extern void ext4_update_dynamic_rev (struct super_block *sb); 1128extern void ext4_update_dynamic_rev(struct super_block *sb);
1108extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1129extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1109 __u32 compat); 1130 __u32 compat);
1110extern int ext4_update_rocompat_feature(handle_t *handle, 1131extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1177,7 +1198,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1177 1198
1178static inline 1199static inline
1179struct ext4_group_info *ext4_get_group_info(struct super_block *sb, 1200struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1180 ext4_group_t group) 1201 ext4_group_t group)
1181{ 1202{
1182 struct ext4_group_info ***grp_info; 1203 struct ext4_group_info ***grp_info;
1183 long indexv, indexh; 1204 long indexv, indexh;
@@ -1205,6 +1226,28 @@ do { \
1205 __ext4_std_error((sb), __func__, (errno)); \ 1226 __ext4_std_error((sb), __func__, (errno)); \
1206} while (0) 1227} while (0)
1207 1228
1229#ifdef CONFIG_SMP
1230/* Each CPU can accumulate up to FBC_BATCH blocks in its local
1231 * counter. So we need to make sure we have more free blocks
1232 * than FBC_BATCH * nr_cpu_ids, plus a safety window of 4 times that.
1233 */
1234#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
1235#else
1236#define EXT4_FREEBLOCKS_WATERMARK 0
1237#endif
1238
1239static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1240{
1241 /*
1242 * XXX: replace with spinlock if seen contended -bzzz
1243 */
1244 down_write(&EXT4_I(inode)->i_data_sem);
1245 if (newsize > EXT4_I(inode)->i_disksize)
1246 EXT4_I(inode)->i_disksize = newsize;
1247 up_write(&EXT4_I(inode)->i_data_sem);
1248 return;
1249}
1250
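ext4_update_i_disksize above only ever grows i_disksize, and does so under the write side of i_data_sem. A userspace analogue of the same grow-only-under-lock pattern, with a pthread rwlock standing in for the rw_semaphore:

	/* Userspace analogue of the grow-only i_disksize update above. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t data_sem = PTHREAD_RWLOCK_INITIALIZER;
	static long long disksize;

	static void update_disksize(long long newsize)
	{
		pthread_rwlock_wrlock(&data_sem);
		if (newsize > disksize)		/* never shrink here */
			disksize = newsize;
		pthread_rwlock_unlock(&data_sem);
	}

	int main(void)
	{
		update_disksize(4096);
		update_disksize(1024);		/* ignored: smaller than current */
		printf("%lld\n", disksize);	/* 4096 */
		return 0;
	}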
1208/* 1251/*
1209 * Inodes and files operations 1252 * Inodes and files operations
1210 */ 1253 */
@@ -1227,6 +1270,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1227/* extents.c */ 1270/* extents.c */
1228extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1271extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1229extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1272extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1273extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1274 int chunk);
1230extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1275extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1231 ext4_lblk_t iblock, 1276 ext4_lblk_t iblock,
1232 unsigned long max_blocks, struct buffer_head *bh_result, 1277 unsigned long max_blocks, struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6c166c0a54b7..bec7ce59fc0d 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
124#define EXT4_EXT_CACHE_GAP 1 124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2 125#define EXT4_EXT_CACHE_EXTENT 2
126 126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 struct ext4_extent *, void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
127 140
128#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
129 142
@@ -216,12 +229,16 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
216extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 229extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
217extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 230extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
218extern int ext4_extent_tree_init(handle_t *, struct inode *); 231extern int ext4_extent_tree_init(handle_t *, struct inode *);
219extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); 232extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
233 int num,
234 struct ext4_ext_path *path);
220extern int ext4_ext_try_to_merge(struct inode *inode, 235extern int ext4_ext_try_to_merge(struct inode *inode,
221 struct ext4_ext_path *path, 236 struct ext4_ext_path *path,
222 struct ext4_extent *); 237 struct ext4_extent *);
223extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 238extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
224extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 239extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
240extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
241 ext_prepare_callback, void *);
225extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 242extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
226 struct ext4_ext_path *); 243 struct ext4_ext_path *);
227extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, 244extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e4..5c124c0ac6d3 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned long ext4_group_t;
35 35
36struct ext4_reserve_window {
37 ext4_fsblk_t _rsv_start; /* First byte reserved */
38 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
39};
40
41struct ext4_reserve_window_node {
42 struct rb_node rsv_node;
43 __u32 rsv_goal_size;
44 __u32 rsv_alloc_hit;
45 struct ext4_reserve_window rsv_window;
46};
47
48struct ext4_block_alloc_info {
49 /* information about reservation window */
50 struct ext4_reserve_window_node rsv_window_node;
51 /*
52 * was i_next_alloc_block in ext4_inode_info
53 * is the logical (file-relative) number of the
54 * most-recently-allocated block in this file.
55 * We use this for detecting linearly ascending allocation requests.
56 */
57 ext4_lblk_t last_alloc_logical_block;
58 /*
59 * Was i_next_alloc_goal in ext4_inode_info
60 * is the *physical* companion to i_next_alloc_block.
61 * it is the physical block number of the block which was most recently
62 * allocated to this file. This gives us the goal (target) for the next
63 * allocation when we detect linearly ascending requests.
64 */
65 ext4_fsblk_t last_alloc_physical_block;
66};
67
68#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
69#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
70 38
@@ -97,11 +65,8 @@ struct ext4_inode_info {
97 ext4_group_t i_block_group; 65 ext4_group_t i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */ 66 __u32 i_state; /* Dynamic state flags for ext4 */
99 67
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 ext4_lblk_t i_dir_start_lookup; 68 ext4_lblk_t i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR 69#ifdef CONFIG_EXT4_FS_XATTR
105 /* 70 /*
106 * Extended attributes can be read independently of the main file 71 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention 72 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
111 */ 76 */
112 struct rw_semaphore xattr_sem; 77 struct rw_semaphore xattr_sem;
113#endif 78#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 79#ifdef CONFIG_EXT4_FS_POSIX_ACL
115 struct posix_acl *i_acl; 80 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl; 81 struct posix_acl *i_default_acl;
117#endif 82#endif
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index eb8bc3afe6e9..b455c685a98b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
53 53
54/*
55 * Define the number of metadata blocks we need to account for when modifying data.
56 *
57 * This includes the super block, inode block, quota blocks and xattr blocks.
58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
61
54/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
55 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
56 * generous. We can grow the delete transaction later if necessary. */ 64 * generous. We can grow the delete transaction later if necessary. */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d5531..6a0b40d43264 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
40 unsigned long s_blocks_last; /* Last seen block count */ 40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */ 42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ 43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc; 44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt; 45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block; 46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid; 47 uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
59 struct percpu_counter s_freeblocks_counter; 60 struct percpu_counter s_freeblocks_counter;
60 struct percpu_counter s_freeinodes_counter; 61 struct percpu_counter s_freeinodes_counter;
61 struct percpu_counter s_dirs_counter; 62 struct percpu_counter s_dirs_counter;
63 struct percpu_counter s_dirtyblocks_counter;
62 struct blockgroup_lock s_blockgroup_lock; 64 struct blockgroup_lock s_blockgroup_lock;
65 struct proc_dir_entry *s_proc;
63 66
64 /* root of the per fs reservation window tree */ 67 /* root of the per fs reservation window tree */
65 spinlock_t s_rsv_window_lock; 68 spinlock_t s_rsv_window_lock;
66 struct rb_root s_rsv_window_root; 69 struct rb_root s_rsv_window_root;
67 struct ext4_reserve_window_node s_rsv_window_head;
68 70
69 /* Journaling */ 71 /* Journaling */
70 struct inode * s_journal_inode; 72 struct inode *s_journal_inode;
71 struct journal_s * s_journal; 73 struct journal_s *s_journal;
72 struct list_head s_orphan; 74 struct list_head s_orphan;
73 unsigned long s_commit_interval; 75 unsigned long s_commit_interval;
74 struct block_device *journal_bdev; 76 struct block_device *journal_bdev;
@@ -106,12 +108,12 @@ struct ext4_sb_info {
106 108
107 /* tunables */ 109 /* tunables */
108 unsigned long s_stripe; 110 unsigned long s_stripe;
109 unsigned long s_mb_stream_request; 111 unsigned int s_mb_stream_request;
110 unsigned long s_mb_max_to_scan; 112 unsigned int s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan; 113 unsigned int s_mb_min_to_scan;
112 unsigned long s_mb_stats; 114 unsigned int s_mb_stats;
113 unsigned long s_mb_order2_reqs; 115 unsigned int s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc; 116 unsigned int s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */ 117 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group; 118 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start; 119 unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@ struct ext4_sb_info {
121 int s_mb_history_cur; 123 int s_mb_history_cur;
122 int s_mb_history_max; 124 int s_mb_history_max;
123 int s_mb_history_num; 125 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock; 126 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter; 127 int s_mb_history_filter;
127 128
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 612c3d2c3824..ea2ce3c0ae66 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/falloc.h> 41#include <linux/falloc.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "ext4_extents.h" 45#include "ext4_extents.h"
45 46
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
383 ext_debug("\n"); 384 ext_debug("\n");
384} 385}
385#else 386#else
386#define ext4_ext_show_path(inode,path) 387#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode,path) 388#define ext4_ext_show_leaf(inode, path)
388#endif 389#endif
389 390
390void ext4_ext_drop_refs(struct ext4_ext_path *path) 391void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
440 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 441 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441 if (k != 0 && 442 if (k != 0 &&
442 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 443 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443 printk("k=%d, ix=0x%p, first=0x%p\n", k, 444 printk(KERN_DEBUG "k=%d, ix=0x%p, "
444 ix, EXT_FIRST_INDEX(eh)); 445 "first=0x%p\n", k,
445 printk("%u <= %u\n", 446 ix, EXT_FIRST_INDEX(eh));
447 printk(KERN_DEBUG "%u <= %u\n",
446 le32_to_cpu(ix->ei_block), 448 le32_to_cpu(ix->ei_block),
447 le32_to_cpu(ix[-1].ei_block)); 449 le32_to_cpu(ix[-1].ei_block));
448 } 450 }
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1475 struct ext4_ext_path *path, 1477 struct ext4_ext_path *path,
1476 struct ext4_extent *newext) 1478 struct ext4_extent *newext)
1477{ 1479{
1478 struct ext4_extent_header * eh; 1480 struct ext4_extent_header *eh;
1479 struct ext4_extent *ex, *fex; 1481 struct ext4_extent *ex, *fex;
1480 struct ext4_extent *nearex; /* nearest extent */ 1482 struct ext4_extent *nearex; /* nearest extent */
1481 struct ext4_ext_path *npath = NULL; 1483 struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
1625 return err; 1627 return err;
1626} 1628}
1627 1629
1630int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1631 ext4_lblk_t num, ext_prepare_callback func,
1632 void *cbdata)
1633{
1634 struct ext4_ext_path *path = NULL;
1635 struct ext4_ext_cache cbex;
1636 struct ext4_extent *ex;
1637 ext4_lblk_t next, start = 0, end = 0;
1638 ext4_lblk_t last = block + num;
1639 int depth, exists, err = 0;
1640
1641 BUG_ON(func == NULL);
1642 BUG_ON(inode == NULL);
1643
1644 while (block < last && block != EXT_MAX_BLOCK) {
1645 num = last - block;
1646 /* find extent for this block */
1647 path = ext4_ext_find_extent(inode, block, path);
1648 if (IS_ERR(path)) {
1649 err = PTR_ERR(path);
1650 path = NULL;
1651 break;
1652 }
1653
1654 depth = ext_depth(inode);
1655 BUG_ON(path[depth].p_hdr == NULL);
1656 ex = path[depth].p_ext;
1657 next = ext4_ext_next_allocated_block(path);
1658
1659 exists = 0;
1660 if (!ex) {
1661 /* there is no extent yet, so try to allocate
1662 * all requested space */
1663 start = block;
1664 end = block + num;
1665 } else if (le32_to_cpu(ex->ee_block) > block) {
1666 /* need to allocate space before found extent */
1667 start = block;
1668 end = le32_to_cpu(ex->ee_block);
1669 if (block + num < end)
1670 end = block + num;
1671 } else if (block >= le32_to_cpu(ex->ee_block)
1672 + ext4_ext_get_actual_len(ex)) {
1673 /* need to allocate space after found extent */
1674 start = block;
1675 end = block + num;
1676 if (end >= next)
1677 end = next;
1678 } else if (block >= le32_to_cpu(ex->ee_block)) {
1679 /*
1680 * some part of requested space is covered
1681 * by found extent
1682 */
1683 start = block;
1684 end = le32_to_cpu(ex->ee_block)
1685 + ext4_ext_get_actual_len(ex);
1686 if (block + num < end)
1687 end = block + num;
1688 exists = 1;
1689 } else {
1690 BUG();
1691 }
1692 BUG_ON(end <= start);
1693
1694 if (!exists) {
1695 cbex.ec_block = start;
1696 cbex.ec_len = end - start;
1697 cbex.ec_start = 0;
1698 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1699 } else {
1700 cbex.ec_block = le32_to_cpu(ex->ee_block);
1701 cbex.ec_len = ext4_ext_get_actual_len(ex);
1702 cbex.ec_start = ext_pblock(ex);
1703 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1704 }
1705
1706 BUG_ON(cbex.ec_len == 0);
1707 err = func(inode, path, &cbex, ex, cbdata);
1708 ext4_ext_drop_refs(path);
1709
1710 if (err < 0)
1711 break;
1712
1713 if (err == EXT_REPEAT)
1714 continue;
1715 else if (err == EXT_BREAK) {
1716 err = 0;
1717 break;
1718 }
1719
1720 if (ext_depth(inode) != depth) {
1721 /* depth was changed. we have to realloc path */
1722 kfree(path);
1723 path = NULL;
1724 }
1725
1726 block = cbex.ec_block + cbex.ec_len;
1727 }
1728
1729 if (path) {
1730 ext4_ext_drop_refs(path);
1731 kfree(path);
1732 }
1733
1734 return err;
1735}
1736
1628static void 1737static void
1629ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1738ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1630 __u32 len, ext4_fsblk_t start, int type) 1739 __u32 len, ext4_fsblk_t start, int type)
@@ -1747,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1747} 1856}
1748 1857
1749/* 1858/*
1750 * ext4_ext_calc_credits_for_insert: 1859 * ext4_ext_calc_credits_for_single_extent:
1751 * This routine returns max. credits that the extent tree can consume. 1860 * This routine returns the max. credits needed to insert an extent
1752 * It should be OK for low-performance paths like ->writepage() 1861 * to the extent tree.
1753 * To allow many writing processes to fit into a single transaction, 1862 * When passing the actual path, the caller should calculate credits
1754 * the caller should calculate credits under i_data_sem and 1863 * under i_data_sem.
1755 * pass the actual path.
1756 */ 1864 */
1757int ext4_ext_calc_credits_for_insert(struct inode *inode, 1865int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1758 struct ext4_ext_path *path) 1866 struct ext4_ext_path *path)
1759{ 1867{
1760 int depth, needed;
1761
1762 if (path) { 1868 if (path) {
1869 int depth = ext_depth(inode);
1870 int ret = 0;
1871
1763 /* probably there is space in leaf? */ 1872 /* probably there is space in leaf? */
1764 depth = ext_depth(inode);
1765 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 1873 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1766 < le16_to_cpu(path[depth].p_hdr->eh_max)) 1874 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1767 return 1;
1768 }
1769
1770 /*
1771 * given 32-bit logical block (4294967296 blocks), max. tree
1772 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1773 * Let's also add one more level for imbalance.
1774 */
1775 depth = 5;
1776 1875
1777 /* allocation of new data block(s) */ 1876 /*
1778 needed = 2; 1877 * There is some space in the leaf, so no
1878 * need to account for a leaf block credit.
1879 *
1880 * Bitmaps, block group descriptor blocks
1881 * and other metadata blocks still need to be
1882 * accounted for.
1883 */
1884 /* 1 bitmap, 1 block group descriptor */
1885 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1886 }
1887 }
1779 1888
1780 /* 1889 return ext4_chunk_trans_blocks(inode, nrblocks);
1781 * tree can be full, so it would need to grow in depth: 1890}
1782 * we need one credit to modify old root, credits for
1783 * new root will be added in split accounting
1784 */
1785 needed += 1;
1786 1891
1787 /* 1892/*
1788 * Index split can happen, we would need: 1893 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
1789 * allocate intermediate indexes (bitmap + group) 1894 *
1790 * + change two blocks at each level, but root (already included) 1895 * If nrblocks fit in a single extent (chunk flag is 1), then
1791 */ 1896 * in the worst case each tree level's index/leaf needs to be
1792 needed += (depth * 2) + (depth * 2); 1897 * changed; if the tree splits due to inserting a new extent,
1898 * then the old index/leaf blocks need to be updated too.
1899 *
1900 * If the nrblocks are discontiguous, they could cause
1901 * the whole tree to split more than once, but this is really rare.
1902 */
1903int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1904{
1905 int index;
1906 int depth = ext_depth(inode);
1793 1907
1794 /* any allocation modifies superblock */ 1908 if (chunk)
1795 needed += 1; 1909 index = depth * 2;
1910 else
1911 index = depth * 3;
1796 1912
1797 return needed; 1913 return index;
1798} 1914}
1799 1915
1800static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1916static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
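The new budgeting splits cleanly: ext4_ext_calc_credits_for_single_extent() handles the leaf fast path plus metadata, while ext4_ext_index_trans_blocks() bounds the index/leaf blocks dirtied per tree level, two per level for a single contiguous chunk and three when the blocks are discontiguous (per the comment, a split can dirty both the old and the new path, and discontiguous blocks may split the tree more than once). That arithmetic restated as a self-contained userspace demo, illustrative only:

    #include <stdio.h>

    /* mirrors ext4_ext_index_trans_blocks()'s worst-case estimate */
    static int index_trans_blocks(int depth, int chunk)
    {
            return chunk ? depth * 2 : depth * 3;
    }

    int main(void)
    {
            int depth;

            for (depth = 1; depth <= 4; depth++)
                    printf("depth %d: contiguous=%d discontiguous=%d\n",
                           depth, index_trans_blocks(depth, 1),
                           index_trans_blocks(depth, 0));
            return 0;
    }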
@@ -1921,9 +2037,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1921 correct_index = 1; 2037 correct_index = 1;
1922 credits += (ext_depth(inode)) + 1; 2038 credits += (ext_depth(inode)) + 1;
1923 } 2039 }
1924#ifdef CONFIG_QUOTA
1925 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2040 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1926#endif
1927 2041
1928 err = ext4_ext_journal_restart(handle, credits); 2042 err = ext4_ext_journal_restart(handle, credits);
1929 if (err) 2043 if (err)
@@ -2137,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
2137 */ 2251 */
2138 2252
2139 if (test_opt(sb, EXTENTS)) { 2253 if (test_opt(sb, EXTENTS)) {
2140 printk("EXT4-fs: file extents enabled"); 2254 printk(KERN_INFO "EXT4-fs: file extents enabled");
2141#ifdef AGGRESSIVE_TEST 2255#ifdef AGGRESSIVE_TEST
2142 printk(", aggressive tests"); 2256 printk(", aggressive tests");
2143#endif 2257#endif
@@ -2691,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2691 goto out2; 2805 goto out2;
2692 } 2806 }
2693 /* 2807 /*
2694 * Okay, we need to do block allocation. Lazily initialize the block 2808 * Okay, we need to do block allocation.
2695 * allocation info here if necessary.
2696 */ 2809 */
2697 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2698 ext4_init_block_alloc_info(inode);
2699 2810
2700 /* find neighbour allocated blocks */ 2811 /* find neighbour allocated blocks */
2701 ar.lleft = iblock; 2812 ar.lleft = iblock;
@@ -2755,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2755 /* free data blocks we just allocated */ 2866 /* free data blocks we just allocated */
2756 /* not a good idea to call discard here directly, 2867 /* not a good idea to call discard here directly,
2757 * but otherwise we'd need to call it every free() */ 2868 * but otherwise we'd need to call it every free() */
2758 ext4_mb_discard_inode_preallocations(inode); 2869 ext4_discard_preallocations(inode);
2759 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2870 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2760 ext4_ext_get_actual_len(&newex), 0); 2871 ext4_ext_get_actual_len(&newex), 0);
2761 goto out2; 2872 goto out2;
@@ -2805,7 +2916,7 @@ void ext4_ext_truncate(struct inode *inode)
2805 /* 2916 /*
2806 * probably first extent we're gonna free will be last in block 2917 * probably first extent we're gonna free will be last in block
2807 */ 2918 */
2808 err = ext4_writepage_trans_blocks(inode) + 3; 2919 err = ext4_writepage_trans_blocks(inode);
2809 handle = ext4_journal_start(inode, err); 2920 handle = ext4_journal_start(inode, err);
2810 if (IS_ERR(handle)) 2921 if (IS_ERR(handle))
2811 return; 2922 return;
@@ -2819,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
2819 down_write(&EXT4_I(inode)->i_data_sem); 2930 down_write(&EXT4_I(inode)->i_data_sem);
2820 ext4_ext_invalidate_cache(inode); 2931 ext4_ext_invalidate_cache(inode);
2821 2932
2822 ext4_mb_discard_inode_preallocations(inode); 2933 ext4_discard_preallocations(inode);
2823 2934
2824 /* 2935 /*
2825 * TODO: optimization is possible here. 2936 * TODO: optimization is possible here.
@@ -2858,27 +2969,6 @@ out_stop:
2858 ext4_journal_stop(handle); 2969 ext4_journal_stop(handle);
2859} 2970}
2860 2971
2861/*
2862 * ext4_ext_writepage_trans_blocks:
2863 * calculate max number of blocks we could modify
2864 * in order to allocate new block for an inode
2865 */
2866int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2867{
2868 int needed;
2869
2870 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2871
2872 /* caller wants to allocate num blocks, but note it includes sb */
2873 needed = needed * num - (num - 1);
2874
2875#ifdef CONFIG_QUOTA
2876 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2877#endif
2878
2879 return needed;
2880}
2881
2882static void ext4_falloc_update_inode(struct inode *inode, 2972static void ext4_falloc_update_inode(struct inode *inode,
2883 int mode, loff_t new_size, int update_ctime) 2973 int mode, loff_t new_size, int update_ctime)
2884{ 2974{
@@ -2893,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
2893 * Update only when preallocation was requested beyond 2983 * Update only when preallocation was requested beyond
2894 * the file size. 2984 * the file size.
2895 */ 2985 */
2896 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2986 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2897 new_size > i_size_read(inode)) { 2987 if (new_size > i_size_read(inode))
2898 i_size_write(inode, new_size); 2988 i_size_write(inode, new_size);
2899 EXT4_I(inode)->i_disksize = new_size; 2989 if (new_size > EXT4_I(inode)->i_disksize)
2990 ext4_update_i_disksize(inode, new_size);
2900 } 2991 }
2901 2992
2902} 2993}
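What this buys userspace: without FALLOC_FL_KEEP_SIZE a preallocation past EOF grows i_size (and now i_disksize via ext4_update_i_disksize()); with the flag, blocks are reserved but the visible size is untouched. A minimal exerciser, assuming a kernel and libc that expose fallocate(2):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <linux/falloc.h>

    int main(void)
    {
            struct stat st;
            int fd = open("testfile", O_CREAT | O_RDWR, 0644);

            /* reserve 1 MiB without changing the visible file size */
            if (fd < 0 || fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
                    perror("fallocate");
                    return 1;
            }
            fstat(fd, &st);
            /* st_size stays 0; st_blocks reflects the preallocation */
            printf("size=%lld blocks=%lld\n",
                   (long long)st.st_size, (long long)st.st_blocks);
            return 0;
    }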
@@ -2939,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2939 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3030 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2940 - block; 3031 - block;
2941 /* 3032 /*
2942 * credits to insert 1 extent into extent tree + buffers to be able to 3033 * credits to insert 1 extent into extent tree
2943 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2944 */ 3034 */
2945 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 3035 credits = ext4_chunk_trans_blocks(inode, max_blocks);
2946 mutex_lock(&inode->i_mutex); 3036 mutex_lock(&inode->i_mutex);
2947retry: 3037retry:
2948 while (ret >= 0 && ret < max_blocks) { 3038 while (ret >= 0 && ret < max_blocks) {
@@ -2989,3 +3079,143 @@ retry:
2989 mutex_unlock(&inode->i_mutex); 3079 mutex_unlock(&inode->i_mutex);
2990 return ret > 0 ? ret2 : ret; 3080 return ret > 0 ? ret2 : ret;
2991} 3081}
3082
3083/*
3084 * Callback function called for each extent to gather FIEMAP information.
3085 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data)
3089{
3090 struct fiemap_extent_info *fieinfo = data;
3091 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3092 __u64 logical;
3093 __u64 physical;
3094 __u64 length;
3095 __u32 flags = 0;
3096 int error;
3097
3098 logical = (__u64)newex->ec_block << blksize_bits;
3099
3100 if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3101 pgoff_t offset;
3102 struct page *page;
3103 struct buffer_head *bh = NULL;
3104
3105 offset = logical >> PAGE_SHIFT;
3106 page = find_get_page(inode->i_mapping, offset);
3107 if (!page || !page_has_buffers(page))
3108 return EXT_CONTINUE;
3109
3110 bh = page_buffers(page);
3111
3112 if (!bh)
3113 return EXT_CONTINUE;
3114
3115 if (buffer_delay(bh)) {
3116 flags |= FIEMAP_EXTENT_DELALLOC;
3117 page_cache_release(page);
3118 } else {
3119 page_cache_release(page);
3120 return EXT_CONTINUE;
3121 }
3122 }
3123
3124 physical = (__u64)newex->ec_start << blksize_bits;
3125 length = (__u64)newex->ec_len << blksize_bits;
3126
3127 if (ex && ext4_ext_is_uninitialized(ex))
3128 flags |= FIEMAP_EXTENT_UNWRITTEN;
3129
3130 /*
3131 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3132 *
3133 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3134 * this also indicates no more allocated blocks.
3135 *
3136 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3137 */
3138 if (logical + length - 1 == EXT_MAX_BLOCK ||
3139 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3140 flags |= FIEMAP_EXTENT_LAST;
3141
3142 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3143 length, flags);
3144 if (error < 0)
3145 return error;
3146 if (error == 1)
3147 return EXT_BREAK;
3148
3149 return EXT_CONTINUE;
3150}
3151
3152/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
3156{
3157 __u64 physical = 0;
3158 __u64 length;
3159 __u32 flags = FIEMAP_EXTENT_LAST;
3160 int blockbits = inode->i_sb->s_blocksize_bits;
3161 int error = 0;
3162
3163 /* in-inode? */
3164 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3165 struct ext4_iloc iloc;
3166 int offset; /* offset of xattr in inode */
3167
3168 error = ext4_get_inode_loc(inode, &iloc);
3169 if (error)
3170 return error;
3171 physical = iloc.bh->b_blocknr << blockbits;
3172 offset = EXT4_GOOD_OLD_INODE_SIZE +
3173 EXT4_I(inode)->i_extra_isize;
3174 physical += offset;
3175 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3176 flags |= FIEMAP_EXTENT_DATA_INLINE;
3177 } else { /* external block */
3178 physical = EXT4_I(inode)->i_file_acl << blockbits;
3179 length = inode->i_sb->s_blocksize;
3180 }
3181
3182 if (physical)
3183 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3184 length, flags);
3185 return (error < 0 ? error : 0);
3186}
3187
3188int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3189 __u64 start, __u64 len)
3190{
3191 ext4_lblk_t start_blk;
3192 ext4_lblk_t len_blks;
3193 int error = 0;
3194
3195 /* fallback to generic here if not in extents fmt */
3196 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3197 return generic_block_fiemap(inode, fieinfo, start, len,
3198 ext4_get_block);
3199
3200 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3201 return -EBADR;
3202
3203 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3204 error = ext4_xattr_fiemap(inode, fieinfo);
3205 } else {
3206 start_blk = start >> inode->i_sb->s_blocksize_bits;
3207 len_blks = len >> inode->i_sb->s_blocksize_bits;
3208
3209 /*
3210 * Walk the extent tree gathering extent information.
3211 * ext4_ext_fiemap_cb will push extents back to user.
3212 */
3213 down_write(&EXT4_I(inode)->i_data_sem);
3214 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3215 ext4_ext_fiemap_cb, fieinfo);
3216 up_write(&EXT4_I(inode)->i_data_sem);
3217 }
3218
3219 return error;
3220}
3221
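The new entry point is wired into ext4_file_operations below (fs/ext4/file.c), so userspace reaches ext4_fiemap() through the FS_IOC_FIEMAP ioctl. A minimal caller, error handling trimmed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
            struct fiemap *fm;
            unsigned int i;
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;

            /* header plus room for 32 extents */
            fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
            fm->fm_length = ~0ULL;                  /* whole file */
            fm->fm_flags = FIEMAP_FLAG_SYNC;        /* one of EXT4_FIEMAP_FLAGS */
            fm->fm_extent_count = 32;

            if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                    perror("FS_IOC_FIEMAP");
                    return 1;
            }
            for (i = 0; i < fm->fm_mapped_extents; i++)
                    printf("logical %llu physical %llu length %llu flags %#x\n",
                           (unsigned long long)fm->fm_extents[i].fe_logical,
                           (unsigned long long)fm->fm_extents[i].fe_physical,
                           (unsigned long long)fm->fm_extents[i].fe_length,
                           fm->fm_extents[i].fe_flags);
            return 0;
    }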
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db4..6bd11fba71f7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
31 * from ext4_file_open: open gets called at every open, but release 31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed. 32 * gets called only when /all/ the files are closed.
33 */ 33 */
34static int ext4_release_file (struct inode * inode, struct file * filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 /* if we are the last writer on the inode, drop the block reservation */ 36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 down_write(&EXT4_I(inode)->i_data_sem); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_preallocations(inode);
42 up_write(&EXT4_I(inode)->i_data_sem); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
143const struct file_operations ext4_file_operations = { 146const struct file_operations ext4_file_operations = {
144 .llseek = generic_file_llseek, 147 .llseek = generic_file_llseek,
145 .read = do_sync_read, 148 .read = do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
162 .truncate = ext4_truncate, 165 .truncate = ext4_truncate,
163 .setattr = ext4_setattr, 166 .setattr = ext4_setattr,
164 .getattr = ext4_getattr, 167 .getattr = ext4_getattr,
165#ifdef CONFIG_EXT4DEV_FS_XATTR 168#ifdef CONFIG_EXT4_FS_XATTR
166 .setxattr = generic_setxattr, 169 .setxattr = generic_setxattr,
167 .getxattr = generic_getxattr, 170 .getxattr = generic_getxattr,
168 .listxattr = ext4_listxattr, 171 .listxattr = ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
170#endif 173#endif
171 .permission = ext4_permission, 174 .permission = ext4_permission,
172 .fallocate = ext4_fallocate, 175 .fallocate = ext4_fallocate,
176 .fiemap = ext4_fiemap,
173}; 177};
174 178
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad31..5afe4370840b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33 34
@@ -43,7 +44,7 @@
43 * inode to disk. 44 * inode to disk.
44 */ 45 */
45 46
46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 47int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
47{ 48{
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
51 52
52 J_ASSERT(ext4_journal_current_handle() == NULL); 53 J_ASSERT(ext4_journal_current_handle() == NULL);
53 54
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58
54 /* 59 /*
55 * data=writeback: 60 * data=writeback:
56 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe390..556ca8eba3db 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
27 sum += DELTA; 27 sum += DELTA;
28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
30 } while(--n); 30 } while (--n);
31 31
32 buf[0] += b0; 32 buf[0] += b0;
33 buf[1] += b1; 33 buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 while (len--) { 41 while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
59 val = pad; 59 val = pad;
60 if (len > num*4) 60 if (len > num*4)
61 len = num * 4; 61 len = num * 4;
62 for (i=0; i < len; i++) { 62 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 63 if ((i % 4) == 0)
64 val = pad; 64 val = pad;
65 val = msg[i] + (val << 8); 65 val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
104 104
105 /* Check to see if the seed is all zero's */ 105 /* Check to see if the seed is all zero's */
106 if (hinfo->seed) { 106 if (hinfo->seed) {
107 for (i=0; i < 4; i++) { 107 for (i = 0; i < 4; i++) {
108 if (hinfo->seed[i]) 108 if (hinfo->seed[i])
109 break; 109 break;
110 } 110 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 655e760212b8..fe34d74cfb19 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (bh_uptodate_or_lock(bh)) 118 if (buffer_uptodate(bh) &&
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
119 return bh; 120 return bh;
120 121
122 lock_buffer(bh);
121 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
122 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
123 ext4_init_inode_bitmap(sb, bh, block_group, desc); 125 ext4_init_inode_bitmap(sb, bh, block_group, desc);
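The open-coded fast path replaces bh_uptodate_or_lock(): an uptodate buffer is trusted only once the group's EXT4_BG_INODE_UNINIT flag is clear, since an uptodate-but-uninitialized bitmap must still be built under the buffer lock. The same double-checked pattern as a self-contained userspace sketch, with a pthread mutex standing in for lock_buffer() and all names illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_bool uptodate;                /* buffer_uptodate(bh) stand-in */
    static atomic_bool group_uninit = true;     /* EXT4_BG_INODE_UNINIT stand-in */

    static void read_inode_bitmap(void)
    {
            /* fast path: trust the buffer only once the group is initialized */
            if (atomic_load(&uptodate) && !atomic_load(&group_uninit))
                    return;

            pthread_mutex_lock(&buf_lock);      /* lock_buffer(bh) analogue */
            if (atomic_load(&group_uninit)) {
                    /* ext4_init_inode_bitmap() would run here, exactly once */
                    atomic_store(&group_uninit, false);
            }
            atomic_store(&uptodate, true);
            pthread_mutex_unlock(&buf_lock);
    }

    int main(void)
    {
            read_inode_bitmap();    /* first call initializes under the lock */
            read_inode_bitmap();    /* second call takes the lock-free fast path */
            return 0;
    }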
@@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 * though), and then we'd have two inodes sharing the 156 * though), and then we'd have two inodes sharing the
155 * same inode number and space on the harddisk. 157 * same inode number and space on the harddisk.
156 */ 158 */
157void ext4_free_inode (handle_t *handle, struct inode * inode) 159void ext4_free_inode(handle_t *handle, struct inode *inode)
158{ 160{
159 struct super_block * sb = inode->i_sb; 161 struct super_block *sb = inode->i_sb;
160 int is_directory; 162 int is_directory;
161 unsigned long ino; 163 unsigned long ino;
162 struct buffer_head *bitmap_bh = NULL; 164 struct buffer_head *bitmap_bh = NULL;
163 struct buffer_head *bh2; 165 struct buffer_head *bh2;
164 ext4_group_t block_group; 166 ext4_group_t block_group;
165 unsigned long bit; 167 unsigned long bit;
166 struct ext4_group_desc * gdp; 168 struct ext4_group_desc *gdp;
167 struct ext4_super_block * es; 169 struct ext4_super_block *es;
168 struct ext4_sb_info *sbi; 170 struct ext4_sb_info *sbi;
169 int fatal = 0, err; 171 int fatal = 0, err;
170 ext4_group_t flex_group; 172 ext4_group_t flex_group;
171 173
172 if (atomic_read(&inode->i_count) > 1) { 174 if (atomic_read(&inode->i_count) > 1) {
173 printk ("ext4_free_inode: inode has count=%d\n", 175 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
174 atomic_read(&inode->i_count)); 176 atomic_read(&inode->i_count));
175 return; 177 return;
176 } 178 }
177 if (inode->i_nlink) { 179 if (inode->i_nlink) {
178 printk ("ext4_free_inode: inode has nlink=%d\n", 180 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
179 inode->i_nlink); 181 inode->i_nlink);
180 return; 182 return;
181 } 183 }
182 if (!sb) { 184 if (!sb) {
183 printk("ext4_free_inode: inode on nonexistent device\n"); 185 printk(KERN_ERR "ext4_free_inode: inode on "
186 "nonexistent device\n");
184 return; 187 return;
185 } 188 }
186 sbi = EXT4_SB(sb); 189 sbi = EXT4_SB(sb);
187 190
188 ino = inode->i_ino; 191 ino = inode->i_ino;
189 ext4_debug ("freeing inode %lu\n", ino); 192 ext4_debug("freeing inode %lu\n", ino);
190 193
191 /* 194 /*
192 * Note: we must free any quota before locking the superblock, 195 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
200 is_directory = S_ISDIR(inode->i_mode); 203 is_directory = S_ISDIR(inode->i_mode);
201 204
202 /* Do this BEFORE marking the inode not in use or returning an error */ 205 /* Do this BEFORE marking the inode not in use or returning an error */
203 clear_inode (inode); 206 clear_inode(inode);
204 207
205 es = EXT4_SB(sb)->s_es; 208 es = EXT4_SB(sb)->s_es;
206 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 209 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
207 ext4_error (sb, "ext4_free_inode", 210 ext4_error(sb, "ext4_free_inode",
208 "reserved or nonexistent inode %lu", ino); 211 "reserved or nonexistent inode %lu", ino);
209 goto error_return; 212 goto error_return;
210 } 213 }
211 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 214 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
222 /* Ok, now we can actually update the inode bitmaps.. */ 225 /* Ok, now we can actually update the inode bitmaps.. */
223 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 226 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
224 bit, bitmap_bh->b_data)) 227 bit, bitmap_bh->b_data))
225 ext4_error (sb, "ext4_free_inode", 228 ext4_error(sb, "ext4_free_inode",
226 "bit already cleared for inode %lu", ino); 229 "bit already cleared for inode %lu", ino);
227 else { 230 else {
228 gdp = ext4_get_group_desc (sb, block_group, &bh2); 231 gdp = ext4_get_group_desc(sb, block_group, &bh2);
229 232
230 BUFFER_TRACE(bh2, "get_write_access"); 233 BUFFER_TRACE(bh2, "get_write_access");
231 fatal = ext4_journal_get_write_access(handle, bh2); 234 fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
287 avefreei = freei / ngroups; 290 avefreei = freei / ngroups;
288 291
289 for (group = 0; group < ngroups; group++) { 292 for (group = 0; group < ngroups; group++) {
290 desc = ext4_get_group_desc (sb, group, NULL); 293 desc = ext4_get_group_desc(sb, group, NULL);
291 if (!desc || !desc->bg_free_inodes_count) 294 if (!desc || !desc->bg_free_inodes_count)
292 continue; 295 continue;
293 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -351,7 +354,7 @@ find_close_to_parent:
351 goto found_flexbg; 354 goto found_flexbg;
352 } 355 }
353 356
354 if (best_flex < 0 || 357 if (flex_group[best_flex].free_inodes == 0 ||
355 (flex_group[i].free_blocks > 358 (flex_group[i].free_blocks >
356 flex_group[best_flex].free_blocks && 359 flex_group[best_flex].free_blocks &&
357 flex_group[i].free_inodes)) 360 flex_group[i].free_inodes))
@@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
576 * For other inodes, search forward from the parent directory's block 579 * For other inodes, search forward from the parent directory's block
577 * group to find a free inode. 580 * group to find a free inode.
578 */ 581 */
579struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) 582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
580{ 583{
581 struct super_block *sb; 584 struct super_block *sb;
582 struct buffer_head *bitmap_bh = NULL; 585 struct buffer_head *bitmap_bh = NULL;
583 struct buffer_head *bh2; 586 struct buffer_head *bh2;
584 ext4_group_t group = 0; 587 ext4_group_t group = 0;
585 unsigned long ino = 0; 588 unsigned long ino = 0;
586 struct inode * inode; 589 struct inode *inode;
587 struct ext4_group_desc * gdp = NULL; 590 struct ext4_group_desc *gdp = NULL;
588 struct ext4_super_block * es; 591 struct ext4_super_block *es;
589 struct ext4_inode_info *ei; 592 struct ext4_inode_info *ei;
590 struct ext4_sb_info *sbi; 593 struct ext4_sb_info *sbi;
591 int ret2, err = 0; 594 int ret2, err = 0;
@@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
613 } 616 }
614 617
615 if (S_ISDIR(mode)) { 618 if (S_ISDIR(mode)) {
616 if (test_opt (sb, OLDALLOC)) 619 if (test_opt(sb, OLDALLOC))
617 ret2 = find_group_dir(sb, dir, &group); 620 ret2 = find_group_dir(sb, dir, &group);
618 else 621 else
619 ret2 = find_group_orlov(sb, dir, &group); 622 ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@ got:
783 } 786 }
784 787
785 inode->i_uid = current->fsuid; 788 inode->i_uid = current->fsuid;
786 if (test_opt (sb, GRPID)) 789 if (test_opt(sb, GRPID))
787 inode->i_gid = dir->i_gid; 790 inode->i_gid = dir->i_gid;
788 else if (dir->i_mode & S_ISGID) { 791 else if (dir->i_mode & S_ISGID) {
789 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@ got:
816 ei->i_flags &= ~EXT4_DIRSYNC_FL; 819 ei->i_flags &= ~EXT4_DIRSYNC_FL;
817 ei->i_file_acl = 0; 820 ei->i_file_acl = 0;
818 ei->i_dtime = 0; 821 ei->i_dtime = 0;
819 ei->i_block_alloc_info = NULL;
820 ei->i_block_group = group; 822 ei->i_block_group = group;
821 823
822 ext4_set_inode_flags(inode); 824 ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@ got:
832 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 834 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
833 835
834 ret = inode; 836 ret = inode;
835 if(DQUOT_ALLOC_INODE(inode)) { 837 if (DQUOT_ALLOC_INODE(inode)) {
836 err = -EDQUOT; 838 err = -EDQUOT;
837 goto fail_drop; 839 goto fail_drop;
838 } 840 }
@@ -841,7 +843,7 @@ got:
841 if (err) 843 if (err)
842 goto fail_free_drop; 844 goto fail_free_drop;
843 845
844 err = ext4_init_security(handle,inode, dir); 846 err = ext4_init_security(handle, inode, dir);
845 if (err) 847 if (err)
846 goto fail_free_drop; 848 goto fail_free_drop;
847 849
@@ -959,7 +961,7 @@ error:
959 return ERR_PTR(err); 961 return ERR_PTR(err);
960} 962}
961 963
962unsigned long ext4_count_free_inodes (struct super_block * sb) 964unsigned long ext4_count_free_inodes(struct super_block *sb)
963{ 965{
964 unsigned long desc_count; 966 unsigned long desc_count;
965 struct ext4_group_desc *gdp; 967 struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
974 bitmap_count = 0; 976 bitmap_count = 0;
975 gdp = NULL; 977 gdp = NULL;
976 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 978 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
977 gdp = ext4_get_group_desc (sb, i, NULL); 979 gdp = ext4_get_group_desc(sb, i, NULL);
978 if (!gdp) 980 if (!gdp)
979 continue; 981 continue;
980 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 982 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
989 bitmap_count += x; 991 bitmap_count += x;
990 } 992 }
991 brelse(bitmap_bh); 993 brelse(bitmap_bh);
992 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", 994 printk(KERN_DEBUG "ext4_count_free_inodes: "
993 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); 995 "stored = %u, computed = %lu, %lu\n",
996 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
994 return desc_count; 997 return desc_count;
995#else 998#else
996 desc_count = 0; 999 desc_count = 0;
997 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1000 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
998 gdp = ext4_get_group_desc (sb, i, NULL); 1001 gdp = ext4_get_group_desc(sb, i, NULL);
999 if (!gdp) 1002 if (!gdp)
1000 continue; 1003 continue;
1001 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1004 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
1006} 1009}
1007 1010
1008/* Called at mount-time, super-block is locked */ 1011/* Called at mount-time, super-block is locked */
1009unsigned long ext4_count_dirs (struct super_block * sb) 1012unsigned long ext4_count_dirs(struct super_block * sb)
1010{ 1013{
1011 unsigned long count = 0; 1014 unsigned long count = 0;
1012 ext4_group_t i; 1015 ext4_group_t i;
1013 1016
1014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1017 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
1015 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 1018 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1016 if (!gdp) 1019 if (!gdp)
1017 continue; 1020 continue;
1018 count += le16_to_cpu(gdp->bg_used_dirs_count); 1021 count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59fbbe899acc..9b4ec9decfd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
41#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h" 42#include "ext4_extents.h"
43 43
44#define MPAGE_DA_EXTENT_TAIL 0x01
45
44static inline int ext4_begin_ordered_truncate(struct inode *inode, 46static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size) 47 loff_t new_size)
46{ 48{
@@ -188,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
188/* 190/*
189 * Called at the last iput() if i_nlink is zero. 191 * Called at the last iput() if i_nlink is zero.
190 */ 192 */
191void ext4_delete_inode (struct inode * inode) 193void ext4_delete_inode(struct inode *inode)
192{ 194{
193 handle_t *handle; 195 handle_t *handle;
194 int err; 196 int err;
@@ -328,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
328 int final = 0; 330 int final = 0;
329 331
330 if (i_block < 0) { 332 if (i_block < 0) {
331 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 333 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
332 } else if (i_block < direct_blocks) { 334 } else if (i_block < direct_blocks) {
333 offsets[n++] = i_block; 335 offsets[n++] = i_block;
334 final = direct_blocks; 336 final = direct_blocks;
335 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 337 } else if ((i_block -= direct_blocks) < indirect_blocks) {
336 offsets[n++] = EXT4_IND_BLOCK; 338 offsets[n++] = EXT4_IND_BLOCK;
337 offsets[n++] = i_block; 339 offsets[n++] = i_block;
338 final = ptrs; 340 final = ptrs;
@@ -398,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
398 400
399 *err = 0; 401 *err = 0;
400 /* i_data is not going away, no lock needed */ 402 /* i_data is not going away, no lock needed */
401 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 403 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
402 if (!p->key) 404 if (!p->key)
403 goto no_block; 405 goto no_block;
404 while (--depth) { 406 while (--depth) {
405 bh = sb_bread(sb, le32_to_cpu(p->key)); 407 bh = sb_bread(sb, le32_to_cpu(p->key));
406 if (!bh) 408 if (!bh)
407 goto failure; 409 goto failure;
408 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 410 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
409 /* Reader: end */ 411 /* Reader: end */
410 if (!p->key) 412 if (!p->key)
411 goto no_block; 413 goto no_block;
@@ -441,7 +443,7 @@ no_block:
441static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
442{ 444{
443 struct ext4_inode_info *ei = EXT4_I(inode); 445 struct ext4_inode_info *ei = EXT4_I(inode);
444 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 446 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
445 __le32 *p; 447 __le32 *p;
446 ext4_fsblk_t bg_start; 448 ext4_fsblk_t bg_start;
447 ext4_fsblk_t last_block; 449 ext4_fsblk_t last_block;
@@ -484,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
484static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
485 Indirect *partial) 487 Indirect *partial)
486{ 488{
487 struct ext4_block_alloc_info *block_i;
488
489 block_i = EXT4_I(inode)->i_block_alloc_info;
490
491 /* 489 /*
492 * try the heuristic for sequential allocation, 490 * XXX need to get goal block from mballoc's data structures
493 * failing that at least try to get decent locality.
494 */ 491 */
495 if (block_i && (block == block_i->last_alloc_logical_block + 1)
496 && (block_i->last_alloc_physical_block != 0)) {
497 return block_i->last_alloc_physical_block + 1;
498 }
499 492
500 return ext4_find_near(inode, partial); 493 return ext4_find_near(inode, partial);
501} 494}
@@ -628,7 +621,7 @@ allocated:
628 *err = 0; 621 *err = 0;
629 return ret; 622 return ret;
630failed_out: 623failed_out:
631 for (i = 0; i <index; i++) 624 for (i = 0; i < index; i++)
632 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 625 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
633 return ret; 626 return ret;
634} 627}
@@ -701,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
701 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 694 branch[n].p = (__le32 *) bh->b_data + offsets[n];
702 branch[n].key = cpu_to_le32(new_blocks[n]); 695 branch[n].key = cpu_to_le32(new_blocks[n]);
703 *branch[n].p = branch[n].key; 696 *branch[n].p = branch[n].key;
704 if ( n == indirect_blks) { 697 if (n == indirect_blks) {
705 current_block = new_blocks[n]; 698 current_block = new_blocks[n];
706 /* 699 /*
707 * End of chain, update the last new metablock of 700 * End of chain, update the last new metablock of
@@ -728,7 +721,7 @@ failed:
728 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 721 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
729 ext4_journal_forget(handle, branch[i].bh); 722 ext4_journal_forget(handle, branch[i].bh);
730 } 723 }
731 for (i = 0; i <indirect_blks; i++) 724 for (i = 0; i < indirect_blks; i++)
732 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 725 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
733 726
734 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 727 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -755,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
755{ 748{
756 int i; 749 int i;
757 int err = 0; 750 int err = 0;
758 struct ext4_block_alloc_info *block_i;
759 ext4_fsblk_t current_block; 751 ext4_fsblk_t current_block;
760 752
761 block_i = EXT4_I(inode)->i_block_alloc_info;
762 /* 753 /*
763 * If we're splicing into a [td]indirect block (as opposed to the 754 * If we're splicing into a [td]indirect block (as opposed to the
764 * inode) then we need to get write access to the [td]indirect block 755 * inode) then we need to get write access to the [td]indirect block
@@ -781,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
781 if (num == 0 && blks > 1) { 772 if (num == 0 && blks > 1) {
782 current_block = le32_to_cpu(where->key) + 1; 773 current_block = le32_to_cpu(where->key) + 1;
783 for (i = 1; i < blks; i++) 774 for (i = 1; i < blks; i++)
784 *(where->p + i ) = cpu_to_le32(current_block++); 775 *(where->p + i) = cpu_to_le32(current_block++);
785 }
786
787 /*
788 * update the most recently allocated logical & physical block
789 * in i_block_alloc_info, to assist find the proper goal block for next
790 * allocation
791 */
792 if (block_i) {
793 block_i->last_alloc_logical_block = block + blks - 1;
794 block_i->last_alloc_physical_block =
795 le32_to_cpu(where[num].key) + blks - 1;
796 } 776 }
797 777
798 /* We are done with atomic stuff, now do the rest of housekeeping */ 778 /* We are done with atomic stuff, now do the rest of housekeeping */
@@ -912,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
912 goto cleanup; 892 goto cleanup;
913 893
914 /* 894 /*
915 * Okay, we need to do block allocation. Lazily initialize the block 895 * Okay, we need to do block allocation.
916 * allocation info here if necessary
917 */ 896 */
918 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
919 ext4_init_block_alloc_info(inode);
920
921 goal = ext4_find_goal(inode, iblock, partial); 897 goal = ext4_find_goal(inode, iblock, partial);
922 898
923 /* the number of blocks need to allocate for [d,t]indirect blocks */ 899 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1005,6 +981,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1005 */ 981 */
1006static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 982static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1007{ 983{
984 if (!blocks)
985 return 0;
986
1008 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 987 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1009 return ext4_ext_calc_metadata_amount(inode, blocks); 988 return ext4_ext_calc_metadata_amount(inode, blocks);
1010 989
@@ -1025,34 +1004,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1025 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1004 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1026 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1005 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1027 1006
1028 /* Account for allocated meta_blocks */ 1007 if (mdb_free) {
1029 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1008 /* Account for allocated meta_blocks */
1009 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1030 1010
1031 /* update fs free blocks counter for truncate case */ 1011 /* update fs dirty blocks counter */
1032 percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); 1012 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1013 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1014 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1015 }
1033 1016
1034 /* update per-inode reservations */ 1017 /* update per-inode reservations */
1035 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1018 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1036 EXT4_I(inode)->i_reserved_data_blocks -= used; 1019 EXT4_I(inode)->i_reserved_data_blocks -= used;
1037 1020
1038 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1039 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1040 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1041 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1021 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1042} 1022}
1043 1023
1044/* Maximum number of blocks we map for direct IO at once. */
1045#define DIO_MAX_BLOCKS 4096
1046/*
1047 * Number of credits we need for writing DIO_MAX_BLOCKS:
1048 * We need sb + group descriptor + bitmap + inode -> 4
1049 * For B blocks with A block pointers per block we need:
1050 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
1051 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
1052 */
1053#define DIO_CREDITS 25
1054
1055
1056/* 1024/*
1057 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1025 * The ext4_get_blocks_wrap() function try to look up the requested blocks,
1058 * and returns if the blocks are already mapped. 1026 * and returns if the blocks are already mapped.
@@ -1164,19 +1132,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1164 return retval; 1132 return retval;
1165} 1133}
1166 1134
1167static int ext4_get_block(struct inode *inode, sector_t iblock, 1135/* Maximum number of blocks we map for direct IO at once. */
1168 struct buffer_head *bh_result, int create) 1136#define DIO_MAX_BLOCKS 4096
1137
1138int ext4_get_block(struct inode *inode, sector_t iblock,
1139 struct buffer_head *bh_result, int create)
1169{ 1140{
1170 handle_t *handle = ext4_journal_current_handle(); 1141 handle_t *handle = ext4_journal_current_handle();
1171 int ret = 0, started = 0; 1142 int ret = 0, started = 0;
1172 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1143 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1144 int dio_credits;
1173 1145
1174 if (create && !handle) { 1146 if (create && !handle) {
1175 /* Direct IO write... */ 1147 /* Direct IO write... */
1176 if (max_blocks > DIO_MAX_BLOCKS) 1148 if (max_blocks > DIO_MAX_BLOCKS)
1177 max_blocks = DIO_MAX_BLOCKS; 1149 max_blocks = DIO_MAX_BLOCKS;
1178 handle = ext4_journal_start(inode, DIO_CREDITS + 1150 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1179 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 1151 handle = ext4_journal_start(inode, dio_credits);
1180 if (IS_ERR(handle)) { 1152 if (IS_ERR(handle)) {
1181 ret = PTR_ERR(handle); 1153 ret = PTR_ERR(handle);
1182 goto out; 1154 goto out;
@@ -1244,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1244 BUFFER_TRACE(bh, "call get_create_access"); 1216 BUFFER_TRACE(bh, "call get_create_access");
1245 fatal = ext4_journal_get_create_access(handle, bh); 1217 fatal = ext4_journal_get_create_access(handle, bh);
1246 if (!fatal && !buffer_uptodate(bh)) { 1218 if (!fatal && !buffer_uptodate(bh)) {
1247 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1219 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1248 set_buffer_uptodate(bh); 1220 set_buffer_uptodate(bh);
1249 } 1221 }
1250 unlock_buffer(bh); 1222 unlock_buffer(bh);
@@ -1269,7 +1241,7 @@ err:
1269struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1241struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1270 ext4_lblk_t block, int create, int *err) 1242 ext4_lblk_t block, int create, int *err)
1271{ 1243{
1272 struct buffer_head * bh; 1244 struct buffer_head *bh;
1273 1245
1274 bh = ext4_getblk(handle, inode, block, create, err); 1246 bh = ext4_getblk(handle, inode, block, create, err);
1275 if (!bh) 1247 if (!bh)
@@ -1285,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1285 return NULL; 1257 return NULL;
1286} 1258}
1287 1259
1288static int walk_page_buffers( handle_t *handle, 1260static int walk_page_buffers(handle_t *handle,
1289 struct buffer_head *head, 1261 struct buffer_head *head,
1290 unsigned from, 1262 unsigned from,
1291 unsigned to, 1263 unsigned to,
1292 int *partial, 1264 int *partial,
1293 int (*fn)( handle_t *handle, 1265 int (*fn)(handle_t *handle,
1294 struct buffer_head *bh)) 1266 struct buffer_head *bh))
1295{ 1267{
1296 struct buffer_head *bh; 1268 struct buffer_head *bh;
1297 unsigned block_start, block_end; 1269 unsigned block_start, block_end;
@@ -1299,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
1299 int err, ret = 0; 1271 int err, ret = 0;
1300 struct buffer_head *next; 1272 struct buffer_head *next;
1301 1273
1302 for ( bh = head, block_start = 0; 1274 for (bh = head, block_start = 0;
1303 ret == 0 && (bh != head || !block_start); 1275 ret == 0 && (bh != head || !block_start);
1304 block_start = block_end, bh = next) 1276 block_start = block_end, bh = next)
1305 { 1277 {
1306 next = bh->b_this_page; 1278 next = bh->b_this_page;
1307 block_end = block_start + blocksize; 1279 block_end = block_start + blocksize;
@@ -1354,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1354 loff_t pos, unsigned len, unsigned flags, 1326 loff_t pos, unsigned len, unsigned flags,
1355 struct page **pagep, void **fsdata) 1327 struct page **pagep, void **fsdata)
1356{ 1328{
1357 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1358 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1330 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1359 handle_t *handle; 1331 handle_t *handle;
1360 int retries = 0; 1332 int retries = 0;
1361 struct page *page; 1333 struct page *page;
1362 pgoff_t index; 1334 pgoff_t index;
1363 unsigned from, to; 1335 unsigned from, to;
1364 1336
1365 index = pos >> PAGE_CACHE_SHIFT; 1337 index = pos >> PAGE_CACHE_SHIFT;
1366 from = pos & (PAGE_CACHE_SIZE - 1); 1338 from = pos & (PAGE_CACHE_SIZE - 1);
1367 to = from + len; 1339 to = from + len;
1368 1340
1369retry: 1341retry:
1370 handle = ext4_journal_start(inode, needed_blocks); 1342 handle = ext4_journal_start(inode, needed_blocks);
1371 if (IS_ERR(handle)) { 1343 if (IS_ERR(handle)) {
1372 ret = PTR_ERR(handle); 1344 ret = PTR_ERR(handle);
1373 goto out; 1345 goto out;
1374 } 1346 }
1375 1347
1376 page = __grab_cache_page(mapping, index); 1348 page = __grab_cache_page(mapping, index);
@@ -1390,9 +1362,16 @@ retry:
1390 } 1362 }
1391 1363
1392 if (ret) { 1364 if (ret) {
1393 unlock_page(page); 1365 unlock_page(page);
1394 ext4_journal_stop(handle); 1366 ext4_journal_stop(handle);
1395 page_cache_release(page); 1367 page_cache_release(page);
1368 /*
1369 * block_write_begin may have instantiated a few blocks
1370 * outside i_size. Trim these off again. Don't need
1371 * i_size_read because we hold i_mutex.
1372 */
1373 if (pos + len > inode->i_size)
1374 vmtruncate(inode, inode->i_size);
1396 } 1375 }
1397 1376
1398 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1429,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file,
1429 ret = ext4_jbd2_file_inode(handle, inode); 1408 ret = ext4_jbd2_file_inode(handle, inode);
1430 1409
1431 if (ret == 0) { 1410 if (ret == 0) {
1432 /*
1433 * generic_write_end() will run mark_inode_dirty() if i_size
1434 * changes. So let's piggyback the i_disksize mark_inode_dirty
1435 * into that.
1436 */
1437 loff_t new_i_size; 1411 loff_t new_i_size;
1438 1412
1439 new_i_size = pos + copied; 1413 new_i_size = pos + copied;
1440 if (new_i_size > EXT4_I(inode)->i_disksize) 1414 if (new_i_size > EXT4_I(inode)->i_disksize) {
1441 EXT4_I(inode)->i_disksize = new_i_size; 1415 ext4_update_i_disksize(inode, new_i_size);
1416 /* We need to mark inode dirty even if
1417 * new_i_size is less that inode->i_size
1418 * bu greater than i_disksize.(hint delalloc)
1419 */
1420 ext4_mark_inode_dirty(handle, inode);
1421 }
1422
1442 ret2 = generic_write_end(file, mapping, pos, len, copied, 1423 ret2 = generic_write_end(file, mapping, pos, len, copied,
1443 page, fsdata); 1424 page, fsdata);
1444 copied = ret2; 1425 copied = ret2;
@@ -1463,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
1463 loff_t new_i_size; 1444 loff_t new_i_size;
1464 1445
1465 new_i_size = pos + copied; 1446 new_i_size = pos + copied;
1466 if (new_i_size > EXT4_I(inode)->i_disksize) 1447 if (new_i_size > EXT4_I(inode)->i_disksize) {
1467 EXT4_I(inode)->i_disksize = new_i_size; 1448 ext4_update_i_disksize(inode, new_i_size);
1449 /* We need to mark inode dirty even if
1450 * new_i_size is less than inode->i_size
1451 * but greater than i_disksize (hint: delalloc)
1452 */
1453 ext4_mark_inode_dirty(handle, inode);
1454 }
1468 1455
1469 ret2 = generic_write_end(file, mapping, pos, len, copied, 1456 ret2 = generic_write_end(file, mapping, pos, len, copied,
1470 page, fsdata); 1457 page, fsdata);
@@ -1489,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
1489 int ret = 0, ret2; 1476 int ret = 0, ret2;
1490 int partial = 0; 1477 int partial = 0;
1491 unsigned from, to; 1478 unsigned from, to;
1479 loff_t new_i_size;
1492 1480
1493 from = pos & (PAGE_CACHE_SIZE - 1); 1481 from = pos & (PAGE_CACHE_SIZE - 1);
1494 to = from + len; 1482 to = from + len;
@@ -1503,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
1503 to, &partial, write_end_fn); 1491 to, &partial, write_end_fn);
1504 if (!partial) 1492 if (!partial)
1505 SetPageUptodate(page); 1493 SetPageUptodate(page);
1506 if (pos+copied > inode->i_size) 1494 new_i_size = pos + copied;
1495 if (new_i_size > inode->i_size)
1507 i_size_write(inode, pos+copied); 1496 i_size_write(inode, pos+copied);
1508 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1497 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1509 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1498 if (new_i_size > EXT4_I(inode)->i_disksize) {
1510 EXT4_I(inode)->i_disksize = inode->i_size; 1499 ext4_update_i_disksize(inode, new_i_size);
1511 ret2 = ext4_mark_inode_dirty(handle, inode); 1500 ret2 = ext4_mark_inode_dirty(handle, inode);
1512 if (!ret) 1501 if (!ret)
1513 ret = ret2; 1502 ret = ret2;
@@ -1524,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file,
1524 1513
1525static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1514static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1526{ 1515{
1516 int retries = 0;
1527 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1528 unsigned long md_needed, mdblocks, total = 0; 1518 unsigned long md_needed, mdblocks, total = 0;
1529 1519
@@ -1532,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1532 * in order to allocate nrblocks 1522 * in order to allocate nrblocks
1533 * worse case is one extent per block 1523 * worse case is one extent per block
1534 */ 1524 */
1525repeat:
1535 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1526 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1536 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1527 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1537 mdblocks = ext4_calc_metadata_amount(inode, total); 1528 mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1540,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1540 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1531 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1541 total = md_needed + nrblocks; 1532 total = md_needed + nrblocks;
1542 1533
1543 if (ext4_has_free_blocks(sbi, total) < total) { 1534 if (ext4_claim_free_blocks(sbi, total)) {
1544 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1535 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1536 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1537 yield();
1538 goto repeat;
1539 }
1545 return -ENOSPC; 1540 return -ENOSPC;
1546 } 1541 }
1547 /* reduce fs free blocks counter */
1548 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1549
1550 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1542 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1551 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1543 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1552 1544
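ext4_claim_free_blocks() folds the availability check and the subtraction into one atomic claim, so a failure means real exhaustion rather than a stale read of s_freeblocks_counter, and the caller now backs off and retries in case a journal commit frees blocks. A self-contained sketch of that claim-then-retry shape (C11 atomics stand in for the percpu counter; the retry bound is illustrative):

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_long free_blocks = 1000;  /* s_freeblocks_counter stand-in */

    /* atomically claim 'n' blocks or fail; no check-then-subtract race */
    static bool claim_free_blocks(long n)
    {
            long cur = atomic_load(&free_blocks);

            while (cur >= n)
                    if (atomic_compare_exchange_weak(&free_blocks, &cur, cur - n))
                            return true;
            return false;
    }

    static int da_reserve_space(long nrblocks, long md_needed)
    {
            int retries = 0;

            while (!claim_free_blocks(nrblocks + md_needed)) {
                    if (retries++ >= 3)     /* illustrative bound */
                            return -1;      /* -ENOSPC */
                    sched_yield();          /* let writeback/commit free blocks */
            }
            return 0;
    }

    int main(void)
    {
            printf("reserve 10+2 -> %d, free now %ld\n",
                   da_reserve_space(10, 2), atomic_load(&free_blocks));
            return 0;
    }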
@@ -1559,7 +1551,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1559 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1551 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1560 int total, mdb, mdb_free, release; 1552 int total, mdb, mdb_free, release;
1561 1553
1554 if (!to_free)
1555 return; /* Nothing to release, exit */
1556
1562 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1557 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1558
1559 if (!EXT4_I(inode)->i_reserved_data_blocks) {
1560 /*
1561 * If there are no reserved blocks but we try to free
1562 * some, then the counter is messed up somewhere.
1563 * Since this function is called from invalidatepage,
1564 * it's harmless to return without any action.
1565 */
1566 printk(KERN_INFO "ext4 delalloc trying to release %d reserved "
1567 "blocks for inode %lu, but there are no reserved "
1568 "data blocks\n", to_free, inode->i_ino);
1569 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1570 return;
1571 }
1572
1563 /* recalculate the number of metadata blocks that still need to be reserved */ 1573 /* recalculate the number of metadata blocks that still need to be reserved */
1564 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1574 total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
1565 mdb = ext4_calc_metadata_amount(inode, total); 1575 mdb = ext4_calc_metadata_amount(inode, total);
@@ -1570,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1570 1580
1571 release = to_free + mdb_free; 1581 release = to_free + mdb_free;
1572 1582
1573 /* update fs free blocks counter for truncate case */ 1583 /* update fs dirty blocks counter for truncate case */
1574 percpu_counter_add(&sbi->s_freeblocks_counter, release); 1584 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1575 1585
1576 /* update per-inode reservations */ 1586 /* update per-inode reservations */
1577 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1587 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1613,11 +1623,14 @@ struct mpage_da_data {
1613 unsigned long first_page, next_page; /* extent of pages */ 1623 unsigned long first_page, next_page; /* extent of pages */
1614 get_block_t *get_block; 1624 get_block_t *get_block;
1615 struct writeback_control *wbc; 1625 struct writeback_control *wbc;
1626 int io_done;
1627 long pages_written;
1628 int retval;
1616}; 1629};
1617 1630
1618/* 1631/*
1619 * mpage_da_submit_io - walks through extent of pages and try to write 1632 * mpage_da_submit_io - walks through extent of pages and try to write
1620 * them with __mpage_writepage() 1633 * them with writepage() call back
1621 * 1634 *
1622 * @mpd->inode: inode 1635 * @mpd->inode: inode
1623 * @mpd->first_page: first page of the extent 1636 * @mpd->first_page: first page of the extent
@@ -1632,18 +1645,11 @@ struct mpage_da_data {
1632static int mpage_da_submit_io(struct mpage_da_data *mpd) 1645static int mpage_da_submit_io(struct mpage_da_data *mpd)
1633{ 1646{
1634 struct address_space *mapping = mpd->inode->i_mapping; 1647 struct address_space *mapping = mpd->inode->i_mapping;
1635 struct mpage_data mpd_pp = {
1636 .bio = NULL,
1637 .last_block_in_bio = 0,
1638 .get_block = mpd->get_block,
1639 .use_writepage = 1,
1640 };
1641 int ret = 0, err, nr_pages, i; 1648 int ret = 0, err, nr_pages, i;
1642 unsigned long index, end; 1649 unsigned long index, end;
1643 struct pagevec pvec; 1650 struct pagevec pvec;
1644 1651
1645 BUG_ON(mpd->next_page <= mpd->first_page); 1652 BUG_ON(mpd->next_page <= mpd->first_page);
1646
1647 pagevec_init(&pvec, 0); 1653 pagevec_init(&pvec, 0);
1648 index = mpd->first_page; 1654 index = mpd->first_page;
1649 end = mpd->next_page - 1; 1655 end = mpd->next_page - 1;
@@ -1661,8 +1667,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1661 break; 1667 break;
1662 index++; 1668 index++;
1663 1669
1664 err = __mpage_writepage(page, mpd->wbc, &mpd_pp); 1670 err = mapping->a_ops->writepage(page, mpd->wbc);
1665 1671 if (!err)
1672 mpd->pages_written++;
1666 /* 1673 /*
1667 * In error case, we have to continue because 1674 * In error case, we have to continue because
1668 * remaining pages are still locked 1675 * remaining pages are still locked
@@ -1673,9 +1680,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1673 } 1680 }
1674 pagevec_release(&pvec); 1681 pagevec_release(&pvec);
1675 } 1682 }
1676 if (mpd_pp.bio)
1677 mpage_bio_submit(WRITE, mpd_pp.bio);
1678
1679 return ret; 1683 return ret;
1680} 1684}
1681 1685
@@ -1698,7 +1702,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1698 int blocks = exbh->b_size >> inode->i_blkbits; 1702 int blocks = exbh->b_size >> inode->i_blkbits;
1699 sector_t pblock = exbh->b_blocknr, cur_logical; 1703 sector_t pblock = exbh->b_blocknr, cur_logical;
1700 struct buffer_head *head, *bh; 1704 struct buffer_head *head, *bh;
1701 unsigned long index, end; 1705 pgoff_t index, end;
1702 struct pagevec pvec; 1706 struct pagevec pvec;
1703 int nr_pages, i; 1707 int nr_pages, i;
1704 1708
@@ -1741,6 +1745,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1741 if (buffer_delay(bh)) { 1745 if (buffer_delay(bh)) {
1742 bh->b_blocknr = pblock; 1746 bh->b_blocknr = pblock;
1743 clear_buffer_delay(bh); 1747 clear_buffer_delay(bh);
1748 bh->b_bdev = inode->i_sb->s_bdev;
1749 } else if (buffer_unwritten(bh)) {
1750 bh->b_blocknr = pblock;
1751 clear_buffer_unwritten(bh);
1752 set_buffer_mapped(bh);
1753 set_buffer_new(bh);
1754 bh->b_bdev = inode->i_sb->s_bdev;
1744 } else if (buffer_mapped(bh)) 1755 } else if (buffer_mapped(bh))
1745 BUG_ON(bh->b_blocknr != pblock); 1756 BUG_ON(bh->b_blocknr != pblock);
1746 1757
@@ -1768,6 +1779,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1768 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1779 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1769} 1780}
1770 1781
1782static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1783 sector_t logical, long blk_cnt)
1784{
1785 int nr_pages, i;
1786 pgoff_t index, end;
1787 struct pagevec pvec;
1788 struct inode *inode = mpd->inode;
1789 struct address_space *mapping = inode->i_mapping;
1790
1791 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1792 end = (logical + blk_cnt - 1) >>
1793 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 while (index <= end) {
1795 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1796 if (nr_pages == 0)
1797 break;
1798 for (i = 0; i < nr_pages; i++) {
1799 struct page *page = pvec.pages[i];
1800 index = page->index;
1801 if (index > end)
1802 break;
1803 index++;
1804
1805 BUG_ON(!PageLocked(page));
1806 BUG_ON(PageWriteback(page));
1807 block_invalidatepage(page, 0);
1808 ClearPageUptodate(page);
1809 unlock_page(page);
1810 }
1811 }
1812 return;
1813}
1814
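ext4_da_block_invalidatepages() maps a block range onto a page-index range with one shift, since PAGE_CACHE_SHIFT - i_blkbits is log2 of the blocks per page. A standalone sketch of that conversion, assuming 4 KiB pages and 1 KiB blocks for illustration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12     /* 4 KiB pages */

/* page-index range covered by blk_cnt blocks starting at 'logical' */
static void blocks_to_page_range(unsigned long long logical, long blk_cnt,
                                 int blkbits,
                                 unsigned long *index, unsigned long *end)
{
        int blocks_per_page_shift = PAGE_CACHE_SHIFT - blkbits;

        *index = logical >> blocks_per_page_shift;
        *end   = (logical + blk_cnt - 1) >> blocks_per_page_shift;
}

int main(void)
{
        unsigned long index, end;

        /* 1 KiB blocks: 4 blocks per page, so blocks 5..12 span pages 1..3 */
        blocks_to_page_range(5, 8, 10, &index, &end);
        printf("pages %lu..%lu\n", index, end);
        return 0;
}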
1815static void ext4_print_free_blocks(struct inode *inode)
1816{
1817 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1818 printk(KERN_EMERG "Total free blocks count %lld\n",
1819 ext4_count_free_blocks(inode->i_sb));
1820 printk(KERN_EMERG "Free/Dirty block details\n");
1821 printk(KERN_EMERG "free_blocks=%lld\n",
1822 percpu_counter_sum(&sbi->s_freeblocks_counter));
1823 printk(KERN_EMERG "dirty_blocks=%lld\n",
1824 percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1825 printk(KERN_EMERG "Block reservation details\n");
1826 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
1827 EXT4_I(inode)->i_reserved_data_blocks);
1828 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
1829 EXT4_I(inode)->i_reserved_meta_blocks);
1830 return;
1831}
1832
1771/* 1833/*
1772 * mpage_da_map_blocks - go through given space 1834 * mpage_da_map_blocks - go through given space
1773 * 1835 *
@@ -1776,54 +1838,87 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1776 * 1838 *
1777 * The function skips space we know is already mapped to disk blocks. 1839 * The function skips space we know is already mapped to disk blocks.
1778 * 1840 *
1779 * The function ignores errors ->get_block() returns, thus real
1780 * error handling is postponed to __mpage_writepage()
1781 */ 1841 */
1782static void mpage_da_map_blocks(struct mpage_da_data *mpd) 1842static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1783{ 1843{
1784 struct buffer_head *lbh = &mpd->lbh; 1844 int err = 0;
1785 int err = 0, remain = lbh->b_size;
1786 sector_t next = lbh->b_blocknr;
1787 struct buffer_head new; 1845 struct buffer_head new;
1846 struct buffer_head *lbh = &mpd->lbh;
1847 sector_t next;
1788 1848
1789 /* 1849 /*
1790 * We consider only non-mapped and non-allocated blocks 1850 * We consider only non-mapped and non-allocated blocks
1791 */ 1851 */
1792 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1852 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1793 return; 1853 return 0;
1854 new.b_state = lbh->b_state;
1855 new.b_blocknr = 0;
1856 new.b_size = lbh->b_size;
1857 next = lbh->b_blocknr;
1858 /*
1859 * If we didn't accumulate anything
1860 * to write, simply return
1861 */
1862 if (!new.b_size)
1863 return 0;
1864 err = mpd->get_block(mpd->inode, next, &new, 1);
1865 if (err) {
1794 1866
1795 while (remain) { 1867 /* If get_block returns an error,
1796 new.b_state = lbh->b_state; 1868 * we simply return. Later writepage
1797 new.b_blocknr = 0; 1869 * will redirty the page and writepages
1798 new.b_size = remain; 1870 * will find the dirty page again
1799 err = mpd->get_block(mpd->inode, next, &new, 1); 1871 */
1800 if (err) { 1872 if (err == -EAGAIN)
1801 /* 1873 return 0;
1802 * Rather than implement own error handling
1803 * here, we just leave remaining blocks
1804 * unallocated and try again with ->writepage()
1805 */
1806 break;
1807 }
1808 BUG_ON(new.b_size == 0);
1809 1874
1810 if (buffer_new(&new)) 1875 if (err == -ENOSPC &&
1811 __unmap_underlying_blocks(mpd->inode, &new); 1876 ext4_count_free_blocks(mpd->inode->i_sb)) {
1877 mpd->retval = err;
1878 return 0;
1879 }
1812 1880
1813 /* 1881 /*
1814 * If blocks are delayed marked, we need to 1882 * A get_block failure will cause us
1815 * put actual blocknr and drop delayed bit 1883 * to loop in writepages, because
1884 * a_ops->writepage won't be able to
1885 * make progress. The page will be redirtied
1886 * by writepage, and writepages will again
1887 * try to write the same page.
1816 */ 1888 */
1817 if (buffer_delay(lbh)) 1889 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1818 mpage_put_bnr_to_bhs(mpd, next, &new); 1890 "at logical offset %llu with max blocks "
1819 1891 "%zd with error %d\n",
1820 /* go for the remaining blocks */ 1892 __func__, mpd->inode->i_ino,
1821 next += new.b_size >> mpd->inode->i_blkbits; 1893 (unsigned long long)next,
1822 remain -= new.b_size; 1894 lbh->b_size >> mpd->inode->i_blkbits, err);
1895 printk(KERN_EMERG "This should not happen!! "
1896 "Data will be lost\n");
1897 if (err == -ENOSPC) {
1898 ext4_print_free_blocks(mpd->inode);
1899 }
1900 /* invalidate all the pages */
1901 ext4_da_block_invalidatepages(mpd, next,
1902 lbh->b_size >> mpd->inode->i_blkbits);
1903 return err;
1823 } 1904 }
1905 BUG_ON(new.b_size == 0);
1906
1907 if (buffer_new(&new))
1908 __unmap_underlying_blocks(mpd->inode, &new);
1909
1910 /*
1911 * If blocks are delayed marked, we need to
1912 * put actual blocknr and drop delayed bit
1913 */
1914 if (buffer_delay(lbh) || buffer_unwritten(lbh))
1915 mpage_put_bnr_to_bhs(mpd, next, &new);
1916
1917 return 0;
1824} 1918}
1825 1919
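The rewritten mpage_da_map_blocks() now sorts get_block failures into three outcomes: -EAGAIN returns 0 so writepage can simply redirty, -ENOSPC while the filesystem still reports free blocks is stashed in mpd->retval so the caller can force a journal commit, and anything else invalidates the accumulated pages. A small sketch of that triage; classify() and the enum are hypothetical helpers, not kernel code:

#include <errno.h>
#include <stdio.h>

/* outcome of the allocation attempt, mirroring the three paths above */
enum map_action { MAP_RETRY_LATER, MAP_FORCE_COMMIT, MAP_INVALIDATE };

static enum map_action classify(int err, long long free_blocks, int *retval)
{
        if (err == -EAGAIN)
                return MAP_RETRY_LATER;        /* writepage will redirty */
        if (err == -ENOSPC && free_blocks) {
                *retval = err;                 /* journal commit may help */
                return MAP_FORCE_COMMIT;
        }
        return MAP_INVALIDATE;                 /* hard failure: drop pages */
}

int main(void)
{
        int retval = 0;
        printf("%d %d %d\n",
               classify(-EAGAIN, 100, &retval),
               classify(-ENOSPC, 100, &retval),
               classify(-ENOSPC, 0, &retval));
        return 0;
}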
1826#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) 1920#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1921 (1 << BH_Delay) | (1 << BH_Unwritten))
1827 1922
1828/* 1923/*
1829 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1924 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1837,41 +1932,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1837static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1932static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1838 sector_t logical, struct buffer_head *bh) 1933 sector_t logical, struct buffer_head *bh)
1839{ 1934{
1840 struct buffer_head *lbh = &mpd->lbh;
1841 sector_t next; 1935 sector_t next;
1936 size_t b_size = bh->b_size;
1937 struct buffer_head *lbh = &mpd->lbh;
1938 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
1842 1939
1843 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); 1940 /* check if the reserved journal credits might overflow */
1844 1941 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
1942 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1943 /*
1944 * With non-extent format we are limited by the journal
1945 * credits available. The total credits needed to insert
1946 * nrblocks contiguous blocks depend on
1947 * nrblocks. So limit nrblocks.
1948 */
1949 goto flush_it;
1950 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1951 EXT4_MAX_TRANS_DATA) {
1952 /*
1953 * Adding the new buffer_head would make it cross the
1954 * allowed limit for which we have journal credit
1955 * reserved. So limit the new bh->b_size
1956 */
1957 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1958 mpd->inode->i_blkbits;
1959 /* we will do mpage_da_submit_io in the next loop */
1960 }
1961 }
1845 /* 1962 /*
1846 * First block in the extent 1963 * First block in the extent
1847 */ 1964 */
1848 if (lbh->b_size == 0) { 1965 if (lbh->b_size == 0) {
1849 lbh->b_blocknr = logical; 1966 lbh->b_blocknr = logical;
1850 lbh->b_size = bh->b_size; 1967 lbh->b_size = b_size;
1851 lbh->b_state = bh->b_state & BH_FLAGS; 1968 lbh->b_state = bh->b_state & BH_FLAGS;
1852 return; 1969 return;
1853 } 1970 }
1854 1971
1972 next = lbh->b_blocknr + nrblocks;
1855 /* 1973 /*
1856 * Can we merge the block to our big extent? 1974 * Can we merge the block to our big extent?
1857 */ 1975 */
1858 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1976 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1859 lbh->b_size += bh->b_size; 1977 lbh->b_size += b_size;
1860 return; 1978 return;
1861 } 1979 }
1862 1980
1981flush_it:
1863 /* 1982 /*
1864 * We couldn't merge the block to our extent, so we 1983 * We couldn't merge the block to our extent, so we
1865 * need to flush current extent and start new one 1984 * need to flush current extent and start new one
1866 */ 1985 */
1867 mpage_da_map_blocks(mpd); 1986 if (mpage_da_map_blocks(mpd) == 0)
1868 1987 mpage_da_submit_io(mpd);
1869 /* 1988 mpd->io_done = 1;
1870 * Now start a new extent 1989 return;
1871 */
1872 lbh->b_size = bh->b_size;
1873 lbh->b_state = bh->b_state & BH_FLAGS;
1874 lbh->b_blocknr = logical;
1875} 1990}
1876 1991
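For non-extent inodes, mpage_add_bh_to_extent() bounds the extent so the journal credits reserved up front cannot overflow: an extent already at EXT4_MAX_TRANS_DATA is flushed, and an almost-full one accepts only enough blocks to reach the limit. A user-space sketch of the clamping arithmetic, with an illustrative MAX_TRANS_DATA value:

#include <stdio.h>
#include <stddef.h>

#define MAX_TRANS_DATA 64       /* stand-in for EXT4_MAX_TRANS_DATA */

/*
 * Decide how many of 'add' blocks may join an extent that already
 * holds 'nrblocks' blocks without overflowing the reserved credits.
 * Returns the blocks to add; 0 means "flush the current extent first".
 */
static size_t clamp_extent(size_t nrblocks, size_t add)
{
        if (nrblocks >= MAX_TRANS_DATA)
                return 0;                        /* flush_it path */
        if (nrblocks + add > MAX_TRANS_DATA)
                add = MAX_TRANS_DATA - nrblocks; /* partial add */
        return add;
}

int main(void)
{
        printf("%zu %zu %zu\n",
               clamp_extent(10, 16),    /* fits: 16 */
               clamp_extent(60, 16),    /* clamped: 4 */
               clamp_extent(70, 16));   /* flush first: 0 */
        return 0;
}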
1877/* 1992/*
@@ -1891,17 +2006,35 @@ static int __mpage_da_writepage(struct page *page,
1891 struct buffer_head *bh, *head, fake; 2006 struct buffer_head *bh, *head, fake;
1892 sector_t logical; 2007 sector_t logical;
1893 2008
2009 if (mpd->io_done) {
2010 /*
2011 * Rest of the page in the page_vec
2012 * redirty then and skip then. We will
2013 * try to to write them again after
2014 * starting a new transaction
2015 */
2016 redirty_page_for_writepage(wbc, page);
2017 unlock_page(page);
2018 return MPAGE_DA_EXTENT_TAIL;
2019 }
1894 /* 2020 /*
1895 * Can we merge this page to current extent? 2021 * Can we merge this page to current extent?
1896 */ 2022 */
1897 if (mpd->next_page != page->index) { 2023 if (mpd->next_page != page->index) {
1898 /* 2024 /*
1899 * Nope, we can't. So, we map non-allocated blocks 2025 * Nope, we can't. So, we map non-allocated blocks
1900 * and start IO on them using __mpage_writepage() 2026 * and start IO on them using writepage()
1901 */ 2027 */
1902 if (mpd->next_page != mpd->first_page) { 2028 if (mpd->next_page != mpd->first_page) {
1903 mpage_da_map_blocks(mpd); 2029 if (mpage_da_map_blocks(mpd) == 0)
1904 mpage_da_submit_io(mpd); 2030 mpage_da_submit_io(mpd);
2031 /*
2032 * skip rest of the page in the page_vec
2033 */
2034 mpd->io_done = 1;
2035 redirty_page_for_writepage(wbc, page);
2036 unlock_page(page);
2037 return MPAGE_DA_EXTENT_TAIL;
1905 } 2038 }
1906 2039
1907 /* 2040 /*
@@ -1932,6 +2065,8 @@ static int __mpage_da_writepage(struct page *page,
1932 set_buffer_dirty(bh); 2065 set_buffer_dirty(bh);
1933 set_buffer_uptodate(bh); 2066 set_buffer_uptodate(bh);
1934 mpage_add_bh_to_extent(mpd, logical, bh); 2067 mpage_add_bh_to_extent(mpd, logical, bh);
2068 if (mpd->io_done)
2069 return MPAGE_DA_EXTENT_TAIL;
1935 } else { 2070 } else {
1936 /* 2071 /*
1937 * Page with regular buffer heads, just add all dirty ones 2072 * Page with regular buffer heads, just add all dirty ones
@@ -1940,8 +2075,12 @@ static int __mpage_da_writepage(struct page *page,
1940 bh = head; 2075 bh = head;
1941 do { 2076 do {
1942 BUG_ON(buffer_locked(bh)); 2077 BUG_ON(buffer_locked(bh));
1943 if (buffer_dirty(bh)) 2078 if (buffer_dirty(bh) &&
2079 (!buffer_mapped(bh) || buffer_delay(bh))) {
1944 mpage_add_bh_to_extent(mpd, logical, bh); 2080 mpage_add_bh_to_extent(mpd, logical, bh);
2081 if (mpd->io_done)
2082 return MPAGE_DA_EXTENT_TAIL;
2083 }
1945 logical++; 2084 logical++;
1946 } while ((bh = bh->b_this_page) != head); 2085 } while ((bh = bh->b_this_page) != head);
1947 } 2086 }
@@ -1960,46 +2099,39 @@ static int __mpage_da_writepage(struct page *page,
1960 * 2099 *
1961 * This is a library function, which implements the writepages() 2100 * This is a library function, which implements the writepages()
1962 * address_space_operation. 2101 * address_space_operation.
1963 *
1964 * In order to avoid duplication of logic that deals with partial pages,
1965 * multiple bio per page, etc, we find non-allocated blocks, allocate
1966 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1967 *
1968 * It's important that we call __mpage_writepage() only once for each
1969 * involved page, otherwise we'd have to implement more complicated logic
1970 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1971 *
1972 * See comments to mpage_writepages()
1973 */ 2102 */
1974static int mpage_da_writepages(struct address_space *mapping, 2103static int mpage_da_writepages(struct address_space *mapping,
1975 struct writeback_control *wbc, 2104 struct writeback_control *wbc,
1976 get_block_t get_block) 2105 struct mpage_da_data *mpd)
1977{ 2106{
1978 struct mpage_da_data mpd; 2107 long to_write;
1979 int ret; 2108 int ret;
1980 2109
1981 if (!get_block) 2110 if (!mpd->get_block)
1982 return generic_writepages(mapping, wbc); 2111 return generic_writepages(mapping, wbc);
1983 2112
1984 mpd.wbc = wbc; 2113 mpd->lbh.b_size = 0;
1985 mpd.inode = mapping->host; 2114 mpd->lbh.b_state = 0;
1986 mpd.lbh.b_size = 0; 2115 mpd->lbh.b_blocknr = 0;
1987 mpd.lbh.b_state = 0; 2116 mpd->first_page = 0;
1988 mpd.lbh.b_blocknr = 0; 2117 mpd->next_page = 0;
1989 mpd.first_page = 0; 2118 mpd->io_done = 0;
1990 mpd.next_page = 0; 2119 mpd->pages_written = 0;
1991 mpd.get_block = get_block; 2120 mpd->retval = 0;
2121
2122 to_write = wbc->nr_to_write;
1992 2123
1993 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); 2124 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
1994 2125
1995 /* 2126 /*
1996 * Handle last extent of pages 2127 * Handle last extent of pages
1997 */ 2128 */
1998 if (mpd.next_page != mpd.first_page) { 2129 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
1999 mpage_da_map_blocks(&mpd); 2130 if (mpage_da_map_blocks(mpd) == 0)
2000 mpage_da_submit_io(&mpd); 2131 mpage_da_submit_io(mpd);
2001 } 2132 }
2002 2133
2134 wbc->nr_to_write = to_write - mpd->pages_written;
2003 return ret; 2135 return ret;
2004} 2136}
2005 2137
@@ -2052,18 +2184,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2052 handle_t *handle = NULL; 2184 handle_t *handle = NULL;
2053 2185
2054 handle = ext4_journal_current_handle(); 2186 handle = ext4_journal_current_handle();
2055 if (!handle) { 2187 BUG_ON(!handle);
2056 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2188 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2057 bh_result, 0, 0, 0); 2189 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2058 BUG_ON(!ret);
2059 } else {
2060 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2061 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2062 }
2063
2064 if (ret > 0) { 2190 if (ret > 0) {
2191
2065 bh_result->b_size = (ret << inode->i_blkbits); 2192 bh_result->b_size = (ret << inode->i_blkbits);
2066 2193
2194 if (ext4_should_order_data(inode)) {
2195 int retval;
2196 retval = ext4_jbd2_file_inode(handle, inode);
2197 if (retval)
2198 /*
2199 * Failed to add inode for ordered
2200 * mode. Don't update file size
2201 */
2202 return retval;
2203 }
2204
2067 /* 2205 /*
2068 * Update on-disk size along with block allocation 2206 * Update on-disk size along with block allocation
2069 * we don't use 'extend_disksize' as size may change 2207 * we don't use 'extend_disksize' as size may change
@@ -2073,18 +2211,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2073 if (disksize > i_size_read(inode)) 2211 if (disksize > i_size_read(inode))
2074 disksize = i_size_read(inode); 2212 disksize = i_size_read(inode);
2075 if (disksize > EXT4_I(inode)->i_disksize) { 2213 if (disksize > EXT4_I(inode)->i_disksize) {
2076 /* 2214 ext4_update_i_disksize(inode, disksize);
2077 * XXX: replace with spinlock if seen contended -bzzz 2215 ret = ext4_mark_inode_dirty(handle, inode);
2078 */ 2216 return ret;
2079 down_write(&EXT4_I(inode)->i_data_sem);
2080 if (disksize > EXT4_I(inode)->i_disksize)
2081 EXT4_I(inode)->i_disksize = disksize;
2082 up_write(&EXT4_I(inode)->i_data_sem);
2083
2084 if (EXT4_I(inode)->i_disksize == disksize) {
2085 ret = ext4_mark_inode_dirty(handle, inode);
2086 return ret;
2087 }
2088 } 2217 }
2089 ret = 0; 2218 ret = 0;
2090 } 2219 }
@@ -2204,84 +2333,114 @@ static int ext4_da_writepage(struct page *page,
2204} 2333}
2205 2334
2206/* 2335/*
2207 * For now just follow the DIO way to estimate the max credits 2336 * This is called via ext4_da_writepages() to
2208 * needed to write out EXT4_MAX_WRITEBACK_PAGES. 2337 * calculate the total number of credits to reserve to fit
2209 * todo: need to calculate the max credits need for 2338 * a single extent allocation into a single transaction;
2210 * extent based files, currently the DIO credits is based on 2339 * ext4_da_writepages() will loop calling this before
2211 * indirect-blocks mapping way. 2340 * the block allocation.
2212 *
2213 * Probably should have a generic way to calculate credits
2214 * for DIO, writepages, and truncate
2215 */ 2341 */
2216#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS 2342
2217#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS 2343static int ext4_da_writepages_trans_blocks(struct inode *inode)
2344{
2345 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2346
2347 /*
2348 * With non-extent format the journal credits needed to
2349 * insert nrblocks contiguous blocks depend on the
2350 * number of contiguous blocks. So we will limit the
2351 * number of contiguous blocks to a sane value
2352 */
2353 if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
2354 (max_blocks > EXT4_MAX_TRANS_DATA))
2355 max_blocks = EXT4_MAX_TRANS_DATA;
2356
2357 return ext4_chunk_trans_blocks(inode, max_blocks);
2358}
2218 2359
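ext4_da_writepages_trans_blocks() budgets for the reserved data blocks but, on a non-extent inode, first caps them at EXT4_MAX_TRANS_DATA before asking ext4_chunk_trans_blocks() (added later in this patch) for credits. A sketch of the cap feeding a stubbed credit formula; both the constant and the formula are illustrative:

#include <stdio.h>

#define MAX_TRANS_DATA 64               /* stand-in for EXT4_MAX_TRANS_DATA */

/* stub: pretend each chunk costs a base plus one credit per 16 blocks */
static int chunk_trans_blocks(int nrblocks)
{
        return 8 + nrblocks / 16;
}

static int writepages_trans_blocks(int reserved_data_blocks, int extent_based)
{
        int max_blocks = reserved_data_blocks;

        if (!extent_based && max_blocks > MAX_TRANS_DATA)
                max_blocks = MAX_TRANS_DATA;    /* keep credits bounded */

        return chunk_trans_blocks(max_blocks);
}

int main(void)
{
        printf("%d %d\n",
               writepages_trans_blocks(1000, 0),   /* clamped to 64 */
               writepages_trans_blocks(1000, 1));  /* extents: full 1000 */
        return 0;
}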
2219static int ext4_da_writepages(struct address_space *mapping, 2360static int ext4_da_writepages(struct address_space *mapping,
2220 struct writeback_control *wbc) 2361 struct writeback_control *wbc)
2221{ 2362{
2222 struct inode *inode = mapping->host;
2223 handle_t *handle = NULL; 2363 handle_t *handle = NULL;
2224 int needed_blocks;
2225 int ret = 0;
2226 long to_write;
2227 loff_t range_start = 0; 2364 loff_t range_start = 0;
2365 struct mpage_da_data mpd;
2366 struct inode *inode = mapping->host;
2367 int needed_blocks, ret = 0, nr_to_writebump = 0;
2368 long to_write, pages_skipped = 0;
2369 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2228 2370
2229 /* 2371 /*
2230 * No pages to write? This is mainly a kludge to avoid starting 2372 * No pages to write? This is mainly a kludge to avoid starting
2231 * a transaction for special inodes like journal inode on last iput() 2373 * a transaction for special inodes like journal inode on last iput()
2232 * because that could violate lock ordering on umount 2374 * because that could violate lock ordering on umount
2233 */ 2375 */
2234 if (!mapping->nrpages) 2376 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2235 return 0; 2377 return 0;
2236
2237 /* 2378 /*
2238 * Estimate the worse case needed credits to write out 2379 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2239 * EXT4_MAX_BUF_BLOCKS pages 2380 * This makes sure blocks for small files are allocated in
2381 * a single attempt. This ensures that small files
2382 * get less fragmented.
2240 */ 2383 */
2241 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; 2384 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2385 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2386 wbc->nr_to_write = sbi->s_mb_stream_request;
2387 }
2242 2388
2243 to_write = wbc->nr_to_write; 2389 if (!wbc->range_cyclic)
2244 if (!wbc->range_cyclic) {
2245 /* 2390 /*
2246 * If range_cyclic is not set force range_cont 2391 * If range_cyclic is not set force range_cont
2247 * and save the old writeback_index 2392 * and save the old writeback_index
2248 */ 2393 */
2249 wbc->range_cont = 1; 2394 wbc->range_cont = 1;
2250 range_start = wbc->range_start;
2251 }
2252 2395
2253 while (!ret && to_write) { 2396 range_start = wbc->range_start;
2397 pages_skipped = wbc->pages_skipped;
2398
2399 mpd.wbc = wbc;
2400 mpd.inode = mapping->host;
2401
2402restart_loop:
2403 to_write = wbc->nr_to_write;
2404 while (!ret && to_write > 0) {
2405
2406 /*
2407 * We insert one extent at a time, so we need
2408 * the credits for a single extent allocation.
2409 * Journalled mode is currently not supported
2410 * by delalloc.
2411 */
2412 BUG_ON(ext4_should_journal_data(inode));
2413 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2414
2254 /* start a new transaction*/ 2415 /* start a new transaction*/
2255 handle = ext4_journal_start(inode, needed_blocks); 2416 handle = ext4_journal_start(inode, needed_blocks);
2256 if (IS_ERR(handle)) { 2417 if (IS_ERR(handle)) {
2257 ret = PTR_ERR(handle); 2418 ret = PTR_ERR(handle);
2419 printk(KERN_EMERG "%s: jbd2_start: "
2420 "%ld pages, ino %lu; err %d\n", __func__,
2421 wbc->nr_to_write, inode->i_ino, ret);
2422 dump_stack();
2258 goto out_writepages; 2423 goto out_writepages;
2259 } 2424 }
2260 if (ext4_should_order_data(inode)) { 2425 to_write -= wbc->nr_to_write;
2261 /*
2262 * With ordered mode we need to add
2263 * the inode to the journal handle
2264 * when we do block allocation.
2265 */
2266 ret = ext4_jbd2_file_inode(handle, inode);
2267 if (ret) {
2268 ext4_journal_stop(handle);
2269 goto out_writepages;
2270 }
2271 2426
2272 } 2427 mpd.get_block = ext4_da_get_block_write;
2273 /* 2428 ret = mpage_da_writepages(mapping, wbc, &mpd);
2274 * set the max dirty pages could be write at a time
2275 * to fit into the reserved transaction credits
2276 */
2277 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2278 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2279 2429
2280 to_write -= wbc->nr_to_write;
2281 ret = mpage_da_writepages(mapping, wbc,
2282 ext4_da_get_block_write);
2283 ext4_journal_stop(handle); 2430 ext4_journal_stop(handle);
2284 if (wbc->nr_to_write) { 2431
2432 if (mpd.retval == -ENOSPC)
2433 jbd2_journal_force_commit_nested(sbi->s_journal);
2434
2435 /* reset the retry count */
2436 if (ret == MPAGE_DA_EXTENT_TAIL) {
2437 /*
2438 * got one extent; now try with the
2439 * rest of the pages
2440 */
2441 to_write += wbc->nr_to_write;
2442 ret = 0;
2443 } else if (wbc->nr_to_write) {
2285 /* 2444 /*
2286 * There is no more writeout needed 2445 * There is no more writeout needed
2287 * or we requested a non-blocking writeout 2446
@@ -2293,13 +2452,48 @@ static int ext4_da_writepages(struct address_space *mapping,
2293 wbc->nr_to_write = to_write; 2452 wbc->nr_to_write = to_write;
2294 } 2453 }
2295 2454
2296out_writepages: 2455 if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
2297 wbc->nr_to_write = to_write; 2456 /* We skipped pages in this loop */
2298 if (range_start)
2299 wbc->range_start = range_start; 2457 wbc->range_start = range_start;
2458 wbc->nr_to_write = to_write +
2459 wbc->pages_skipped - pages_skipped;
2460 wbc->pages_skipped = pages_skipped;
2461 goto restart_loop;
2462 }
2463
2464out_writepages:
2465 wbc->nr_to_write = to_write - nr_to_writebump;
2466 wbc->range_start = range_start;
2300 return ret; 2467 return ret;
2301} 2468}
2302 2469
2470#define FALL_BACK_TO_NONDELALLOC 1
2471static int ext4_nonda_switch(struct super_block *sb)
2472{
2473 s64 free_blocks, dirty_blocks;
2474 struct ext4_sb_info *sbi = EXT4_SB(sb);
2475
2476 /*
2477 * Switch to non-delalloc mode if we are running low
2478 * on free blocks. The free block accounting via percpu
2479 * counters can get slightly wrong with FBC_BATCH getting
2480 * accumulated on each CPU without updating global counters.
2481 * Delalloc needs accurate free block accounting, so switch
2482 * to non-delalloc when we are near the error range.
2483 */
2484 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2485 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2486 if (2 * free_blocks < 3 * dirty_blocks ||
2487 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2488 /*
2489 * free block count is less than 150% of dirty blocks,
2490 * or free blocks are less than the watermark
2491 */
2492 return 1;
2493 }
2494 return 0;
2495}
2496
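ext4_nonda_switch() makes its decision from two cheap percpu-counter reads: fall back when free blocks drop under 150% of dirty blocks (written as 2 * free < 3 * dirty to avoid division) or under dirty plus a fixed watermark. A standalone sketch of the same test; the watermark value below is illustrative, the real EXT4_FREEBLOCKS_WATERMARK is defined elsewhere:

#include <stdio.h>

#define FREEBLOCKS_WATERMARK 1024   /* illustrative, not the real value */

/* return 1 when delalloc should be switched off */
static int nonda_switch(long long free_blocks, long long dirty_blocks)
{
        /* free < 1.5 * dirty, written multiplication-only */
        if (2 * free_blocks < 3 * dirty_blocks)
                return 1;
        /* free below dirty plus safety watermark */
        if (free_blocks < dirty_blocks + FREEBLOCKS_WATERMARK)
                return 1;
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               nonda_switch(10000, 100),   /* plenty free: 0 */
               nonda_switch(1400, 1000),   /* below 150% of dirty: 1 */
               nonda_switch(1600, 1000));  /* above 150% but under watermark: 1 */
        return 0;
}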
2303static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2497static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2304 loff_t pos, unsigned len, unsigned flags, 2498 loff_t pos, unsigned len, unsigned flags,
2305 struct page **pagep, void **fsdata) 2499 struct page **pagep, void **fsdata)
@@ -2315,6 +2509,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2315 from = pos & (PAGE_CACHE_SIZE - 1); 2509 from = pos & (PAGE_CACHE_SIZE - 1);
2316 to = from + len; 2510 to = from + len;
2317 2511
2512 if (ext4_nonda_switch(inode->i_sb)) {
2513 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2514 return ext4_write_begin(file, mapping, pos,
2515 len, flags, pagep, fsdata);
2516 }
2517 *fsdata = (void *)0;
2318retry: 2518retry:
2319 /* 2519 /*
2320 * With delayed allocation, we don't log the i_disksize update 2520 * With delayed allocation, we don't log the i_disksize update
@@ -2342,6 +2542,13 @@ retry:
2342 unlock_page(page); 2542 unlock_page(page);
2343 ext4_journal_stop(handle); 2543 ext4_journal_stop(handle);
2344 page_cache_release(page); 2544 page_cache_release(page);
2545 /*
2546 * block_write_begin may have instantiated a few blocks
2547 * outside i_size. Trim these off again. Don't need
2548 * i_size_read because we hold i_mutex.
2549 */
2550 if (pos + len > inode->i_size)
2551 vmtruncate(inode, inode->i_size);
2345 } 2552 }
2346 2553
2347 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2554 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2365,7 +2572,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2365 bh = page_buffers(page); 2572 bh = page_buffers(page);
2366 idx = offset >> inode->i_blkbits; 2573 idx = offset >> inode->i_blkbits;
2367 2574
2368 for (i=0; i < idx; i++) 2575 for (i = 0; i < idx; i++)
2369 bh = bh->b_this_page; 2576 bh = bh->b_this_page;
2370 2577
2371 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2578 if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2383,9 +2590,22 @@ static int ext4_da_write_end(struct file *file,
2383 handle_t *handle = ext4_journal_current_handle(); 2590 handle_t *handle = ext4_journal_current_handle();
2384 loff_t new_i_size; 2591 loff_t new_i_size;
2385 unsigned long start, end; 2592 unsigned long start, end;
2593 int write_mode = (int)(unsigned long)fsdata;
2594
2595 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2596 if (ext4_should_order_data(inode)) {
2597 return ext4_ordered_write_end(file, mapping, pos,
2598 len, copied, page, fsdata);
2599 } else if (ext4_should_writeback_data(inode)) {
2600 return ext4_writeback_write_end(file, mapping, pos,
2601 len, copied, page, fsdata);
2602 } else {
2603 BUG();
2604 }
2605 }
2386 2606
2387 start = pos & (PAGE_CACHE_SIZE - 1); 2607 start = pos & (PAGE_CACHE_SIZE - 1);
2388 end = start + copied -1; 2608 end = start + copied - 1;
2389 2609
2390 /* 2610 /*
2391 * generic_write_end() will run mark_inode_dirty() if i_size 2611 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2409,6 +2629,11 @@ static int ext4_da_write_end(struct file *file,
2409 EXT4_I(inode)->i_disksize = new_i_size; 2629 EXT4_I(inode)->i_disksize = new_i_size;
2410 } 2630 }
2411 up_write(&EXT4_I(inode)->i_data_sem); 2631 up_write(&EXT4_I(inode)->i_data_sem);
2632 /* We need to mark inode dirty even if
2633 * new_i_size is less than inode->i_size
2634 * but greater than i_disksize (hint: delalloc)
2635 */
2636 ext4_mark_inode_dirty(handle, inode);
2412 } 2637 }
2413 } 2638 }
2414 ret2 = generic_write_end(file, mapping, pos, len, copied, 2639 ret2 = generic_write_end(file, mapping, pos, len, copied,
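The fallback path smuggles a mode flag through the otherwise-opaque fsdata cookie: ext4_da_write_begin() stores FALL_BACK_TO_NONDELALLOC as a cast pointer and ext4_da_write_end() casts it back to choose the ordered or writeback completion path. A tiny sketch of that encode/decode round trip; the helper names are made up:

#include <stdio.h>

#define FALL_BACK_TO_NONDELALLOC 1

/* write_begin side: record the chosen mode in the opaque cookie */
static void choose_mode(int low_on_space, void **fsdata)
{
        *fsdata = low_on_space ? (void *)FALL_BACK_TO_NONDELALLOC : (void *)0;
}

/* write_end side: recover the mode from the same cookie */
static int decode_mode(void *fsdata)
{
        return (int)(unsigned long)fsdata;
}

int main(void)
{
        void *fsdata;

        choose_mode(1, &fsdata);
        if (decode_mode(fsdata) == FALL_BACK_TO_NONDELALLOC)
                printf("falling back to non-delalloc write path\n");
        return 0;
}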
@@ -2500,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2500 return 0; 2725 return 0;
2501 } 2726 }
2502 2727
2503 return generic_block_bmap(mapping,block,ext4_get_block); 2728 return generic_block_bmap(mapping, block, ext4_get_block);
2504} 2729}
2505 2730
2506static int bget_one(handle_t *handle, struct buffer_head *bh) 2731static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3106,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3106 if (!partial->key && *partial->p) 3331 if (!partial->key && *partial->p)
3107 /* Writer: end */ 3332 /* Writer: end */
3108 goto no_top; 3333 goto no_top;
3109 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 3334 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3110 ; 3335 ;
3111 /* 3336 /*
3112 * OK, we've found the last block that must survive. The rest of our 3337 * OK, we've found the last block that must survive. The rest of our
@@ -3125,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3125 } 3350 }
3126 /* Writer: end */ 3351 /* Writer: end */
3127 3352
3128 while(partial > p) { 3353 while (partial > p) {
3129 brelse(partial->bh); 3354 brelse(partial->bh);
3130 partial--; 3355 partial--;
3131 } 3356 }
@@ -3317,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3317 /* This zaps the entire block. Bottom up. */ 3542 /* This zaps the entire block. Bottom up. */
3318 BUFFER_TRACE(bh, "free child branches"); 3543 BUFFER_TRACE(bh, "free child branches");
3319 ext4_free_branches(handle, inode, bh, 3544 ext4_free_branches(handle, inode, bh,
3320 (__le32*)bh->b_data, 3545 (__le32 *) bh->b_data,
3321 (__le32*)bh->b_data + addr_per_block, 3546 (__le32 *) bh->b_data + addr_per_block,
3322 depth); 3547 depth);
3323 3548
3324 /* 3549 /*
3325 * We've probably journalled the indirect block several 3550 * We've probably journalled the indirect block several
@@ -3486,6 +3711,9 @@ void ext4_truncate(struct inode *inode)
3486 * modify the block allocation tree. 3711 * modify the block allocation tree.
3487 */ 3712 */
3488 down_write(&ei->i_data_sem); 3713 down_write(&ei->i_data_sem);
3714
3715 ext4_discard_preallocations(inode);
3716
3489 /* 3717 /*
3490 * The orphan list entry will now protect us from any crash which 3718 * The orphan list entry will now protect us from any crash which
3491 * occurs before the truncate completes, so it is now safe to propagate 3719 * occurs before the truncate completes, so it is now safe to propagate
@@ -3555,8 +3783,6 @@ do_indirects:
3555 ; 3783 ;
3556 } 3784 }
3557 3785
3558 ext4_discard_reservation(inode);
3559
3560 up_write(&ei->i_data_sem); 3786 up_write(&ei->i_data_sem);
3561 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3787 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3562 ext4_mark_inode_dirty(handle, inode); 3788 ext4_mark_inode_dirty(handle, inode);
@@ -3581,41 +3807,6 @@ out_stop:
3581 ext4_journal_stop(handle); 3807 ext4_journal_stop(handle);
3582} 3808}
3583 3809
3584static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3585 unsigned long ino, struct ext4_iloc *iloc)
3586{
3587 ext4_group_t block_group;
3588 unsigned long offset;
3589 ext4_fsblk_t block;
3590 struct ext4_group_desc *gdp;
3591
3592 if (!ext4_valid_inum(sb, ino)) {
3593 /*
3594 * This error is already checked for in namei.c unless we are
3595 * looking at an NFS filehandle, in which case no error
3596 * report is needed
3597 */
3598 return 0;
3599 }
3600
3601 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3602 gdp = ext4_get_group_desc(sb, block_group, NULL);
3603 if (!gdp)
3604 return 0;
3605
3606 /*
3607 * Figure out the offset within the block group inode table
3608 */
3609 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3610 EXT4_INODE_SIZE(sb);
3611 block = ext4_inode_table(sb, gdp) +
3612 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3613
3614 iloc->block_group = block_group;
3615 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3616 return block;
3617}
3618
3619/* 3810/*
3620 * ext4_get_inode_loc returns with an extra refcount against the inode's 3811 * ext4_get_inode_loc returns with an extra refcount against the inode's
3621 * underlying buffer_head on success. If 'in_mem' is true, we have all 3812 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3625,19 +3816,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3625static int __ext4_get_inode_loc(struct inode *inode, 3816static int __ext4_get_inode_loc(struct inode *inode,
3626 struct ext4_iloc *iloc, int in_mem) 3817 struct ext4_iloc *iloc, int in_mem)
3627{ 3818{
3628 ext4_fsblk_t block; 3819 struct ext4_group_desc *gdp;
3629 struct buffer_head *bh; 3820 struct buffer_head *bh;
3821 struct super_block *sb = inode->i_sb;
3822 ext4_fsblk_t block;
3823 int inodes_per_block, inode_offset;
3824
3825 iloc->bh = 0;
3826 if (!ext4_valid_inum(sb, inode->i_ino))
3827 return -EIO;
3630 3828
3631 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3829 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3632 if (!block) 3830 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3831 if (!gdp)
3633 return -EIO; 3832 return -EIO;
3634 3833
3635 bh = sb_getblk(inode->i_sb, block); 3834 /*
3835 * Figure out the offset within the block group inode table
3836 */
3837 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3838 inode_offset = ((inode->i_ino - 1) %
3839 EXT4_INODES_PER_GROUP(sb));
3840 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3841 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3842
3843 bh = sb_getblk(sb, block);
3636 if (!bh) { 3844 if (!bh) {
3637 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3845 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3638 "unable to read inode block - " 3846 "inode block - inode=%lu, block=%llu",
3639 "inode=%lu, block=%llu", 3847 inode->i_ino, block);
3640 inode->i_ino, block);
3641 return -EIO; 3848 return -EIO;
3642 } 3849 }
3643 if (!buffer_uptodate(bh)) { 3850 if (!buffer_uptodate(bh)) {
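The rewritten __ext4_get_inode_loc() locates an inode on disk with nothing but integer division and modulo: group from the inode number, block from the offset within the group's inode table, and byte offset from the remainder within that block. A user-space sketch of the arithmetic with illustrative geometry (4 KiB blocks, 256-byte inodes, 8192 inodes per group):

#include <stdio.h>

#define BLOCK_SIZE        4096
#define INODE_SIZE         256
#define INODES_PER_GROUP  8192

struct iloc { unsigned long block_group, block, offset; };

/* inode number -> (group, inode-table block, offset in block) */
static void inode_loc(unsigned long ino, unsigned long itable_start,
                      struct iloc *loc)
{
        unsigned long inodes_per_block = BLOCK_SIZE / INODE_SIZE;
        unsigned long inode_offset = (ino - 1) % INODES_PER_GROUP;

        loc->block_group = (ino - 1) / INODES_PER_GROUP;
        loc->block  = itable_start + inode_offset / inodes_per_block;
        loc->offset = (inode_offset % inodes_per_block) * INODE_SIZE;
}

int main(void)
{
        struct iloc loc;

        /* 18th inode of group 1; itable_start is a hypothetical table block */
        inode_loc(8210, 1000, &loc);
        printf("group=%lu block=%lu offset=%lu\n",
               loc.block_group, loc.block, loc.offset);
        return 0;
}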
@@ -3665,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3665 */ 3872 */
3666 if (in_mem) { 3873 if (in_mem) {
3667 struct buffer_head *bitmap_bh; 3874 struct buffer_head *bitmap_bh;
3668 struct ext4_group_desc *desc; 3875 int i, start;
3669 int inodes_per_buffer;
3670 int inode_offset, i;
3671 ext4_group_t block_group;
3672 int start;
3673
3674 block_group = (inode->i_ino - 1) /
3675 EXT4_INODES_PER_GROUP(inode->i_sb);
3676 inodes_per_buffer = bh->b_size /
3677 EXT4_INODE_SIZE(inode->i_sb);
3678 inode_offset = ((inode->i_ino - 1) %
3679 EXT4_INODES_PER_GROUP(inode->i_sb));
3680 start = inode_offset & ~(inodes_per_buffer - 1);
3681 3876
3682 /* Is the inode bitmap in cache? */ 3877 start = inode_offset & ~(inodes_per_block - 1);
3683 desc = ext4_get_group_desc(inode->i_sb,
3684 block_group, NULL);
3685 if (!desc)
3686 goto make_io;
3687 3878
3688 bitmap_bh = sb_getblk(inode->i_sb, 3879 /* Is the inode bitmap in cache? */
3689 ext4_inode_bitmap(inode->i_sb, desc)); 3880 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3690 if (!bitmap_bh) 3881 if (!bitmap_bh)
3691 goto make_io; 3882 goto make_io;
3692 3883
@@ -3699,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3699 brelse(bitmap_bh); 3890 brelse(bitmap_bh);
3700 goto make_io; 3891 goto make_io;
3701 } 3892 }
3702 for (i = start; i < start + inodes_per_buffer; i++) { 3893 for (i = start; i < start + inodes_per_block; i++) {
3703 if (i == inode_offset) 3894 if (i == inode_offset)
3704 continue; 3895 continue;
3705 if (ext4_test_bit(i, bitmap_bh->b_data)) 3896 if (ext4_test_bit(i, bitmap_bh->b_data))
3706 break; 3897 break;
3707 } 3898 }
3708 brelse(bitmap_bh); 3899 brelse(bitmap_bh);
3709 if (i == start + inodes_per_buffer) { 3900 if (i == start + inodes_per_block) {
3710 /* all other inodes are free, so skip I/O */ 3901 /* all other inodes are free, so skip I/O */
3711 memset(bh->b_data, 0, bh->b_size); 3902 memset(bh->b_data, 0, bh->b_size);
3712 set_buffer_uptodate(bh); 3903 set_buffer_uptodate(bh);
@@ -3717,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3717 3908
3718make_io: 3909make_io:
3719 /* 3910 /*
3911 * If we need to do any I/O, try to pre-readahead extra
3912 * blocks from the inode table.
3913 */
3914 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3915 ext4_fsblk_t b, end, table;
3916 unsigned num;
3917
3918 table = ext4_inode_table(sb, gdp);
3919 /* Make sure s_inode_readahead_blks is a power of 2 */
3920 while (EXT4_SB(sb)->s_inode_readahead_blks &
3921 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3922 EXT4_SB(sb)->s_inode_readahead_blks =
3923 (EXT4_SB(sb)->s_inode_readahead_blks &
3924 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3925 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3926 if (table > b)
3927 b = table;
3928 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3929 num = EXT4_INODES_PER_GROUP(sb);
3930 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3931 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3932 num -= le16_to_cpu(gdp->bg_itable_unused);
3933 table += num / inodes_per_block;
3934 if (end > table)
3935 end = table;
3936 while (b <= end)
3937 sb_breadahead(sb, b++);
3938 }
3939
3940 /*
3720 * There are other valid inodes in the buffer, this inode 3941 * There are other valid inodes in the buffer, this inode
3721 * has in-inode xattrs, or we don't have this inode in memory. 3942 * has in-inode xattrs, or we don't have this inode in memory.
3722 * Read the block from disk. 3943 * Read the block from disk.
@@ -3726,10 +3947,9 @@ make_io:
3726 submit_bh(READ_META, bh); 3947 submit_bh(READ_META, bh);
3727 wait_on_buffer(bh); 3948 wait_on_buffer(bh);
3728 if (!buffer_uptodate(bh)) { 3949 if (!buffer_uptodate(bh)) {
3729 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3950 ext4_error(sb, __func__,
3730 "unable to read inode block - " 3951 "unable to read inode block - inode=%lu, "
3731 "inode=%lu, block=%llu", 3952 "block=%llu", inode->i_ino, block);
3732 inode->i_ino, block);
3733 brelse(bh); 3953 brelse(bh);
3734 return -EIO; 3954 return -EIO;
3735 } 3955 }
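The readahead hunk first forces s_inode_readahead_blks down to a power of two by repeatedly clearing the lowest set bit (x & (x - 1)) until only the highest bit survives, then aligns the target block down to that size to pick the window start. A standalone sketch of both steps:

#include <stdio.h>

/* round down to the highest power of two <= x, as the loop above does */
static unsigned round_down_pow2(unsigned x)
{
        while (x & (x - 1))
                x &= x - 1;     /* clear lowest set bit */
        return x;
}

int main(void)
{
        unsigned blks = 12;                      /* not a power of two */
        unsigned long long block = 4501;
        unsigned long long start;

        blks = round_down_pow2(blks);            /* -> 8 */
        /* align the readahead window start down to 'blks' */
        start = block & ~(unsigned long long)(blks - 1);

        printf("blks=%u window starts at %llu\n", blks, start);
        return 0;
}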
@@ -3821,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3821 return inode; 4041 return inode;
3822 4042
3823 ei = EXT4_I(inode); 4043 ei = EXT4_I(inode);
3824#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 4044#ifdef CONFIG_EXT4_FS_POSIX_ACL
3825 ei->i_acl = EXT4_ACL_NOT_CACHED; 4045 ei->i_acl = EXT4_ACL_NOT_CACHED;
3826 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4046 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3827#endif 4047#endif
3828 ei->i_block_alloc_info = NULL;
3829 4048
3830 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4049 ret = __ext4_get_inode_loc(inode, &iloc, 0);
3831 if (ret < 0) 4050 if (ret < 0)
@@ -3835,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3835 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4054 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3836 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4055 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3837 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4056 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3838 if(!(test_opt (inode->i_sb, NO_UID32))) { 4057 if (!(test_opt(inode->i_sb, NO_UID32))) {
3839 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4058 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3840 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4059 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3841 } 4060 }
@@ -3853,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3853 if (inode->i_mode == 0 || 4072 if (inode->i_mode == 0 ||
3854 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4073 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3855 /* this inode is deleted */ 4074 /* this inode is deleted */
3856 brelse (bh); 4075 brelse(bh);
3857 ret = -ESTALE; 4076 ret = -ESTALE;
3858 goto bad_inode; 4077 goto bad_inode;
3859 } 4078 }
@@ -3886,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3886 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4105 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3887 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4106 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3888 EXT4_INODE_SIZE(inode->i_sb)) { 4107 EXT4_INODE_SIZE(inode->i_sb)) {
3889 brelse (bh); 4108 brelse(bh);
3890 ret = -EIO; 4109 ret = -EIO;
3891 goto bad_inode; 4110 goto bad_inode;
3892 } 4111 }
@@ -3939,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3939 init_special_inode(inode, inode->i_mode, 4158 init_special_inode(inode, inode->i_mode,
3940 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4159 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3941 } 4160 }
3942 brelse (iloc.bh); 4161 brelse(iloc.bh);
3943 ext4_set_inode_flags(inode); 4162 ext4_set_inode_flags(inode);
3944 unlock_new_inode(inode); 4163 unlock_new_inode(inode);
3945 return inode; 4164 return inode;
@@ -4021,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle,
4021 4240
4022 ext4_get_inode_flags(ei); 4241 ext4_get_inode_flags(ei);
4023 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4242 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4024 if(!(test_opt(inode->i_sb, NO_UID32))) { 4243 if (!(test_opt(inode->i_sb, NO_UID32))) {
4025 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4244 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4026 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4245 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4027/* 4246/*
4028 * Fix up interoperability with old kernels. Otherwise, old inodes get 4247 * Fix up interoperability with old kernels. Otherwise, old inodes get
4029 * re-used with the upper 16 bits of the uid/gid intact 4248 * re-used with the upper 16 bits of the uid/gid intact
4030 */ 4249 */
4031 if(!ei->i_dtime) { 4250 if (!ei->i_dtime) {
4032 raw_inode->i_uid_high = 4251 raw_inode->i_uid_high =
4033 cpu_to_le16(high_16_bits(inode->i_uid)); 4252 cpu_to_le16(high_16_bits(inode->i_uid));
4034 raw_inode->i_gid_high = 4253 raw_inode->i_gid_high =
@@ -4116,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle,
4116 ei->i_state &= ~EXT4_STATE_NEW; 4335 ei->i_state &= ~EXT4_STATE_NEW;
4117 4336
4118out_brelse: 4337out_brelse:
4119 brelse (bh); 4338 brelse(bh);
4120 ext4_std_error(inode->i_sb, err); 4339 ext4_std_error(inode->i_sb, err);
4121 return err; 4340 return err;
4122} 4341}
@@ -4324,57 +4543,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4324 return 0; 4543 return 0;
4325} 4544}
4326 4545
4546static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4547 int chunk)
4548{
4549 int indirects;
4550
4551 /* if nrblocks are contiguous */
4552 if (chunk) {
4553 /*
4554 * With N contiguous data blocks, it needs at most
4555 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4556 * 2 dindirect blocks
4557 * 1 tindirect block
4558 */
4559 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4560 return indirects + 3;
4561 }
4562 /*
4563 * if nrblocks are not contiguous, worst case, each block touches
4564 * an indirect block, and each indirect block touches a double indirect
4565 * block, plus a triple indirect block
4566 */
4567 indirects = nrblocks * 2 + 1;
4568 return indirects;
4569}
4570
4571static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4572{
4573 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4574 return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4575 return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4576}
4327/* 4577/*
4328 * How many blocks doth make a writepage()? 4578 * Account for index blocks, block groups bitmaps and block group
4329 * 4579 * descriptor blocks if modify datablocks and index blocks
4330 * With N blocks per page, it may be: 4580 * worse case, the indexs blocks spread over different block groups
4331 * N data blocks
4332 * 2 indirect block
4333 * 2 dindirect
4334 * 1 tindirect
4335 * N+5 bitmap blocks (from the above)
4336 * N+5 group descriptor summary blocks
4337 * 1 inode block
4338 * 1 superblock.
4339 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4340 * 4581 *
4341 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS 4582 * If datablocks are discontiguous, they are possible to spread over
4583 * different block groups too. If they are contiugous, with flexbg,
4584 * they could still across block group boundary.
4342 * 4585 *
4343 * With ordered or writeback data it's the same, less the N data blocks. 4586 * Also account for superblock, inode, quota and xattr blocks
4587 */
4588int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4589{
4590 int groups, gdpblocks;
4591 int idxblocks;
4592 int ret = 0;
4593
4594 /*
4595 * How many index blocks do we need to touch to modify nrblocks?
4596 * The "Chunk" flag indicates whether the nrblocks are
4597 * physically contiguous on disk.
4598 *
4599 * Direct IO and fallocate call get_block to allocate
4600 * one single extent at a time, so they can set the "Chunk" flag
4601 */
4602 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4603
4604 ret = idxblocks;
4605
4606 /*
4607 * Now let's see how many group bitmaps and group descriptors need
4608 * to account
4609 */
4610 groups = idxblocks;
4611 if (chunk)
4612 groups += 1;
4613 else
4614 groups += nrblocks;
4615
4616 gdpblocks = groups;
4617 if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4618 groups = EXT4_SB(inode->i_sb)->s_groups_count;
4619 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4620 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4621
4622 /* bitmaps and block group descriptor blocks */
4623 ret += groups + gdpblocks;
4624
4625 /* Blocks for super block, inode, quota and xattr blocks */
4626 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4627
4628 return ret;
4629}
4630
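ext4_meta_trans_blocks() charges the index blocks themselves, then one bitmap and one descriptor block per group they might touch, clamped by the filesystem's group and descriptor-block totals. A user-space sketch of the accounting; the index cost and the META constant are passed in or stubbed with illustrative values:

#include <stdio.h>

#define META_TRANS_BLOCKS 12    /* stand-in: sb + inode + quota + xattr */

static int meta_trans_blocks(int nrblocks, int chunk,
                             int groups_count, int gdb_count,
                             int idxblocks)
{
        int ret = idxblocks;

        /* one bitmap per touched group: 1 for a chunk, else per block */
        int groups = idxblocks + (chunk ? 1 : nrblocks);
        int gdpblocks = groups;

        if (groups > groups_count)
                groups = groups_count;          /* can't exceed fs totals */
        if (gdpblocks > gdb_count)
                gdpblocks = gdb_count;

        ret += groups + gdpblocks;
        ret += META_TRANS_BLOCKS;
        return ret;
}

int main(void)
{
        /* contiguous chunk of 128 blocks, 3 index blocks touched */
        printf("%d\n", meta_trans_blocks(128, 1, 100, 2, 3));
        /* discontiguous 8 blocks: every block may hit its own group */
        printf("%d\n", meta_trans_blocks(8, 0, 100, 2, 17));
        return 0;
}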
4631/*
4632 * Calculate the total number of credits to reserve to fit
4633 * the modification of a single page into a single transaction,
4634 * which may include multiple chunks of block allocations.
4344 * 4635 *
4345 * If the inode's direct blocks can hold an integral number of pages then a 4636 * This could be called via ext4_write_begin()
4346 * page cannot straddle two indirect blocks, and we can only touch one indirect
4347 * and dindirect block, and the "5" above becomes "3".
4348 * 4637 *
4349 * This still overestimates under most circumstances. If we were to pass the 4638 * We need to consider the worst case, when
4350 * start and end offsets in here as well we could do block_to_path() on each 4639 * there is one new block per extent.
4351 * block and work out the exact number of indirects which are touched. Pah.
4352 */ 4640 */
4353
4354int ext4_writepage_trans_blocks(struct inode *inode) 4641int ext4_writepage_trans_blocks(struct inode *inode)
4355{ 4642{
4356 int bpp = ext4_journal_blocks_per_page(inode); 4643 int bpp = ext4_journal_blocks_per_page(inode);
4357 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4358 int ret; 4644 int ret;
4359 4645
4360 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 4646 ret = ext4_meta_trans_blocks(inode, bpp, 0);
4361 return ext4_ext_writepage_trans_blocks(inode, bpp);
4362 4647
4648 /* Account for data blocks for journalled mode */
4363 if (ext4_should_journal_data(inode)) 4649 if (ext4_should_journal_data(inode))
4364 ret = 3 * (bpp + indirects) + 2; 4650 ret += bpp;
4365 else
4366 ret = 2 * (bpp + indirects) + 2;
4367
4368#ifdef CONFIG_QUOTA
4369 /* We know that structure was already allocated during DQUOT_INIT so
4370 * we will be updating only the data blocks + inodes */
4371 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4372#endif
4373
4374 return ret; 4651 return ret;
4375} 4652}
4376 4653
4377/* 4654/*
4655 * Calculate the journal credits for a chunk of data modification.
4656 *
4657 * This is called from DIO, fallocate or whoever calls
4658 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
4659 *
4660 * journal buffers for data blocks are not included here, as DIO
4661 * and fallocate do not need to journal data buffers.
4662 */
4663int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4664{
4665 return ext4_meta_trans_blocks(inode, nrblocks, 1);
4666}
4667
4668/*
4378 * The caller must have previously called ext4_reserve_inode_write(). 4669 * The caller must have previously called ext4_reserve_inode_write().
4379 * Given this, we know that the caller already has write access to iloc->bh. 4670
4380 */ 4671 */
@@ -4647,6 +4938,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4647 loff_t size; 4938 loff_t size;
4648 unsigned long len; 4939 unsigned long len;
4649 int ret = -EINVAL; 4940 int ret = -EINVAL;
4941 void *fsdata;
4650 struct file *file = vma->vm_file; 4942 struct file *file = vma->vm_file;
4651 struct inode *inode = file->f_path.dentry->d_inode; 4943 struct inode *inode = file->f_path.dentry->d_inode;
4652 struct address_space *mapping = inode->i_mapping; 4944 struct address_space *mapping = inode->i_mapping;
@@ -4685,11 +4977,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4685 * on the same page though 4977 * on the same page though
4686 */ 4978 */
4687 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4979 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4688 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 4980 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
4689 if (ret < 0) 4981 if (ret < 0)
4690 goto out_unlock; 4982 goto out_unlock;
4691 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4983 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4692 len, len, page, NULL); 4984 len, len, page, fsdata);
4693 if (ret < 0) 4985 if (ret < 0)
4694 goto out_unlock; 4986 goto out_unlock;
4695 ret = 0; 4987 ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba6..dc99b4776d58 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext4_inode_info *ei = EXT4_I(inode); 24 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 25 unsigned int flags;
26 unsigned short rsv_window_size;
27 26
28 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); 27 ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
29 28
30 switch (cmd) { 29 switch (cmd) {
31 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
35 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
36 handle_t *handle = NULL; 35 handle_t *handle = NULL;
37 int err; 36 int err, migrate = 0;
38 struct ext4_iloc iloc; 37 struct ext4_iloc iloc;
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82 if (!capable(CAP_SYS_RESOURCE)) 81 if (!capable(CAP_SYS_RESOURCE))
83 goto flags_out; 82 goto flags_out;
84 } 83 }
84 if (oldflags & EXT4_EXTENTS_FL) {
85 /* We don't support clearing extent flags */
86 if (!(flags & EXT4_EXTENTS_FL)) {
87 err = -EOPNOTSUPP;
88 goto flags_out;
89 }
90 } else if (flags & EXT4_EXTENTS_FL) {
91 /* migrate the file */
92 migrate = 1;
93 flags &= ~EXT4_EXTENTS_FL;
94 }
85 95
86 handle = ext4_journal_start(inode, 1); 96 handle = ext4_journal_start(inode, 1);
87 if (IS_ERR(handle)) { 97 if (IS_ERR(handle)) {
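The new SETFLAGS logic encodes a one-way rule for EXT4_EXTENTS_FL: it may be set (which schedules a migration, with the bit cleared here because the migration itself sets it) but never cleared. A small sketch of that transition check; the flag value is illustrative:

#include <stdio.h>
#include <errno.h>

#define EXTENTS_FL 0x80000      /* illustrative flag bit */

/* returns 0 and sets *migrate, or -EOPNOTSUPP for a forbidden clear */
static int check_extents_transition(unsigned oldflags, unsigned *flags,
                                    int *migrate)
{
        *migrate = 0;
        if (oldflags & EXTENTS_FL) {
                if (!(*flags & EXTENTS_FL))
                        return -EOPNOTSUPP;    /* clearing not supported */
        } else if (*flags & EXTENTS_FL) {
                *migrate = 1;                  /* set requested: migrate */
                *flags &= ~EXTENTS_FL;         /* migration sets it itself */
        }
        return 0;
}

int main(void)
{
        unsigned flags = EXTENTS_FL;
        int migrate, err;

        err = check_extents_transition(0, &flags, &migrate);
        printf("err=%d migrate=%d flags=%#x\n", err, migrate, flags);
        return 0;
}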
@@ -109,6 +119,10 @@ flags_err:
109 119
110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 120 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
111 err = ext4_change_inode_journal_flag(inode, jflag); 121 err = ext4_change_inode_journal_flag(inode, jflag);
122 if (err)
123 goto flags_out;
124 if (migrate)
125 err = ext4_ext_migrate(inode);
112flags_out: 126flags_out:
113 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt); 128 mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
175 return ret; 189 return ret;
176 } 190 }
177#endif 191#endif
178 case EXT4_IOC_GETRSVSZ:
179 if (test_opt(inode->i_sb, RESERVATION)
180 && S_ISREG(inode->i_mode)
181 && ei->i_block_alloc_info) {
182 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
183 return put_user(rsv_window_size, (int __user *)arg);
184 }
185 return -ENOTTY;
186 case EXT4_IOC_SETRSVSZ: {
187 int err;
188
189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
190 return -ENOTTY;
191
192 if (!is_owner_or_cap(inode))
193 return -EACCES;
194
195 if (get_user(rsv_window_size, (int __user *)arg))
196 return -EFAULT;
197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
204
205 /*
206 * need to allocate reservation structure for this inode
207 * before set the window size
208 */
209 down_write(&ei->i_data_sem);
210 if (!ei->i_block_alloc_info)
211 ext4_init_block_alloc_info(inode);
212
213 if (ei->i_block_alloc_info){
214 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
215 rsv->rsv_goal_size = rsv_window_size;
216 }
217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
219 return 0;
220 }
221 case EXT4_IOC_GROUP_EXTEND: { 192 case EXT4_IOC_GROUP_EXTEND: {
222 ext4_fsblk_t n_blocks_count; 193 ext4_fsblk_t n_blocks_count;
223 struct super_block *sb = inode->i_sb; 194 struct super_block *sb = inode->i_sb;
224 int err; 195 int err, err2;
225 196
226 if (!capable(CAP_SYS_RESOURCE)) 197 if (!capable(CAP_SYS_RESOURCE))
227 return -EPERM; 198 return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
235 206
236 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 207 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
237 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
238 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
239 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 if (err == 0)
212 err = err2;
240 mnt_drop_write(filp->f_path.mnt); 213 mnt_drop_write(filp->f_path.mnt);
241 214
242 return err; 215 return err;
@@ -244,7 +217,7 @@ setversion_out:
244 case EXT4_IOC_GROUP_ADD: { 217 case EXT4_IOC_GROUP_ADD: {
245 struct ext4_new_group_data input; 218 struct ext4_new_group_data input;
246 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
247 int err; 220 int err, err2;
248 221
249 if (!capable(CAP_SYS_RESOURCE)) 222 if (!capable(CAP_SYS_RESOURCE))
250 return -EPERM; 223 return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
259 232
260 err = ext4_group_add(sb, &input); 233 err = ext4_group_add(sb, &input);
261 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 234 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
262 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 235 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
263 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 236 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
237 if (err == 0)
238 err = err2;
264 mnt_drop_write(filp->f_path.mnt); 239 mnt_drop_write(filp->f_path.mnt);
265 240
266 return err; 241 return err;
267 } 242 }
268 243
269 case EXT4_IOC_MIGRATE: 244 case EXT4_IOC_MIGRATE:
270 return ext4_ext_migrate(inode, filp, cmd, arg); 245 {
246 int err;
247 if (!is_owner_or_cap(inode))
248 return -EACCES;
249
250 err = mnt_want_write(filp->f_path.mnt);
251 if (err)
252 return err;
253 /*
254 * inode_mutex prevents write and truncate on the file.
255 * Reads still go through. We take i_data_sem in
256 * ext4_ext_swap_inode_data before we switch the
257 * inode format, to prevent reads.
258 */
259 mutex_lock(&(inode->i_mutex));
260 err = ext4_ext_migrate(inode);
261 mutex_unlock(&(inode->i_mutex));
262 mnt_drop_write(filp->f_path.mnt);
263 return err;
264 }
271 265
272 default: 266 default:
273 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 865e9ddb44d4..b580714f0d85 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 477 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 479 if (b1[i] != b2[i]) {
480 printk("corruption in group %lu at byte %u(%u):" 480 printk(KERN_ERR "corruption in group %lu "
481 " %x in copy != %x on disk/prealloc\n", 481 "at byte %u(%u): %x in copy != %x "
482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 482 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]);
483 BUG(); 484 BUG();
484 } 485 }
485 } 486 }
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
533 void *buddy; 534 void *buddy;
534 void *buddy2; 535 void *buddy2;
535 536
536 if (!test_opt(sb, MBALLOC))
537 return 0;
538
539 { 537 {
540 static int mb_check_counter; 538 static int mb_check_counter;
541 if (mb_check_counter++ % 100 != 0) 539 if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
784 if (bh[i] == NULL) 782 if (bh[i] == NULL)
785 goto out; 783 goto out;
786 784
787 if (bh_uptodate_or_lock(bh[i])) 785 if (buffer_uptodate(bh[i]) &&
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
788 continue; 787 continue;
789 788
789 lock_buffer(bh[i]);
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 792 ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
2169{ 2169{
2170 struct ext4_sb_info *sbi = EXT4_SB(sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(sb);
2171 2171
2172 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2172 if (sbi->s_proc != NULL) {
2173 remove_proc_entry("mb_history", sbi->s_mb_proc); 2173 remove_proc_entry("mb_groups", sbi->s_proc);
2174 2174 remove_proc_entry("mb_history", sbi->s_proc);
2175 }
2175 kfree(sbi->s_mb_history); 2176 kfree(sbi->s_mb_history);
2176} 2177}
2177 2178
@@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
2180 struct ext4_sb_info *sbi = EXT4_SB(sb); 2181 struct ext4_sb_info *sbi = EXT4_SB(sb);
2181 int i; 2182 int i;
2182 2183
2183 if (sbi->s_mb_proc != NULL) { 2184 if (sbi->s_proc != NULL) {
2184 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2185 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2185 &ext4_mb_seq_history_fops, sb); 2186 &ext4_mb_seq_history_fops, sb);
2186 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2187 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2187 &ext4_mb_seq_groups_fops, sb); 2188 &ext4_mb_seq_groups_fops, sb);
2188 } 2189 }
2189 2190
@@ -2485,19 +2486,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2485 unsigned max; 2486 unsigned max;
2486 int ret; 2487 int ret;
2487 2488
2488 if (!test_opt(sb, MBALLOC))
2489 return 0;
2490
2491 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2489 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2492 2490
2493 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2491 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2494 if (sbi->s_mb_offsets == NULL) { 2492 if (sbi->s_mb_offsets == NULL) {
2495 clear_opt(sbi->s_mount_opt, MBALLOC);
2496 return -ENOMEM; 2493 return -ENOMEM;
2497 } 2494 }
2498 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2499 if (sbi->s_mb_maxs == NULL) { 2496 if (sbi->s_mb_maxs == NULL) {
2500 clear_opt(sbi->s_mount_opt, MBALLOC);
2501 kfree(sbi->s_mb_maxs); 2497 kfree(sbi->s_mb_maxs);
2502 return -ENOMEM; 2498 return -ENOMEM;
2503 } 2499 }
@@ -2520,7 +2516,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2520 /* init file for buddy data */ 2516 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2518 if (ret != 0) {
2523 clear_opt(sbi->s_mount_opt, MBALLOC);
2524 kfree(sbi->s_mb_offsets); 2519 kfree(sbi->s_mb_offsets);
2525 kfree(sbi->s_mb_maxs); 2520 kfree(sbi->s_mb_maxs);
2526 return ret; 2521 return ret;
@@ -2540,17 +2535,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2540 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2535 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2541 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2536 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2542 2537
2543 i = sizeof(struct ext4_locality_group) * nr_cpu_ids; 2538 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2544 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2545 if (sbi->s_locality_groups == NULL) { 2539 if (sbi->s_locality_groups == NULL) {
2546 clear_opt(sbi->s_mount_opt, MBALLOC);
2547 kfree(sbi->s_mb_offsets); 2540 kfree(sbi->s_mb_offsets);
2548 kfree(sbi->s_mb_maxs); 2541 kfree(sbi->s_mb_maxs);
2549 return -ENOMEM; 2542 return -ENOMEM;
2550 } 2543 }
2551 for (i = 0; i < nr_cpu_ids; i++) { 2544 for_each_possible_cpu(i) {
2552 struct ext4_locality_group *lg; 2545 struct ext4_locality_group *lg;
2553 lg = &sbi->s_locality_groups[i]; 2546 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2554 mutex_init(&lg->lg_mutex); 2547 mutex_init(&lg->lg_mutex);
2555 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2548 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2556 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2549 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
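
The locality groups move from a kmalloc'ed array sized by nr_cpu_ids to a real per-cpu allocation: memory is only set up for possible CPUs and lands on the right NUMA node, and teardown becomes a single free_percpu(). A kernel-style sketch of the pattern (not a standalone program; struct lg is a placeholder for struct ext4_locality_group):

    #include <linux/percpu.h>
    #include <linux/cpumask.h>
    #include <linux/mutex.h>
    #include <linux/errno.h>

    struct lg { struct mutex lock; };
    static struct lg __percpu *groups;

    static int init_groups(void)
    {
            int cpu;

            groups = alloc_percpu(struct lg);  /* possible CPUs only */
            if (groups == NULL)
                    return -ENOMEM;
            for_each_possible_cpu(cpu)
                    mutex_init(&per_cpu_ptr(groups, cpu)->lock);
            return 0;
    }

    static void exit_groups(void)
    {
            free_percpu(groups);    /* one call frees every per-cpu copy */
    }
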
@@ -2560,7 +2553,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2560 ext4_mb_init_per_dev_proc(sb); 2553 ext4_mb_init_per_dev_proc(sb);
2561 ext4_mb_history_init(sb); 2554 ext4_mb_history_init(sb);
2562 2555
2563 printk("EXT4-fs: mballoc enabled\n"); 2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2564 return 0; 2557 return 0;
2565} 2558}
2566 2559
@@ -2589,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
2589 struct ext4_group_info *grinfo; 2582 struct ext4_group_info *grinfo;
2590 struct ext4_sb_info *sbi = EXT4_SB(sb); 2583 struct ext4_sb_info *sbi = EXT4_SB(sb);
2591 2584
2592 if (!test_opt(sb, MBALLOC))
2593 return 0;
2594
2595 /* release freed, non-committed blocks */ 2585 /* release freed, non-committed blocks */
2596 spin_lock(&sbi->s_md_lock); 2586 spin_lock(&sbi->s_md_lock);
2597 list_splice_init(&sbi->s_closed_transaction, 2587 list_splice_init(&sbi->s_closed_transaction,
@@ -2647,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb)
2647 atomic_read(&sbi->s_mb_discarded)); 2637 atomic_read(&sbi->s_mb_discarded));
2648 } 2638 }
2649 2639
2650 kfree(sbi->s_locality_groups); 2640 free_percpu(sbi->s_locality_groups);
2651
2652 ext4_mb_history_release(sb); 2641 ext4_mb_history_release(sb);
2653 ext4_mb_destroy_per_dev_proc(sb); 2642 ext4_mb_destroy_per_dev_proc(sb);
2654 2643
@@ -2721,118 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2721#define EXT4_MB_STREAM_REQ "stream_req" 2710#define EXT4_MB_STREAM_REQ "stream_req"
2722#define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2711#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2723 2712
2724
2725
2726#define MB_PROC_FOPS(name) \
2727static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2728{ \
2729 struct ext4_sb_info *sbi = m->private; \
2730 \
2731 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2732 return 0; \
2733} \
2734 \
2735static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2736{ \
2737 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2738} \
2739 \
2740static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2741 const char __user *buf, size_t cnt, loff_t *ppos) \
2742{ \
2743 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2744 char str[32]; \
2745 long value; \
2746 if (cnt >= sizeof(str)) \
2747 return -EINVAL; \
2748 if (copy_from_user(str, buf, cnt)) \
2749 return -EFAULT; \
2750 value = simple_strtol(str, NULL, 0); \
2751 if (value <= 0) \
2752 return -ERANGE; \
2753 sbi->s_mb_##name = value; \
2754 return cnt; \
2755} \
2756 \
2757static const struct file_operations ext4_mb_##name##_proc_fops = { \
2758 .owner = THIS_MODULE, \
2759 .open = ext4_mb_##name##_proc_open, \
2760 .read = seq_read, \
2761 .llseek = seq_lseek, \
2762 .release = single_release, \
2763 .write = ext4_mb_##name##_proc_write, \
2764};
2765
2766MB_PROC_FOPS(stats);
2767MB_PROC_FOPS(max_to_scan);
2768MB_PROC_FOPS(min_to_scan);
2769MB_PROC_FOPS(order2_reqs);
2770MB_PROC_FOPS(stream_request);
2771MB_PROC_FOPS(group_prealloc);
2772
2773#define MB_PROC_HANDLER(name, var) \
2774do { \
2775 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2776 &ext4_mb_##var##_proc_fops, sbi); \
2777 if (proc == NULL) { \
2778 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2779 goto err_out; \
2780 } \
2781} while (0)
2782
2783static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2713static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2784{ 2714{
2785 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2715 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2786 struct ext4_sb_info *sbi = EXT4_SB(sb); 2716 struct ext4_sb_info *sbi = EXT4_SB(sb);
2787 struct proc_dir_entry *proc; 2717 struct proc_dir_entry *proc;
2788 char devname[64];
2789 2718
2790 if (proc_root_ext4 == NULL) { 2719 if (sbi->s_proc == NULL)
2791 sbi->s_mb_proc = NULL;
2792 return -EINVAL; 2720 return -EINVAL;
2793 }
2794 bdevname(sb->s_bdev, devname);
2795 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2796
2797 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2798 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2799 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2800 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2801 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2802 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2803 2721
2722 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2723 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2724 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2725 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2726 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2727 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2804 return 0; 2728 return 0;
2805 2729
2806err_out: 2730err_out:
2807 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2731 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2808 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2732 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2809 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2733 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2810 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2734 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2811 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2735 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2812 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2736 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2813 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2814 remove_proc_entry(devname, proc_root_ext4);
2815 sbi->s_mb_proc = NULL;
2816
2817 return -ENOMEM; 2737 return -ENOMEM;
2818} 2738}
2819 2739
2820static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2740static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2821{ 2741{
2822 struct ext4_sb_info *sbi = EXT4_SB(sb); 2742 struct ext4_sb_info *sbi = EXT4_SB(sb);
2823 char devname[64];
2824 2743
2825 if (sbi->s_mb_proc == NULL) 2744 if (sbi->s_proc == NULL)
2826 return -EINVAL; 2745 return -EINVAL;
2827 2746
2828 bdevname(sb->s_bdev, devname); 2747 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2829 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2748 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2830 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2749 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2831 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2750 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2832 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2751 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2833 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2752 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2834 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2835 remove_proc_entry(devname, proc_root_ext4);
2836 2753
2837 return 0; 2754 return 0;
2838} 2755}
@@ -2854,11 +2771,6 @@ int __init init_ext4_mballoc(void)
2854 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2855 return -ENOMEM; 2772 return -ENOMEM;
2856 } 2773 }
2857#ifdef CONFIG_PROC_FS
2858 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
2859 if (proc_root_ext4 == NULL)
2860 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
2861#endif
2862 return 0; 2774 return 0;
2863} 2775}
2864 2776
@@ -2867,9 +2779,6 @@ void exit_ext4_mballoc(void)
2867 /* XXX: synchronize_rcu(); */ 2779 /* XXX: synchronize_rcu(); */
2868 kmem_cache_destroy(ext4_pspace_cachep); 2780 kmem_cache_destroy(ext4_pspace_cachep);
2869 kmem_cache_destroy(ext4_ac_cachep); 2781 kmem_cache_destroy(ext4_ac_cachep);
2870#ifdef CONFIG_PROC_FS
2871 remove_proc_entry("fs/ext4", NULL);
2872#endif
2873} 2782}
2874 2783
2875 2784
@@ -2879,7 +2788,7 @@ void exit_ext4_mballoc(void)
2879 */ 2788 */
2880static noinline_for_stack int 2789static noinline_for_stack int
2881ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2790ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2882 handle_t *handle) 2791 handle_t *handle, unsigned long reserv_blks)
2883{ 2792{
2884 struct buffer_head *bitmap_bh = NULL; 2793 struct buffer_head *bitmap_bh = NULL;
2885 struct ext4_super_block *es; 2794 struct ext4_super_block *es;
@@ -2968,15 +2877,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2968 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2877 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2969 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2878 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2970 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2879 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2971 2880 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2972 /* 2881 /*
2973 * free blocks account has already be reduced/reserved 2882 * Now reduce the dirty block count also. Should not go negative
2974 * at write_begin() time for delayed allocation
2975 * do not double accounting
2976 */ 2883 */
2977 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2884 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2978 percpu_counter_sub(&sbi->s_freeblocks_counter, 2885 /* release all the reserved blocks if non delalloc */
2979 ac->ac_b_ex.fe_len); 2886 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2887 else
2888 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2889 ac->ac_b_ex.fe_len);
2980 2890
2981 if (sbi->s_log_groups_per_flex) { 2891 if (sbi->s_log_groups_per_flex) {
2982 ext4_group_t flex_group = ext4_flex_group(sbi, 2892 ext4_group_t flex_group = ext4_flex_group(sbi,
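
The accounting in this hunk separates the two counters: the free-blocks counter is now always debited by the blocks actually allocated, while the dirty-blocks counter gives back the reservation — per block for delalloc (reserved earlier at write_begin() time), or the whole reserv_blks claim otherwise. A toy model of the bookkeeping (plain longs standing in for the percpu counters; all values assumed):

    #include <stdbool.h>
    #include <stdio.h>

    static long free_blocks  = 1000;
    static long dirty_blocks;        /* claimed but not yet allocated */

    /* toy ext4_claim_free_blocks(): move len into the dirty pool */
    static int claim(long len)
    {
            if (free_blocks - dirty_blocks < len)
                    return -1;
            dirty_blocks += len;
            return 0;
    }

    /* toy version of the mark_diskspace_used() accounting above */
    static void mark_used(long allocated, long reserved, bool delalloc)
    {
            free_blocks -= allocated;
            /* release the reservation: per block for delalloc,
             * in full for a normal allocation */
            dirty_blocks -= delalloc ? allocated : reserved;
    }

    int main(void)
    {
            if (claim(10) == 0)
                    mark_used(8, 10, false);   /* got fewer than claimed */
            printf("free=%ld dirty=%ld\n", free_blocks, dirty_blocks);
            return 0;
    }
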
@@ -3282,6 +3192,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3282} 3192}
3283 3193
3284/* 3194/*
 3195 * Return the prealloc space that has the minimal
 3196 * distance from the goal block. @cpa is the prealloc
 3197 * space with the currently known minimal distance
 3198 * from the goal block.
3199 */
3200static struct ext4_prealloc_space *
3201ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3202 struct ext4_prealloc_space *pa,
3203 struct ext4_prealloc_space *cpa)
3204{
3205 ext4_fsblk_t cur_distance, new_distance;
3206
3207 if (cpa == NULL) {
3208 atomic_inc(&pa->pa_count);
3209 return pa;
3210 }
3211 cur_distance = abs(goal_block - cpa->pa_pstart);
3212 new_distance = abs(goal_block - pa->pa_pstart);
3213
3214 if (cur_distance < new_distance)
3215 return cpa;
3216
3217 /* drop the previous reference */
3218 atomic_dec(&cpa->pa_count);
3219 atomic_inc(&pa->pa_count);
3220 return pa;
3221}
3222
3223/*
3285 * search goal blocks in preallocated space 3224 * search goal blocks in preallocated space
3286 */ 3225 */
3287static noinline_for_stack int 3226static noinline_for_stack int
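
ext4_mb_check_group_pa() keeps exactly one candidate pinned while scanning: the first candidate is taken outright, later ones replace it only if strictly closer to the goal, and the reference counts are handed off so the loser is always released. A compilable toy of the distance rule alone (refcounting omitted):

    #include <stdio.h>
    #include <stdlib.h>

    /* pick the start block closest to the goal, as the helper above
     * does with pa_pstart; llabs() mirrors the abs() of the distances */
    static long long pick(long long goal, const long long *starts, int n)
    {
            long long best = starts[0];
            int i;

            for (i = 1; i < n; i++)
                    if (llabs(goal - starts[i]) < llabs(goal - best))
                            best = starts[i];
            return best;
    }

    int main(void)
    {
            long long starts[] = { 400, 1100, 5000 };

            /* distances from goal 1000 are 600, 100, 4000 -> 1100 wins */
            printf("%lld\n", pick(1000, starts, 3));
            return 0;
    }
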
@@ -3290,7 +3229,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3290 int order, i; 3229 int order, i;
3291 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3230 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3292 struct ext4_locality_group *lg; 3231 struct ext4_locality_group *lg;
3293 struct ext4_prealloc_space *pa; 3232 struct ext4_prealloc_space *pa, *cpa = NULL;
3233 ext4_fsblk_t goal_block;
3294 3234
3295 /* only data can be preallocated */ 3235 /* only data can be preallocated */
3296 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3236 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3333,6 +3273,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3333 /* The max size of hash table is PREALLOC_TB_SIZE */ 3273 /* The max size of hash table is PREALLOC_TB_SIZE */
3334 order = PREALLOC_TB_SIZE - 1; 3274 order = PREALLOC_TB_SIZE - 1;
3335 3275
3276 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
3277 ac->ac_g_ex.fe_start +
3278 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3279 /*
 3280 * search for the prealloc space that has the
 3281 * minimal distance from the goal block.
3282 */
3336 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3283 for (i = order; i < PREALLOC_TB_SIZE; i++) {
3337 rcu_read_lock(); 3284 rcu_read_lock();
3338 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3285 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
@@ -3340,17 +3287,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3340 spin_lock(&pa->pa_lock); 3287 spin_lock(&pa->pa_lock);
3341 if (pa->pa_deleted == 0 && 3288 if (pa->pa_deleted == 0 &&
3342 pa->pa_free >= ac->ac_o_ex.fe_len) { 3289 pa->pa_free >= ac->ac_o_ex.fe_len) {
3343 atomic_inc(&pa->pa_count); 3290
3344 ext4_mb_use_group_pa(ac, pa); 3291 cpa = ext4_mb_check_group_pa(goal_block,
3345 spin_unlock(&pa->pa_lock); 3292 pa, cpa);
3346 ac->ac_criteria = 20;
3347 rcu_read_unlock();
3348 return 1;
3349 } 3293 }
3350 spin_unlock(&pa->pa_lock); 3294 spin_unlock(&pa->pa_lock);
3351 } 3295 }
3352 rcu_read_unlock(); 3296 rcu_read_unlock();
3353 } 3297 }
3298 if (cpa) {
3299 ext4_mb_use_group_pa(ac, cpa);
3300 ac->ac_criteria = 20;
3301 return 1;
3302 }
3354 return 0; 3303 return 0;
3355} 3304}
3356 3305
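
The goal_block computed above converts the group-relative hint into an absolute block number: group times blocks-per-group, plus the offset within the group, plus the filesystem's first data block. A worked example with assumed values (32768 blocks per group, first data block 1 as on a 1k-blocksize filesystem):

    #include <stdio.h>

    int main(void)
    {
            /* group 5, offset 100 within the group */
            unsigned long long goal = 5ULL * 32768 + 100 + 1;

            printf("%llu\n", goal);    /* 163941 */
            return 0;
    }
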
@@ -3845,7 +3794,7 @@ out:
3845 * 3794 *
3846 * FIXME!! Make sure it is valid at all the call sites 3795 * FIXME!! Make sure it is valid at all the call sites
3847 */ 3796 */
3848void ext4_mb_discard_inode_preallocations(struct inode *inode) 3797void ext4_discard_preallocations(struct inode *inode)
3849{ 3798{
3850 struct ext4_inode_info *ei = EXT4_I(inode); 3799 struct ext4_inode_info *ei = EXT4_I(inode);
3851 struct super_block *sb = inode->i_sb; 3800 struct super_block *sb = inode->i_sb;
@@ -3857,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
3857 struct ext4_buddy e4b; 3806 struct ext4_buddy e4b;
3858 int err; 3807 int err;
3859 3808
3860 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3809 if (!S_ISREG(inode->i_mode)) {
3861 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3810 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3862 return; 3811 return;
3863 } 3812 }
@@ -4055,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4055 * per cpu locality group is to reduce the contention between block 4004 * per cpu locality group is to reduce the contention between block
4056 * request from multiple CPUs. 4005 * request from multiple CPUs.
4057 */ 4006 */
4058 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4007 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
4059 put_cpu();
4060 4008
4061 /* we're going to use group allocation */ 4009 /* we're going to use group allocation */
4062 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4010 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4330,33 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4330ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4278ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4331 struct ext4_allocation_request *ar, int *errp) 4279 struct ext4_allocation_request *ar, int *errp)
4332{ 4280{
4281 int freed;
4333 struct ext4_allocation_context *ac = NULL; 4282 struct ext4_allocation_context *ac = NULL;
4334 struct ext4_sb_info *sbi; 4283 struct ext4_sb_info *sbi;
4335 struct super_block *sb; 4284 struct super_block *sb;
4336 ext4_fsblk_t block = 0; 4285 ext4_fsblk_t block = 0;
4337 int freed; 4286 unsigned long inquota;
4338 int inquota; 4287 unsigned long reserv_blks = 0;
4339 4288
4340 sb = ar->inode->i_sb; 4289 sb = ar->inode->i_sb;
4341 sbi = EXT4_SB(sb); 4290 sbi = EXT4_SB(sb);
4342 4291
4343 if (!test_opt(sb, MBALLOC)) {
4344 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4345 &(ar->len), errp);
4346 return block;
4347 }
4348 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4292 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4349 /* 4293 /*
4350 * With delalloc we already reserved the blocks 4294 * With delalloc we already reserved the blocks
4351 */ 4295 */
4352 ar->len = ext4_has_free_blocks(sbi, ar->len); 4296 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4353 } 4297 /* let others free the space */
4354 4298 yield();
4355 if (ar->len == 0) { 4299 ar->len = ar->len >> 1;
4356 *errp = -ENOSPC; 4300 }
4357 return 0; 4301 if (!ar->len) {
4302 *errp = -ENOSPC;
4303 return 0;
4304 }
4305 reserv_blks = ar->len;
4358 } 4306 }
4359
4360 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4307 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4361 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4308 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4362 ar->len--; 4309 ar->len--;
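
Instead of failing outright when the claim does not fit, ext4_mb_new_blocks() now halves the request and retries after yielding, trading allocation size for forward progress under pressure. A toy model (claim_free_blocks() is a hypothetical stand-in that admits at most 4 blocks):

    #include <sched.h>
    #include <stdio.h>

    static int claim_free_blocks(unsigned long len)
    {
            return len > 4;    /* nonzero means the claim failed */
    }

    int main(void)
    {
            unsigned long len = 32;

            /* halve until the claim fits or nothing is left, yielding
             * so other tasks get a chance to free space in between */
            while (len && claim_free_blocks(len)) {
                    sched_yield();
                    len >>= 1;
            }
            if (!len)
                    printf("ENOSPC\n");
            else
                    printf("claimed %lu blocks\n", len);   /* 4 */
            return 0;
    }
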
@@ -4402,7 +4349,7 @@ repeat:
4402 } 4349 }
4403 4350
4404 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4351 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4405 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4352 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4406 if (*errp == -EAGAIN) { 4353 if (*errp == -EAGAIN) {
4407 ac->ac_b_ex.fe_group = 0; 4354 ac->ac_b_ex.fe_group = 0;
4408 ac->ac_b_ex.fe_start = 0; 4355 ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a75..b3b4828f8b89 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
257 257
258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
259 259
260static struct proc_dir_entry *proc_root_ext4;
261struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 260struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
262 261
263static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 262static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e9..f2a9cf498ecd 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
53 * credit. But below we try not to accumulate too much 53 * credit. But below we try not to accumulate too much
54 * of them by restarting the journal. 54 * of them by restarting the journal.
55 */ 55 */
56 needed = ext4_ext_calc_credits_for_insert(inode, path); 56 needed = ext4_ext_calc_credits_for_single_extent(inode,
57 lb->last_block - lb->first_block + 1, path);
57 58
58 /* 59 /*
59 * Make sure the credit we accumulated is not really high 60 * Make sure the credit we accumulated is not really high
@@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
446 447
447} 448}
448 449
449int ext4_ext_migrate(struct inode *inode, struct file *filp, 450int ext4_ext_migrate(struct inode *inode)
450 unsigned int cmd, unsigned long arg)
451{ 451{
452 handle_t *handle; 452 handle_t *handle;
453 int retval = 0, i; 453 int retval = 0, i;
@@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
515 * when we add extents we extend the journal 515 * when we add extents we extend the journal
516 */ 516 */
517 /* 517 /*
518 * inode_mutex prevent write and truncate on the file. Read still goes
519 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
520 * switch the inode format to prevent read.
521 */
522 mutex_lock(&(inode->i_mutex));
523 /*
524 * Even though we take i_mutex we can still cause block allocation 518 * Even though we take i_mutex we can still cause block allocation
525 * via mmap write to holes. If we have allocated new blocks we fail 519 * via mmap write to holes. If we have allocated new blocks we fail
526 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 520 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -622,7 +616,6 @@ err_out:
622 tmp_inode->i_nlink = 0; 616 tmp_inode->i_nlink = 0;
623 617
624 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
625 mutex_unlock(&(inode->i_mutex));
626 619
627 if (tmp_inode) 620 if (tmp_inode)
628 iput(tmp_inode); 621 iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c3..92db9e945147 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
151 151
152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
154static inline unsigned dx_get_hash (struct dx_entry *entry); 154static inline unsigned dx_get_hash(struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value); 155static void dx_set_hash(struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries); 156static unsigned dx_get_count(struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries); 157static unsigned dx_get_limit(struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value); 158static void dx_set_count(struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value); 159static void dx_set_limit(struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize); 160static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir); 161static unsigned dx_node_limit(struct inode *dir);
162static struct dx_frame *dx_probe(struct dentry *dentry, 162static struct dx_frame *dx_probe(const struct qstr *d_name,
163 struct inode *dir, 163 struct inode *dir,
164 struct dx_hash_info *hinfo, 164 struct dx_hash_info *hinfo,
165 struct dx_frame *frame, 165 struct dx_frame *frame,
166 int *err); 166 int *err);
167static void dx_release (struct dx_frame *frames); 167static void dx_release(struct dx_frame *frames);
168static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 168static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
169 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 169 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count); 170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 171static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
172 struct dx_map_entry *offsets, int count); 172 struct dx_map_entry *offsets, int count);
173static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 173static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
174static void dx_insert_block(struct dx_frame *frame, 174static void dx_insert_block(struct dx_frame *frame,
175 u32 hash, ext4_lblk_t block); 175 u32 hash, ext4_lblk_t block);
176static int ext4_htree_next_block(struct inode *dir, __u32 hash, 176static int ext4_htree_next_block(struct inode *dir, __u32 hash,
177 struct dx_frame *frame, 177 struct dx_frame *frame,
178 struct dx_frame *frames, 178 struct dx_frame *frames,
179 __u32 *start_hash); 179 __u32 *start_hash);
180static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 180static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
181 struct ext4_dir_entry_2 **res_dir, int *err); 181 const struct qstr *d_name,
182 struct ext4_dir_entry_2 **res_dir,
183 int *err);
182static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 184static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 185 struct inode *inode);
184 186
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
207 entry->block = cpu_to_le32(value); 209 entry->block = cpu_to_le32(value);
208} 210}
209 211
210static inline unsigned dx_get_hash (struct dx_entry *entry) 212static inline unsigned dx_get_hash(struct dx_entry *entry)
211{ 213{
212 return le32_to_cpu(entry->hash); 214 return le32_to_cpu(entry->hash);
213} 215}
214 216
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value) 217static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
216{ 218{
217 entry->hash = cpu_to_le32(value); 219 entry->hash = cpu_to_le32(value);
218} 220}
219 221
220static inline unsigned dx_get_count (struct dx_entry *entries) 222static inline unsigned dx_get_count(struct dx_entry *entries)
221{ 223{
222 return le16_to_cpu(((struct dx_countlimit *) entries)->count); 224 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223} 225}
224 226
225static inline unsigned dx_get_limit (struct dx_entry *entries) 227static inline unsigned dx_get_limit(struct dx_entry *entries)
226{ 228{
227 return le16_to_cpu(((struct dx_countlimit *) entries)->limit); 229 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228} 230}
229 231
230static inline void dx_set_count (struct dx_entry *entries, unsigned value) 232static inline void dx_set_count(struct dx_entry *entries, unsigned value)
231{ 233{
232 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); 234 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233} 235}
234 236
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value) 237static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
236{ 238{
237 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); 239 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238} 240}
239 241
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 242static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
241{ 243{
242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 244 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
243 EXT4_DIR_REC_LEN(2) - infosize; 245 EXT4_DIR_REC_LEN(2) - infosize;
244 return entry_space / sizeof(struct dx_entry); 246 return entry_space / sizeof(struct dx_entry);
245} 247}
246 248
247static inline unsigned dx_node_limit (struct inode *dir) 249static inline unsigned dx_node_limit(struct inode *dir)
248{ 250{
249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 251 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
250 return entry_space / sizeof(struct dx_entry); 252 return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
254 * Debug 256 * Debug
255 */ 257 */
256#ifdef DX_DEBUG 258#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries) 259static void dx_show_index(char * label, struct dx_entry *entries)
258{ 260{
259 int i, n = dx_get_count (entries); 261 int i, n = dx_get_count (entries);
260 printk("%s index ", label); 262 printk(KERN_DEBUG "%s index ", label);
261 for (i = 0; i < n; i++) { 263 for (i = 0; i < n; i++) {
262 printk("%x->%lu ", i? dx_get_hash(entries + i) : 264 printk("%x->%lu ", i ? dx_get_hash(entries + i) :
263 0, (unsigned long)dx_get_block(entries + i)); 265 0, (unsigned long)dx_get_block(entries + i));
264 } 266 }
265 printk("\n"); 267 printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306 struct dx_entry *entries, int levels) 308 struct dx_entry *entries, int levels)
307{ 309{
308 unsigned blocksize = dir->i_sb->s_blocksize; 310 unsigned blocksize = dir->i_sb->s_blocksize;
309 unsigned count = dx_get_count (entries), names = 0, space = 0, i; 311 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
310 unsigned bcount = 0; 312 unsigned bcount = 0;
311 struct buffer_head *bh; 313 struct buffer_head *bh;
312 int err; 314 int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
325 names += stats.names; 327 names += stats.names;
326 space += stats.space; 328 space += stats.space;
327 bcount += stats.bcount; 329 bcount += stats.bcount;
328 brelse (bh); 330 brelse(bh);
329 } 331 }
330 if (bcount) 332 if (bcount)
331 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", 333 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
332 names, space/bcount,(space/bcount)*100/blocksize); 334 levels ? "" : " ", names, space/bcount,
335 (space/bcount)*100/blocksize);
333 return (struct stats) { names, space, bcount}; 336 return (struct stats) { names, space, bcount};
334} 337}
335#endif /* DX_DEBUG */ 338#endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
344 * back to userspace. 347 * back to userspace.
345 */ 348 */
346static struct dx_frame * 349static struct dx_frame *
347dx_probe(struct dentry *dentry, struct inode *dir, 350dx_probe(const struct qstr *d_name, struct inode *dir,
348 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 351 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
349{ 352{
350 unsigned count, indirect; 353 unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
355 u32 hash; 358 u32 hash;
356 359
357 frame->bh = NULL; 360 frame->bh = NULL;
358 if (dentry)
359 dir = dentry->d_parent->d_inode;
360 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 361 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
361 goto fail; 362 goto fail;
362 root = (struct dx_root *) bh->b_data; 363 root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
372 } 373 }
373 hinfo->hash_version = root->info.hash_version; 374 hinfo->hash_version = root->info.hash_version;
374 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
375 if (dentry) 376 if (d_name)
376 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); 377 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
377 hash = hinfo->hash; 378 hash = hinfo->hash;
378 379
379 if (root->info.unused_flags & 1) { 380 if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
406 goto fail; 407 goto fail;
407 } 408 }
408 409
409 dxtrace (printk("Look up %x", hash)); 410 dxtrace(printk("Look up %x", hash));
410 while (1) 411 while (1)
411 { 412 {
412 count = dx_get_count(entries); 413 count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
555 0, &err))) 556 0, &err)))
556 return err; /* Failure */ 557 return err; /* Failure */
557 p++; 558 p++;
558 brelse (p->bh); 559 brelse(p->bh);
559 p->bh = bh; 560 p->bh = bh;
560 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; 561 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
561 } 562 }
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
593 /* On error, skip the f_pos to the next block. */ 594 /* On error, skip the f_pos to the next block. */
594 dir_file->f_pos = (dir_file->f_pos | 595 dir_file->f_pos = (dir_file->f_pos |
595 (dir->i_sb->s_blocksize - 1)) + 1; 596 (dir->i_sb->s_blocksize - 1)) + 1;
596 brelse (bh); 597 brelse(bh);
597 return count; 598 return count;
598 } 599 }
599 ext4fs_dirhash(de->name, de->name_len, hinfo); 600 ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
635 int ret, err; 636 int ret, err;
636 __u32 hashval; 637 __u32 hashval;
637 638
638 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 639 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
639 start_minor_hash)); 640 start_hash, start_minor_hash));
640 dir = dir_file->f_path.dentry->d_inode; 641 dir = dir_file->f_path.dentry->d_inode;
641 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
642 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
648 } 649 }
649 hinfo.hash = start_hash; 650 hinfo.hash = start_hash;
650 hinfo.minor_hash = 0; 651 hinfo.minor_hash = 0;
651 frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); 652 frame = dx_probe(NULL, dir, &hinfo, frames, &err);
652 if (!frame) 653 if (!frame)
653 return err; 654 return err;
654 655
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
694 break; 695 break;
695 } 696 }
696 dx_release(frames); 697 dx_release(frames);
697 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 698 dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
698 count, *next_hash)); 699 "next hash: %x\n", count, *next_hash));
699 return count; 700 return count;
700errout: 701errout:
701 dx_release(frames); 702 dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
802/* 803/*
803 * Returns 0 if not found, -1 on failure, and 1 on success 804 * Returns 0 if not found, -1 on failure, and 1 on success
804 */ 805 */
805static inline int search_dirblock(struct buffer_head * bh, 806static inline int search_dirblock(struct buffer_head *bh,
806 struct inode *dir, 807 struct inode *dir,
807 struct dentry *dentry, 808 const struct qstr *d_name,
808 unsigned long offset, 809 unsigned long offset,
809 struct ext4_dir_entry_2 ** res_dir) 810 struct ext4_dir_entry_2 ** res_dir)
810{ 811{
811 struct ext4_dir_entry_2 * de; 812 struct ext4_dir_entry_2 * de;
812 char * dlimit; 813 char * dlimit;
813 int de_len; 814 int de_len;
814 const char *name = dentry->d_name.name; 815 const char *name = d_name->name;
815 int namelen = dentry->d_name.len; 816 int namelen = d_name->len;
816 817
817 de = (struct ext4_dir_entry_2 *) bh->b_data; 818 de = (struct ext4_dir_entry_2 *) bh->b_data;
818 dlimit = bh->b_data + dir->i_sb->s_blocksize; 819 dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
851 * The returned buffer_head has ->b_count elevated. The caller is expected 852 * The returned buffer_head has ->b_count elevated. The caller is expected
852 * to brelse() it when appropriate. 853 * to brelse() it when appropriate.
853 */ 854 */
854static struct buffer_head * ext4_find_entry (struct dentry *dentry, 855static struct buffer_head * ext4_find_entry (struct inode *dir,
856 const struct qstr *d_name,
855 struct ext4_dir_entry_2 ** res_dir) 857 struct ext4_dir_entry_2 ** res_dir)
856{ 858{
857 struct super_block * sb; 859 struct super_block *sb;
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 860 struct buffer_head *bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 861 struct buffer_head *bh, *ret = NULL;
860 ext4_lblk_t start, block, b; 862 ext4_lblk_t start, block, b;
861 int ra_max = 0; /* Number of bh's in the readahead 863 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 864 buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
865 int num = 0; 867 int num = 0;
866 ext4_lblk_t nblocks; 868 ext4_lblk_t nblocks;
867 int i, err; 869 int i, err;
868 struct inode *dir = dentry->d_parent->d_inode;
869 int namelen; 870 int namelen;
870 871
871 *res_dir = NULL; 872 *res_dir = NULL;
872 sb = dir->i_sb; 873 sb = dir->i_sb;
873 namelen = dentry->d_name.len; 874 namelen = d_name->len;
874 if (namelen > EXT4_NAME_LEN) 875 if (namelen > EXT4_NAME_LEN)
875 return NULL; 876 return NULL;
876 if (is_dx(dir)) { 877 if (is_dx(dir)) {
877 bh = ext4_dx_find_entry(dentry, res_dir, &err); 878 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
878 /* 879 /*
879 * On success, or if the error was file not found, 880 * On success, or if the error was file not found,
880 * return. Otherwise, fall back to doing a search the 881 * return. Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
882 */ 883 */
883 if (bh || (err != ERR_BAD_DX_DIR)) 884 if (bh || (err != ERR_BAD_DX_DIR))
884 return bh; 885 return bh;
885 dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); 886 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
887 "falling back\n"));
886 } 888 }
887 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 889 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
888 start = EXT4_I(dir)->i_dir_start_lookup; 890 start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
926 brelse(bh); 928 brelse(bh);
927 goto next; 929 goto next;
928 } 930 }
929 i = search_dirblock(bh, dir, dentry, 931 i = search_dirblock(bh, dir, d_name,
930 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 932 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
931 if (i == 1) { 933 if (i == 1) {
932 EXT4_I(dir)->i_dir_start_lookup = block; 934 EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
956cleanup_and_exit: 958cleanup_and_exit:
957 /* Clean up the read-ahead blocks */ 959 /* Clean up the read-ahead blocks */
958 for (; ra_ptr < ra_max; ra_ptr++) 960 for (; ra_ptr < ra_max; ra_ptr++)
959 brelse (bh_use[ra_ptr]); 961 brelse(bh_use[ra_ptr]);
960 return ret; 962 return ret;
961} 963}
962 964
963static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 965static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
964 struct ext4_dir_entry_2 **res_dir, int *err) 966 struct ext4_dir_entry_2 **res_dir, int *err)
965{ 967{
966 struct super_block * sb; 968 struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
971 struct buffer_head *bh; 973 struct buffer_head *bh;
972 ext4_lblk_t block; 974 ext4_lblk_t block;
973 int retval; 975 int retval;
974 int namelen = dentry->d_name.len; 976 int namelen = d_name->len;
975 const u8 *name = dentry->d_name.name; 977 const u8 *name = d_name->name;
976 struct inode *dir = dentry->d_parent->d_inode;
977 978
978 sb = dir->i_sb; 979 sb = dir->i_sb;
979 /* NFS may look up ".." - look at dx_root directory block */ 980 /* NFS may look up ".." - look at dx_root directory block */
980 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ 981 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
981 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
982 return NULL; 983 return NULL;
983 } else { 984 } else {
984 frame = frames; 985 frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1010 return bh; 1011 return bh;
1011 } 1012 }
1012 } 1013 }
1013 brelse (bh); 1014 brelse(bh);
1014 /* Check to see if we should continue to search */ 1015 /* Check to see if we should continue to search */
1015 retval = ext4_htree_next_block(dir, hash, frame, 1016 retval = ext4_htree_next_block(dir, hash, frame,
1016 frames, NULL); 1017 frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1025 1026
1026 *err = -ENOENT; 1027 *err = -ENOENT;
1027errout: 1028errout:
1028 dxtrace(printk("%s not found\n", name)); 1029 dxtrace(printk(KERN_DEBUG "%s not found\n", name));
1029 dx_release (frames); 1030 dx_release (frames);
1030 return NULL; 1031 return NULL;
1031} 1032}
1032 1033
1033static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) 1034static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034{ 1035{
1035 struct inode * inode; 1036 struct inode *inode;
1036 struct ext4_dir_entry_2 * de; 1037 struct ext4_dir_entry_2 *de;
1037 struct buffer_head * bh; 1038 struct buffer_head *bh;
1038 1039
1039 if (dentry->d_name.len > EXT4_NAME_LEN) 1040 if (dentry->d_name.len > EXT4_NAME_LEN)
1040 return ERR_PTR(-ENAMETOOLONG); 1041 return ERR_PTR(-ENAMETOOLONG);
1041 1042
1042 bh = ext4_find_entry(dentry, &de); 1043 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1043 inode = NULL; 1044 inode = NULL;
1044 if (bh) { 1045 if (bh) {
1045 unsigned long ino = le32_to_cpu(de->inode); 1046 unsigned long ino = le32_to_cpu(de->inode);
1046 brelse (bh); 1047 brelse(bh);
1047 if (!ext4_valid_inum(dir->i_sb, ino)) { 1048 if (!ext4_valid_inum(dir->i_sb, ino)) {
1048 ext4_error(dir->i_sb, "ext4_lookup", 1049 ext4_error(dir->i_sb, "ext4_lookup",
1049 "bad inode number: %lu", ino); 1050 "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1062 unsigned long ino; 1063 unsigned long ino;
1063 struct dentry *parent; 1064 struct dentry *parent;
1064 struct inode *inode; 1065 struct inode *inode;
1065 struct dentry dotdot; 1066 static const struct qstr dotdot = {
1067 .name = "..",
1068 .len = 2,
1069 };
1066 struct ext4_dir_entry_2 * de; 1070 struct ext4_dir_entry_2 * de;
1067 struct buffer_head *bh; 1071 struct buffer_head *bh;
1068 1072
1069 dotdot.d_name.name = ".."; 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1070 dotdot.d_name.len = 2;
1071 dotdot.d_parent = child; /* confusing, isn't it! */
1072
1073 bh = ext4_find_entry(&dotdot, &de);
1074 inode = NULL; 1074 inode = NULL;
1075 if (!bh) 1075 if (!bh)
1076 return ERR_PTR(-ENOENT); 1076 return ERR_PTR(-ENOENT);
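
With ext4_find_entry() taking the directory inode and a qstr explicitly, ext4_get_parent() no longer has to fabricate a dentry whose only job was to carry the name ".." and a d_parent back-pointer. A small standalone illustration of the qstr-based call shape (local struct qstr model; the kernel's also carries a hash field):

    #include <stdio.h>

    struct qstr {
            const char   *name;
            unsigned int  len;
    };

    /* stand-in for ext4_find_entry(dir, d_name, ...): inode and name
     * are independent arguments, so ".." needs no fake dentry */
    static void find_entry(int dir_ino, const struct qstr *d_name)
    {
            printf("lookup '%.*s' in inode %d\n",
                   (int)d_name->len, d_name->name, dir_ino);
    }

    int main(void)
    {
            static const struct qstr dotdot = { .name = "..", .len = 2 };

            find_entry(2, &dotdot);
            return 0;
    }
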
@@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1201 1201
1202 /* create map in the end of data2 block */ 1202 /* create map in the end of data2 block */
1203 map = (struct dx_map_entry *) (data2 + blocksize); 1203 map = (struct dx_map_entry *) (data2 + blocksize);
1204 count = dx_make_map ((struct ext4_dir_entry_2 *) data1, 1204 count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1205 blocksize, hinfo, map); 1205 blocksize, hinfo, map);
1206 map -= count; 1206 map -= count;
1207 dx_sort_map (map, count); 1207 dx_sort_map(map, count);
1208 /* Split the existing block in the middle, size-wise */ 1208 /* Split the existing block in the middle, size-wise */
1209 size = 0; 1209 size = 0;
1210 move = 0; 1210 move = 0;
@@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1225 1225
1226 /* Fancy dance to stay within two buffers */ 1226 /* Fancy dance to stay within two buffers */
1227 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1227 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1228 de = dx_pack_dirents(data1,blocksize); 1228 de = dx_pack_dirents(data1, blocksize);
1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1237 swap(*bh, bh2); 1237 swap(*bh, bh2);
1238 de = de2; 1238 de = de2;
1239 } 1239 }
1240 dx_insert_block (frame, hash2 + continued, newblock); 1240 dx_insert_block(frame, hash2 + continued, newblock);
1241 err = ext4_journal_dirty_metadata (handle, bh2); 1241 err = ext4_journal_dirty_metadata(handle, bh2);
1242 if (err) 1242 if (err)
1243 goto journal_error; 1243 goto journal_error;
1244 err = ext4_journal_dirty_metadata (handle, frame->bh); 1244 err = ext4_journal_dirty_metadata(handle, frame->bh);
1245 if (err) 1245 if (err)
1246 goto journal_error; 1246 goto journal_error;
1247 brelse (bh2); 1247 brelse(bh2);
1248 dxtrace(dx_show_index ("frame", frame->entries)); 1248 dxtrace(dx_show_index("frame", frame->entries));
1249 return de; 1249 return de;
1250 1250
1251journal_error: 1251journal_error:
@@ -1271,7 +1271,7 @@ errout:
1271 */ 1271 */
1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1273 struct inode *inode, struct ext4_dir_entry_2 *de, 1273 struct inode *inode, struct ext4_dir_entry_2 *de,
1274 struct buffer_head * bh) 1274 struct buffer_head *bh)
1275{ 1275{
1276 struct inode *dir = dentry->d_parent->d_inode; 1276 struct inode *dir = dentry->d_parent->d_inode;
1277 const char *name = dentry->d_name.name; 1277 const char *name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1288 while ((char *) de <= top) { 1288 while ((char *) de <= top) {
1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1290 bh, offset)) { 1290 bh, offset)) {
1291 brelse (bh); 1291 brelse(bh);
1292 return -EIO; 1292 return -EIO;
1293 } 1293 }
1294 if (ext4_match (namelen, name, de)) { 1294 if (ext4_match(namelen, name, de)) {
1295 brelse (bh); 1295 brelse(bh);
1296 return -EEXIST; 1296 return -EEXIST;
1297 } 1297 }
1298 nlen = EXT4_DIR_REC_LEN(de->name_len); 1298 nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1329 } else 1329 } else
1330 de->inode = 0; 1330 de->inode = 0;
1331 de->name_len = namelen; 1331 de->name_len = namelen;
1332 memcpy (de->name, name, namelen); 1332 memcpy(de->name, name, namelen);
1333 /* 1333 /*
1334 * XXX shouldn't update any times until successful 1334 * XXX shouldn't update any times until successful
1335 * completion of syscall, but too many callers depend 1335 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 struct fake_dirent *fde; 1377 struct fake_dirent *fde;
1378 1378
1379 blocksize = dir->i_sb->s_blocksize; 1379 blocksize = dir->i_sb->s_blocksize;
1380 dxtrace(printk("Creating index\n")); 1380 dxtrace(printk(KERN_DEBUG "Creating index\n"));
1381 retval = ext4_journal_get_write_access(handle, bh); 1381 retval = ext4_journal_get_write_access(handle, bh);
1382 if (retval) { 1382 if (retval) {
1383 ext4_std_error(dir->i_sb, retval); 1383 ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1386 } 1386 }
1387 root = (struct dx_root *) bh->b_data; 1387 root = (struct dx_root *) bh->b_data;
1388 1388
1389 bh2 = ext4_append (handle, dir, &block, &retval); 1389 bh2 = ext4_append(handle, dir, &block, &retval);
1390 if (!(bh2)) { 1390 if (!(bh2)) {
1391 brelse(bh); 1391 brelse(bh);
1392 return retval; 1392 return retval;
@@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1412 root->info.info_length = sizeof(root->info); 1412 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1414 entries = root->entries; 1414 entries = root->entries;
1415 dx_set_block (entries, 1); 1415 dx_set_block(entries, 1);
1416 dx_set_count (entries, 1); 1416 dx_set_count(entries, 1);
1417 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); 1417 dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1418 1418
1419 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1420 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1443 * may not sleep between calling this and putting something into 1443 * may not sleep between calling this and putting something into
1444 * the entry, as someone else might have used it while you slept. 1444 * the entry, as someone else might have used it while you slept.
1445 */ 1445 */
1446static int ext4_add_entry (handle_t *handle, struct dentry *dentry, 1446static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
 	struct inode *dir = dentry->d_parent->d_inode;
 	unsigned long offset;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	struct super_block * sb;
+	struct super_block *sb;
 	int retval;
 	int dx_fallback=0;
 	unsigned blocksize;
@@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct super_block * sb = dir->i_sb;
+	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		if (levels) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 
-			memcpy ((char *) entries2, (char *) (entries + icount1),
+			memcpy((char *) entries2, (char *) (entries + icount1),
 				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
 
 			/* Which index block gets the new entry? */
 			if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_journal_dirty_metadata(handle, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
 		} else {
-			dxtrace(printk("Creating second level index...\n"));
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
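
The split above moves the upper half of a full index node into a fresh block and keys the parent on the first hash of the new node. A minimal userspace sketch of that arithmetic follows; dx_entry and the fixed-size node here are simplified stand-ins for illustration, not the kernel structures.

#include <stdio.h>
#include <string.h>

struct dx_entry { unsigned hash; unsigned block; };

/* Split src[0..*count) in half into dst; returns the pivot hash the
 * parent must insert for dst's block. */
static unsigned dx_split(struct dx_entry *src, unsigned *count,
			 struct dx_entry *dst, unsigned *dst_count)
{
	unsigned icount1 = *count / 2, icount2 = *count - icount1;

	memcpy(dst, src + icount1, icount2 * sizeof(struct dx_entry));
	*count = icount1;
	*dst_count = icount2;
	return dst[0].hash;	/* == dx_get_hash(entries + icount1) */
}

int main(void)
{
	struct dx_entry node[8] = {
		{10, 1}, {20, 2}, {30, 3}, {40, 4},
		{50, 5}, {60, 6}, {70, 7}, {80, 8},
	};
	struct dx_entry node2[8];
	unsigned count = 8, count2, hash2;

	hash2 = dx_split(node, &count, node2, &count2);
	printf("split %u/%u, parent inserts hash %u\n", count, count2, hash2);
	return 0;
}
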
@@ -1630,12 +1632,12 @@ cleanup:
  * ext4_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext4_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext4_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
 {
-	struct ext4_dir_entry_2 * de, * pde;
+	struct ext4_dir_entry_2 *de, *pde;
 	int i;
 
 	i = 0;
@@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
-			struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int err, retries = 0;
 
 retry:
@@ -1747,8 +1749,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
-		       int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
		       int mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -1767,11 +1769,11 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		inode->i_op = &ext4_special_inode_operations;
 #endif
 		err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@ retry:
 	return err;
 }
 
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *dir_block;
+	struct ext4_dir_entry_2 *de;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1811,7 +1813,7 @@ retry:
 	inode->i_op = &ext4_dir_inode_operations;
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@ retry:
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
+	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
 	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
-	strcpy (de->name, "..");
+	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
 	ext4_journal_dirty_metadata(handle, dir_block);
-	brelse (dir_block);
+	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry (handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
+		iput(inode);
 		goto out_stop;
 	}
 	ext4_inc_count(handle, dir);
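
For reference, the rec_len values written for "." and ".." above follow from EXT4_DIR_REC_LEN, which pads the 8-byte record header plus the name up to a 4-byte boundary; the ".." record then absorbs the rest of the block via its rec_len. A standalone recreation of the arithmetic (plain C, no kernel headers):

#include <stdio.h>

/* same rounding as EXT4_DIR_REC_LEN: 8 header bytes + name, 4-aligned */
#define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

int main(void)
{
	unsigned blocksize = 4096;
	unsigned dot = DIR_REC_LEN(1);		/* "." */
	unsigned dotdot = blocksize - dot;	/* ".." claims the rest */

	printf("\".\"  rec_len = %u\n", dot);		/* 12 */
	printf("\"..\" rec_len = %u\n", dotdot);	/* 4084 for 4k blocks */
	printf("sum = %u (one full block)\n", dot + dotdot);
	return 0;
}
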
@@ -1856,17 +1858,17 @@ out_stop:
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
 {
 	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de, * de1;
-	struct super_block * sb;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
 	int err = 0;
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext4_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode)
 	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext4_warning (inode->i_sb, "empty_dir",
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb, "empty_dir",
 			     "bad directory (dir #%lu) - no `.' or `..'",
 			     inode->i_ino);
-		brelse (bh);
+		brelse(bh);
 		return 1;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len) +
 		 ext4_rec_len_from_disk(de1->rec_len);
 	de = ext4_next_entry(de1);
-	while (offset < inode->i_size ) {
+	while (offset < inode->i_size) {
 		if (!bh ||
 		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
-			brelse (bh);
-			bh = ext4_bread (NULL, inode,
+			brelse(bh);
+			bh = ext4_bread(NULL, inode,
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
@@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode)
 			continue;
 		}
 		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
+			brelse(bh);
 			return 0;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len);
 		de = ext4_next_entry(de);
 	}
-	brelse (bh);
+	brelse(bh);
 	return 1;
 }
 
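
empty_dir() above walks a directory block entry by entry using each record's rec_len. The sketch below replays that walk in userspace over a hand-built block; the flat single-block layout, toy struct, and inode numbers are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent_disk {		/* toy cousin of ext4_dir_entry_2 */
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static int block_is_empty(const uint8_t *blk, unsigned blocksize)
{
	unsigned offset = 0, live = 0;

	while (offset < blocksize) {
		const struct dirent_disk *de =
			(const struct dirent_disk *)(blk + offset);

		if (de->rec_len == 0)
			break;			/* corrupt record chain */
		if (de->inode && ++live > 2)
			return 0;		/* more than "." and ".." */
		offset += de->rec_len;		/* ext4_next_entry() analogue */
	}
	return 1;
}

int main(void)
{
	uint64_t storage[8] = { 0 };		/* one aligned 64-byte "block" */
	uint8_t *blk = (uint8_t *)storage;
	struct dirent_disk *de = (struct dirent_disk *)blk;

	de->inode = 11; de->name_len = 1; de->rec_len = 12;
	memcpy(de->name, ".", 1);
	de = (struct dirent_disk *)(blk + 12);
	de->inode = 2; de->name_len = 2; de->rec_len = 64 - 12;
	memcpy(de->name, "..", 2);

	printf("empty: %d\n", block_is_empty(blk, 64));	/* empty: 1 */
	return 0;
}
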
@@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
 	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@ out_brelse:
 	goto out_err;
 }
 
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
+	if (!empty_dir(inode))
 		goto end_rmdir;
 
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning (inode->i_sb, "ext4_rmdir",
+		ext4_warning(inode->i_sb, "ext4_rmdir",
 			     "empty directory has too many links (%d)",
 			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 
 end_rmdir:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning (inode->i_sb, "ext4_unlink",
+		ext4_warning(inode->i_sb, "ext4_unlink",
 			     "Deleting nonexistent file (%lu), %d",
 			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 
 end_unlink:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_symlink (struct inode * dir,
-			 struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int l, err, retries = 0;
 
 	l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT4_I(inode)->i_data)) {
+	if (l > sizeof(EXT4_I(inode)->i_data)) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
@@ -2221,14 +2223,14 @@ retry:
 		if (err) {
 			clear_nlink(inode);
 			ext4_mark_inode_dirty(handle, inode);
-			iput (inode);
+			iput(inode);
 			goto out_stop;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
 		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 		inode->i_op = &ext4_fast_symlink_inode_operations;
-		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
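
The branch above implements fast symlinks: if the target string (NUL included) fits in the inode's i_data area, it is stored inline and no data block is needed; otherwise the regular block path is taken. A toy version of that decision, with a stand-in inode structure and the 60-byte size as an assumption matching ext4's 15 32-bit block slots:

#include <stdio.h>
#include <string.h>

#define I_DATA_SIZE 60	/* 15 x 32-bit block pointers in the ext4 inode */

struct toy_inode {
	char i_data[I_DATA_SIZE];
	unsigned long i_size;
};

/* returns 0 for a fast symlink, 1 when a data block would be needed */
static int make_symlink(struct toy_inode *inode, const char *symname)
{
	size_t l = strlen(symname) + 1;

	if (l > sizeof(inode->i_data))
		return 1;			/* slow path */
	memcpy(inode->i_data, symname, l);	/* fast symlink, stored inline */
	inode->i_size = l - 1;
	return 0;
}

int main(void)
{
	struct toy_inode inode;

	if (!make_symlink(&inode, "/etc/passwd"))
		printf("fast symlink, i_size=%lu target=%s\n",
		       inode.i_size, inode.i_data);
	return 0;
}
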
@@ -2240,8 +2242,8 @@ out_stop:
 	return err;
 }
 
-static int ext4_link (struct dentry * old_dentry,
-		      struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@ retry:
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
-			struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext4_dir_entry_2 * old_de, * new_de;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry (old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 * Check for inode number is _not_ due to possible IO errors.
 	 * We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry (new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
-			brelse (new_bh);
+			brelse(new_bh);
 			new_bh = NULL;
 		}
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
+			if (!empty_dir(new_inode))
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
+		if (!new_inode && new_dir != old_dir &&
 		    new_dir->i_nlink >= EXT4_LINK_MAX)
 			goto end_rename;
 	}
 	if (!new_bh) {
-		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
 		if (retval)
 			goto end_rename;
 	} else {
@@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext4_dir_entry_2 *old_de2;
 
-		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
 		if (old_bh2) {
 			retval = ext4_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
@@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
 	ext4_journal_stop(handle);
 	return retval;
 }
@@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mknod		= ext4_mknod,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 
 const struct inode_operations ext4_special_inode_operations = {
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0a9265164265..b6ec1843a015 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 	       gdb_num);
 
 	/*
 	 * If we are not using the primary superblock/GDT copy don't resize,
 	 * because the user tools have no way of handling this. Probably a
 	 * bad time to do it anyways.
 	 */
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT4_HAS_COMPAT_FEATURE(sb,
-					     EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
@@ -869,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	if (test_opt(sb, MBALLOC)) {
-		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-		if (err)
-			goto exit_journal;
-	}
+	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+	if (err)
+		goto exit_journal;
+
 	/*
 	 * Make the new blocks and inodes valid next. We do this before
 	 * increasing the group count so that once the group is enabled,
@@ -928,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, input->group);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			input->free_blocks_count;
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			EXT4_INODES_PER_GROUP(sb);
+	}
+
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
 	sb->s_dirt = 1;
 
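
The new hunk rolls the added group's free counts up into its flex group, which is simply the block-group number shifted down by s_log_groups_per_flex. A self-contained sketch of that bookkeeping, with invented counts and a plain array standing in for s_flex_groups:

#include <stdio.h>

struct flex_counts { unsigned long free_blocks, free_inodes; };

int main(void)
{
	unsigned log_groups_per_flex = 4;	/* 16 groups per flex group */
	struct flex_counts flex[8] = { {0, 0} };
	unsigned new_group = 37;
	unsigned long free_blocks_count = 32254, inodes_per_group = 8192;

	/* ext4_flex_group() analogue: group number >> log_groups_per_flex */
	unsigned flex_group = new_group >> log_groups_per_flex;

	flex[flex_group].free_blocks += free_blocks_count;
	flex[flex_group].free_inodes += inodes_per_group;

	printf("group %u -> flex group %u: %lu blocks, %lu inodes free\n",
	       new_group, flex_group, flex[flex_group].free_blocks,
	       flex[flex_group].free_inodes);
	return 0;
}
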
@@ -963,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
@@ -1076,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	/*
 	 * Mark mballoc pages as not up to date so that they will be updated
 	 * next time they are loaded by ext4_mb_load_buddy.
+	 *
+	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
+	 * Uptodate flag, particularly on the bitmap bh, as a way of
+	 * hinting to ext4_mb_load_buddy() that it needs to be
+	 * reloaded.  A user could take a LVM snapshot, then do an
+	 * on-line fsck, and clear the uptodate flag, and this would
+	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
 	 */
-	if (test_opt(sb, MBALLOC)) {
+	{
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		struct inode *inode = sbi->s_buddy_cache;
 		int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d5d77958b861..dea8f13c2fd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,6 +47,8 @@
 #include "namei.h"
 #include "group.h"
 
+struct proc_dir_entry *ext4_proc_root;
+
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -503,15 +507,18 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	jbd2_journal_destroy(sbi->s_journal);
+	if (jbd2_journal_destroy(sbi->s_journal) < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
 	sbi->s_journal = NULL;
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -520,6 +527,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -562,12 +570,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
+	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
@@ -598,7 +606,7 @@ static void init_once(void *foo)
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
@@ -624,8 +632,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
 		posix_acl_release(EXT4_I(inode)->i_acl);
@@ -637,10 +644,7 @@ static void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
 	}
 #endif
-	ext4_discard_reservation(inode);
-	EXT4_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
+	ext4_discard_preallocations(inode);
 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
@@ -653,7 +657,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
-			(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+			(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -717,7 +721,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (test_opt(sb, XATTR_USER) &&
 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -726,7 +730,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -751,8 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
-	if (!test_opt(sb, MBALLOC))
-		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -772,6 +774,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -821,7 +830,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
 static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -901,14 +910,16 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
@@ -946,6 +957,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -966,6 +979,7 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 };
 
@@ -980,7 +994,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	/*todo: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT4-fs: Invalid sb specification: %s\n",
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
 		       (char *) *data);
 		return 1;
 	}
@@ -1071,7 +1085,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_orlov:
 			clear_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
 			set_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
@@ -1081,10 +1095,11 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT4 (no)user_xattr options not supported\n");
+			printk(KERN_ERR "EXT4 (no)user_xattr options "
+			       "not supported\n");
 			break;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
 			set_opt(sbi->s_mount_opt, POSIX_ACL);
 			break;
@@ -1094,7 +1109,8 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT4 (no)acl options not supported\n");
+			printk(KERN_ERR "EXT4 (no)acl options "
+			       "not supported\n");
 			break;
 #endif
 		case Opt_reservation:
@@ -1177,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1188,8 +1210,8 @@ set_qf_name:
 				sb_any_quota_suspended(sb)) &&
 				!sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 					"EXT4-fs: Cannot change journaled "
 					"quota options when quota turned on.\n");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
@@ -1356,12 +1378,6 @@ set_qf_format:
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
-		case Opt_mballoc:
-			set_opt(sbi->s_mount_opt, MBALLOC);
-			break;
-		case Opt_nomballoc:
-			clear_opt(sbi->s_mount_opt, MBALLOC);
-			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -1372,6 +1388,13 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
 			printk(KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1472,15 +1495,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		EXT4_INODES_PER_GROUP(sb),
 		sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk("external journal on %s\n",
-		       bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk("internal journal\n");
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
@@ -1503,8 +1520,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
-	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
-		groups_per_flex;
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			((sbi->s_es->s_reserved_gdt_blocks + 1) <<
+			  EXT4_DESC_PER_BLOCK_BITS(sb))) /
+			   groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
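
The reworked computation sizes s_flex_groups not just for the current groups but for every group that the reserved GDT blocks could ever add through online resize. The arithmetic, reproduced standalone with made-up filesystem parameters:

#include <stdio.h>

int main(void)
{
	unsigned long groups_count = 100;	/* current block groups */
	unsigned reserved_gdt_blocks = 256;	/* from the superblock */
	unsigned desc_per_block_bits = 7;	/* 4k blocks, 32-byte descriptors */
	unsigned log_groups_per_flex = 4;
	unsigned long groups_per_flex = 1UL << log_groups_per_flex;

	/* existing groups, rounded up, plus resize headroom */
	unsigned long flex_group_count =
		((groups_count + groups_per_flex - 1) +
		 ((reserved_gdt_blocks + 1UL) << desc_per_block_bits)) /
		groups_per_flex;

	printf("flex_group_count = %lu\n", flex_group_count);
	return 0;
}
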
@@ -1583,7 +1603,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
 
-	ext4_debug ("Checking group descriptors");
+	ext4_debug("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1622,8 +1642,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			       "Checksum for group %lu failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
-			if (!(sb->s_flags & MS_RDONLY))
+			if (!(sb->s_flags & MS_RDONLY)) {
+				spin_unlock(sb_bgl_lock(sbi, i));
 				return 0;
+			}
 		}
 		spin_unlock(sb_bgl_lock(sbi, i));
 		if (!flexbg_flag)
@@ -1713,9 +1735,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 			DQUOT_INIT(inode);
 			if (inode->i_nlink) {
 				printk(KERN_DEBUG
-					"%s: truncating inode %lu to %Ld bytes\n",
+					"%s: truncating inode %lu to %lld bytes\n",
 					__func__, inode->i_ino, inode->i_size);
-				jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+				jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 					  inode->i_ino, inode->i_size);
 				ext4_truncate(inode);
 				nr_truncates++;
@@ -1913,6 +1935,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
+	char *cp;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
@@ -1929,10 +1952,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 
 	unlock_kernel();
 
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
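
The new loop rewrites the device name held in sb->s_id because names such as cciss/c0d0p1 contain '/', which cannot appear in the procfs directory entry created later in this patch. The loop runs unchanged in userspace:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char s_id[32] = "cciss/c0d0p1";	/* example device name */
	char *cp;

	for (cp = s_id; (cp = strchr(cp, '/'));)
		*cp = '!';

	printf("%s\n", s_id);	/* prints: cciss!c0d0p1 */
	return 0;
}
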
@@ -1972,11 +2000,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, GRPID);
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
 		set_opt(sbi->s_mount_opt, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
@@ -2011,11 +2039,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
 			"use tune2fs.\n");
-	/*
-	 * turn on mballoc code by default in ext4 filesystem
-	 * Use -o nomballoc to turn it off
-	 */
-	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	/*
 	 * enable delayed allocation by default
@@ -2040,16 +2063,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       "running e2fsck is recommended\n");
 
 	/*
-	 * Since ext4 is still considered development code, we require
-	 * that the TEST_FILESYS flag in s->flags be set.
-	 */
-	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
-		printk(KERN_WARNING "EXT4-fs: %s: not marked "
-		       "OK to use with test code.\n", sb->s_id);
-		goto failed_mount;
-	}
-
-	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2218,6 +2231,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+#ifdef CONFIG_PROC_FS
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+#endif
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -2256,24 +2279,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
 	}
 
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 
 	/*
@@ -2470,7 +2483,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
 	ext4_ext_init(sb);
-	ext4_mb_init(sb, needs_recovery);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initialize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
 
 	lock_kernel();
 	return 0;
@@ -2488,11 +2506,16 @@ failed_mount3:
2488 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2506 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2489 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2507 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2490 percpu_counter_destroy(&sbi->s_dirs_counter); 2508 percpu_counter_destroy(&sbi->s_dirs_counter);
2509 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2491failed_mount2: 2510failed_mount2:
2492 for (i = 0; i < db_count; i++) 2511 for (i = 0; i < db_count; i++)
2493 brelse(sbi->s_group_desc[i]); 2512 brelse(sbi->s_group_desc[i]);
2494 kfree(sbi->s_group_desc); 2513 kfree(sbi->s_group_desc);
2495failed_mount: 2514failed_mount:
2515 if (sbi->s_proc) {
2516 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2517 remove_proc_entry(sb->s_id, ext4_proc_root);
2518 }
2496#ifdef CONFIG_QUOTA 2519#ifdef CONFIG_QUOTA
2497 for (i = 0; i < MAXQUOTAS; i++) 2520 for (i = 0; i < MAXQUOTAS; i++)
2498 kfree(sbi->s_qf_names[i]); 2521 kfree(sbi->s_qf_names[i]);
@@ -2526,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2526 journal->j_flags |= JBD2_BARRIER; 2549 journal->j_flags |= JBD2_BARRIER;
2527 else 2550 else
2528 journal->j_flags &= ~JBD2_BARRIER; 2551 journal->j_flags &= ~JBD2_BARRIER;
2552 if (test_opt(sb, DATA_ERR_ABORT))
2553 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
2554 else
2555 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
2529 spin_unlock(&journal->j_state_lock); 2556 spin_unlock(&journal->j_state_lock);
2530} 2557}
2531 2558
@@ -2551,7 +2578,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2551 return NULL; 2578 return NULL;
2552 } 2579 }
2553 2580
2554 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2581 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2555 journal_inode, journal_inode->i_size); 2582 journal_inode, journal_inode->i_size);
2556 if (!S_ISREG(journal_inode->i_mode)) { 2583 if (!S_ISREG(journal_inode->i_mode)) {
2557 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2584 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2714,6 +2741,11 @@ static int ext4_load_journal(struct super_block *sb,
2714 return -EINVAL; 2741 return -EINVAL;
2715 } 2742 }
2716 2743
2744 if (journal->j_flags & JBD2_BARRIER)
2745 printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2746 else
2747 printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2748
2717 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2749 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2718 err = jbd2_journal_update_format(journal); 2750 err = jbd2_journal_update_format(journal);
2719 if (err) { 2751 if (err) {
@@ -2798,13 +2830,34 @@ static void ext4_commit_super(struct super_block *sb,
2798 2830
2799 if (!sbh) 2831 if (!sbh)
2800 return; 2832 return;
2833 if (buffer_write_io_error(sbh)) {
2834 /*
2835 * Oh, dear. A previous attempt to write the
2836 * superblock failed. This could happen because the
2837 * USB device was yanked out. Or it could happen to
2838 * be a transient write error and maybe the block will
2839 * be remapped. Nothing we can do but to retry the
2840 * write and hope for the best.
2841 */
2842 printk(KERN_ERR "ext4: previous I/O error to "
2843 "superblock detected for %s.\n", sb->s_id);
2844 clear_buffer_write_io_error(sbh);
2845 set_buffer_uptodate(sbh);
2846 }
2801 es->s_wtime = cpu_to_le32(get_seconds()); 2847 es->s_wtime = cpu_to_le32(get_seconds());
2802 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2848 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2803 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2849 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2804 BUFFER_TRACE(sbh, "marking dirty"); 2850 BUFFER_TRACE(sbh, "marking dirty");
2805 mark_buffer_dirty(sbh); 2851 mark_buffer_dirty(sbh);
2806 if (sync) 2852 if (sync) {
2807 sync_dirty_buffer(sbh); 2853 sync_dirty_buffer(sbh);
2854 if (buffer_write_io_error(sbh)) {
2855 printk(KERN_ERR "ext4: I/O error while writing "
2856 "superblock for %s.\n", sb->s_id);
2857 clear_buffer_write_io_error(sbh);
2858 set_buffer_uptodate(sbh);
2859 }
2860 }
2808} 2861}
2809 2862
2810 2863
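
The hunk above has ext4_commit_super() recover from an earlier failed superblock write before reusing the buffer. A minimal sketch of that pattern, assuming <linux/buffer_head.h>; retry_sb_write() is a hypothetical helper, not a kernel function. A failed write leaves the buffer with its write_io_error bit set and uptodate cleared, so both must be reset before the buffer can be redirtied and rewritten.

	#include <linux/buffer_head.h>

	/* Hypothetical helper: rewrite a buffer whose previous write failed. */
	static void retry_sb_write(struct buffer_head *sbh, const char *dev)
	{
		if (buffer_write_io_error(sbh)) {
			/* reset the error state so the retry is accepted */
			clear_buffer_write_io_error(sbh);
			set_buffer_uptodate(sbh);
		}
		mark_buffer_dirty(sbh);
		if (sync_dirty_buffer(sbh) || buffer_write_io_error(sbh))
			printk(KERN_ERR "I/O error writing superblock for %s\n",
			       dev);
	}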
@@ -2819,7 +2872,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2819 journal_t *journal = EXT4_SB(sb)->s_journal; 2872 journal_t *journal = EXT4_SB(sb)->s_journal;
2820 2873
2821 jbd2_journal_lock_updates(journal); 2874 jbd2_journal_lock_updates(journal);
2822 jbd2_journal_flush(journal); 2875 if (jbd2_journal_flush(journal) < 0)
2876 goto out;
2877
2823 lock_super(sb); 2878 lock_super(sb);
2824 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2879 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2825 sb->s_flags & MS_RDONLY) { 2880 sb->s_flags & MS_RDONLY) {
@@ -2828,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2828 ext4_commit_super(sb, es, 1); 2883 ext4_commit_super(sb, es, 1);
2829 } 2884 }
2830 unlock_super(sb); 2885 unlock_super(sb);
2886
2887out:
2831 jbd2_journal_unlock_updates(journal); 2888 jbd2_journal_unlock_updates(journal);
2832} 2889}
2833 2890
@@ -2906,6 +2963,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2906{ 2963{
2907 tid_t target; 2964 tid_t target;
2908 2965
2966 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2909 sb->s_dirt = 0; 2967 sb->s_dirt = 0;
2910 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 2968 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2911 if (wait) 2969 if (wait)
@@ -2927,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2927 2985
2928 /* Now we set up the journal barrier. */ 2986 /* Now we set up the journal barrier. */
2929 jbd2_journal_lock_updates(journal); 2987 jbd2_journal_lock_updates(journal);
2930 jbd2_journal_flush(journal); 2988
2989 /*
2990 * We don't want to clear needs_recovery flag when we failed
2991 * to flush the journal.
2992 */
2993 if (jbd2_journal_flush(journal) < 0)
2994 return;
2931 2995
2932 /* Journal blocked and flushed, clear needs_recovery flag. */ 2996 /* Journal blocked and flushed, clear needs_recovery flag. */
2933 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2997 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
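
A sketch of the flush-before-clear ordering introduced above: the recovery flag is cleared only after jbd2_journal_flush() reports success, so a failed flush leaves the journal marked as still needing replay. ext4_freeze_sketch() is a hypothetical condensation of the lockfs path; on success, updates stay blocked until the matching unlockfs, as in the real code.

	/* Hypothetical sketch of the freeze path's flush handling. */
	static void ext4_freeze_sketch(struct super_block *sb, journal_t *journal)
	{
		jbd2_journal_lock_updates(journal);	/* block new transactions */
		if (jbd2_journal_flush(journal) < 0)
			return;	/* flush failed: keep needs_recovery set */
		/* journal is clean on disk; safe to claim no recovery needed */
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
	}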
@@ -3161,7 +3225,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3161 buf->f_type = EXT4_SUPER_MAGIC; 3225 buf->f_type = EXT4_SUPER_MAGIC;
3162 buf->f_bsize = sb->s_blocksize; 3226 buf->f_bsize = sb->s_blocksize;
3163 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3227 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3164 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); 3228 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3229 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3165 ext4_free_blocks_count_set(es, buf->f_bfree); 3230 ext4_free_blocks_count_set(es, buf->f_bfree);
3166 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3231 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3167 if (buf->f_bfree < ext4_r_blocks_count(es)) 3232 if (buf->f_bfree < ext4_r_blocks_count(es))
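
The subtraction above keeps delayed-allocation reservations ("dirty" blocks that are promised but not yet allocated on disk) from being reported as free space. A sketch of the resulting accounting, with ext4_statfs_free() as a hypothetical helper:

	/* Hypothetical helper: blocks statfs may report as free/available. */
	static void ext4_statfs_free(struct ext4_sb_info *sbi, u64 reserved,
				     u64 *bfree, u64 *bavail)
	{
		*bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
			 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
		/* root-reserved blocks are free but not available */
		*bavail = (*bfree > reserved) ? *bfree - reserved : 0;
	}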
@@ -3366,8 +3431,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3366 * otherwise be livelocked... 3431 * otherwise be livelocked...
3367 */ 3432 */
3368 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 3433 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
3369 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 3434 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
3370 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3435 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3436 if (err) {
3437 path_put(&nd.path);
3438 return err;
3439 }
3371 } 3440 }
3372 3441
3373 err = vfs_quota_on_path(sb, type, format_id, &nd.path); 3442 err = vfs_quota_on_path(sb, type, format_id, &nd.path);
@@ -3431,7 +3500,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3431 handle_t *handle = journal_current_handle(); 3500 handle_t *handle = journal_current_handle();
3432 3501
3433 if (!handle) { 3502 if (!handle) {
3434 printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" 3503 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3435 " cancelled because transaction is not started.\n", 3504 " cancelled because transaction is not started.\n",
3436 (unsigned long long)off, (unsigned long long)len); 3505 (unsigned long long)off, (unsigned long long)len);
3437 return -EIO; 3506 return -EIO;
@@ -3492,18 +3561,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3492 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3561 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3493} 3562}
3494 3563
3564#ifdef CONFIG_PROC_FS
3565static int ext4_ui_proc_show(struct seq_file *m, void *v)
3566{
3567 unsigned int *p = m->private;
3568
3569 seq_printf(m, "%u\n", *p);
3570 return 0;
3571}
3572
3573static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3574{
3575 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3576}
3577
3578static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3579 size_t cnt, loff_t *ppos)
3580{
3581 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3582	char str[32] = "";	/* zero-fill so the copied text is NUL-terminated */
3583	long value;		/* signed, so the range check below can fire */
3584
3585 if (cnt >= sizeof(str))
3586 return -EINVAL;
3587 if (copy_from_user(str, buf, cnt))
3588 return -EFAULT;
3589 value = simple_strtol(str, NULL, 0);
3590 if (value < 0)
3591 return -ERANGE;
3592 *p = value;
3593 return cnt;
3594}
3595
3596const struct file_operations ext4_ui_proc_fops = {
3597 .owner = THIS_MODULE,
3598 .open = ext4_ui_proc_open,
3599 .read = seq_read,
3600 .llseek = seq_lseek,
3601 .release = single_release,
3602 .write = ext4_ui_proc_write,
3603};
3604#endif
3605
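
A usage sketch for the fops above, assuming the proc_create_data() API of this era: the data pointer handed to proc_create_data() is what ext4_ui_proc_show() and ext4_ui_proc_write() retrieve through PDE(inode)->data, so a single fops serves any unsigned int tunable.

	/* Assumed registration, mirroring how the per-sb entries are created. */
	sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
	if (sbi->s_proc)
		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
				 &ext4_ui_proc_fops,
				 &sbi->s_inode_readahead_blks);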
3606static struct file_system_type ext4_fs_type = {
3607 .owner = THIS_MODULE,
3608 .name = "ext4",
3609 .get_sb = ext4_get_sb,
3610 .kill_sb = kill_block_super,
3611 .fs_flags = FS_REQUIRES_DEV,
3612};
3613
3614#ifdef CONFIG_EXT4DEV_COMPAT
3615static int ext4dev_get_sb(struct file_system_type *fs_type,
3616 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3617{
3618 printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3619 "to mount using ext4\n");
3620 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3621 "will go away by 2.6.31\n");
3622 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3623}
3624
3495static struct file_system_type ext4dev_fs_type = { 3625static struct file_system_type ext4dev_fs_type = {
3496 .owner = THIS_MODULE, 3626 .owner = THIS_MODULE,
3497 .name = "ext4dev", 3627 .name = "ext4dev",
3498 .get_sb = ext4_get_sb, 3628 .get_sb = ext4dev_get_sb,
3499 .kill_sb = kill_block_super, 3629 .kill_sb = kill_block_super,
3500 .fs_flags = FS_REQUIRES_DEV, 3630 .fs_flags = FS_REQUIRES_DEV,
3501}; 3631};
3632MODULE_ALIAS("ext4dev");
3633#endif
3502 3634
3503static int __init init_ext4_fs(void) 3635static int __init init_ext4_fs(void)
3504{ 3636{
3505 int err; 3637 int err;
3506 3638
3639 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3507 err = init_ext4_mballoc(); 3640 err = init_ext4_mballoc();
3508 if (err) 3641 if (err)
3509 return err; 3642 return err;
@@ -3514,9 +3647,16 @@ static int __init init_ext4_fs(void)
3514 err = init_inodecache(); 3647 err = init_inodecache();
3515 if (err) 3648 if (err)
3516 goto out1; 3649 goto out1;
3517 err = register_filesystem(&ext4dev_fs_type); 3650 err = register_filesystem(&ext4_fs_type);
3518 if (err) 3651 if (err)
3519 goto out; 3652 goto out;
3653#ifdef CONFIG_EXT4DEV_COMPAT
3654 err = register_filesystem(&ext4dev_fs_type);
3655 if (err) {
3656 unregister_filesystem(&ext4_fs_type);
3657 goto out;
3658 }
3659#endif
3520 return 0; 3660 return 0;
3521out: 3661out:
3522 destroy_inodecache(); 3662 destroy_inodecache();
@@ -3529,10 +3669,14 @@ out2:
3529 3669
3530static void __exit exit_ext4_fs(void) 3670static void __exit exit_ext4_fs(void)
3531{ 3671{
3672 unregister_filesystem(&ext4_fs_type);
3673#ifdef CONFIG_EXT4DEV_COMPAT
3532 unregister_filesystem(&ext4dev_fs_type); 3674 unregister_filesystem(&ext4dev_fs_type);
3675#endif
3533 destroy_inodecache(); 3676 destroy_inodecache();
3534 exit_ext4_xattr(); 3677 exit_ext4_xattr();
3535 exit_ext4_mballoc(); 3678 exit_ext4_mballoc();
3679 remove_proc_entry("fs/ext4", NULL);
3536} 3680}
3537 3681
3538MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3682MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
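
The init/exit hunks above follow the usual unwind discipline: each failure undoes exactly what has been set up so far, in reverse order, so the module never leaves a half-registered state. A condensed sketch; init_minimal_sketch() is hypothetical.

	static int __init init_minimal_sketch(void)
	{
		int err;

		err = init_ext4_mballoc();		/* first resource */
		if (err)
			return err;
		err = register_filesystem(&ext4_fs_type);
		if (err)
			goto out_mballoc;		/* undo in reverse order */
		return 0;
	out_mballoc:
		exit_ext4_mballoc();
		return err;
	}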
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc01..00740cb32be3 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
23#include "ext4.h" 23#include "ext4.h"
24#include "xattr.h" 24#include "xattr.h"
25 25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data); 29 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 30 return NULL;
31} 31}
32 32
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR 37#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 45const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR 48#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr, 51 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b4893..80626d516fee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, 104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
105#endif 105#endif
106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, 106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
107#ifdef CONFIG_EXT4DEV_FS_SECURITY 107#ifdef CONFIG_EXT4_FS_SECURITY
108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, 108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
109#endif 109#endif
110}; 110};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
112struct xattr_handler *ext4_xattr_handlers[] = { 112struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
116 &ext4_xattr_acl_access_handler, 116 &ext4_xattr_acl_access_handler,
117 &ext4_xattr_acl_default_handler, 117 &ext4_xattr_acl_default_handler,
118#endif 118#endif
119#ifdef CONFIG_EXT4DEV_FS_SECURITY 119#ifdef CONFIG_EXT4_FS_SECURITY
120 &ext4_xattr_security_handler, 120 &ext4_xattr_security_handler,
121#endif 121#endif
122 NULL 122 NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
959 struct ext4_xattr_block_find bs = { 959 struct ext4_xattr_block_find bs = {
960 .s = { .not_found = -ENODATA, }, 960 .s = { .not_found = -ENODATA, },
961 }; 961 };
962 unsigned long no_expand;
962 int error; 963 int error;
963 964
964 if (!name) 965 if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
966 if (strlen(name) > 255) 967 if (strlen(name) > 255)
967 return -ERANGE; 968 return -ERANGE;
968 down_write(&EXT4_I(inode)->xattr_sem); 969 down_write(&EXT4_I(inode)->xattr_sem);
970 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
971 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
972
969 error = ext4_get_inode_loc(inode, &is.iloc); 973 error = ext4_get_inode_loc(inode, &is.iloc);
970 if (error) 974 if (error)
971 goto cleanup; 975 goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1042cleanup: 1046cleanup:
1043 brelse(is.iloc.bh); 1047 brelse(is.iloc.bh);
1044 brelse(bs.bh); 1048 brelse(bs.bh);
1049 if (no_expand == 0)
1050 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1045 up_write(&EXT4_I(inode)->xattr_sem); 1051 up_write(&EXT4_I(inode)->xattr_sem);
1046 return error; 1052 return error;
1047} 1053}
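
The EXT4_STATE_NO_EXPAND dance added above, in isolation: the bit is set for the duration of the xattr update so that marking the inode dirty cannot recurse into in-inode extra-isize expansion while xattr_sem is held, and it is cleared on the way out only if this caller was the one to set it.

	down_write(&EXT4_I(inode)->xattr_sem);
	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;	/* block expansion */

	/* ... modify the xattr under xattr_sem ... */

	if (no_expand == 0)				/* we set it, we clear it */
		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
	up_write(&EXT4_I(inode)->xattr_sem);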
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb9..8ede88b18c29 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
51 (((name_len) + EXT4_XATTR_ROUND + \ 51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) 52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \ 53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \ 54 ((struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) 55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
63 EXT4_I(inode)->i_extra_isize)) 63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65 65
66# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4DEV_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
93static inline int 93static inline int
94ext4_xattr_get(struct inode *inode, int name_index, const char *name, 94ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
141 141
142#define ext4_xattr_handlers NULL 142#define ext4_xattr_handlers NULL
143 143
144# endif /* CONFIG_EXT4DEV_FS_XATTR */ 144# endif /* CONFIG_EXT4_FS_XATTR */
145 145
146#ifdef CONFIG_EXT4DEV_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir);
149#else 149#else
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/msdos_fs.h> 8#include <linux/msdos_fs.h>
9#include <linux/blkdev.h>
9 10
10struct fatent_operations { 11struct fatent_operations {
11 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); 12 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
535 struct fat_entry fatent; 536 struct fat_entry fatent;
536 struct buffer_head *bhs[MAX_BUF_PER_PAGE]; 537 struct buffer_head *bhs[MAX_BUF_PER_PAGE];
537 int i, err, nr_bhs; 538 int i, err, nr_bhs;
539 int first_cl = cluster;
538 540
539 nr_bhs = 0; 541 nr_bhs = 0;
540 fatent_init(&fatent); 542 fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
551 goto error; 553 goto error;
552 } 554 }
553 555
556 /*
557 * Issue discard for the sectors we no longer care about,
558 * batching contiguous clusters into one request
559 */
560 if (cluster != fatent.entry + 1) {
561 int nr_clus = fatent.entry - first_cl + 1;
562
563 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
564 nr_clus * sbi->sec_per_clus);
565 first_cl = cluster;
566 }
567
554 ops->ent_put(&fatent, FAT_ENT_FREE); 568 ops->ent_put(&fatent, FAT_ENT_FREE);
555 if (sbi->free_clusters != -1) { 569 if (sbi->free_clusters != -1) {
556 sbi->free_clusters++; 570 sbi->free_clusters++;
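
A sketch of the run batching described in the comment above, with next_free_cluster() as a hypothetical stand-in for the fat_ent_read()/fat_ent_next() walk: contiguous freed clusters accumulate into one discard request, and a break in contiguity flushes the run.

	int first = cluster, prev = cluster, cur;

	while ((cur = next_free_cluster(inode, prev)) != FAT_ENT_EOF) {
		if (cur != prev + 1) {
			/* run [first, prev] ended: one discard for the lot */
			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first),
					 (prev - first + 1) * sbi->sec_per_clus);
			first = cur;
		}
		prev = cur;
	}
	/* flush the final run */
	sb_issue_discard(sb, fat_clus_to_blknr(sbi, first),
			 (prev - first + 1) * sbi->sec_per_clus);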
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6d266d793e2c..d12cdf2a0406 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)
562 struct buffer_head *bh; 562 struct buffer_head *bh;
563 struct msdos_dir_entry *raw_entry; 563 struct msdos_dir_entry *raw_entry;
564 loff_t i_pos; 564 loff_t i_pos;
565 int err = 0; 565 int err;
566 566
567retry: 567retry:
568 i_pos = MSDOS_I(inode)->i_pos; 568 i_pos = MSDOS_I(inode)->i_pos;
569 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) 569 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
570 return 0; 570 return 0;
571 571
572 lock_super(sb);
573 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 572 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
574 if (!bh) { 573 if (!bh) {
575 printk(KERN_ERR "FAT: unable to read inode block " 574 printk(KERN_ERR "FAT: unable to read inode block "
576 "for updating (i_pos %lld)\n", i_pos); 575 "for updating (i_pos %lld)\n", i_pos);
577 err = -EIO; 576 return -EIO;
578 goto out;
579 } 577 }
580 spin_lock(&sbi->inode_hash_lock); 578 spin_lock(&sbi->inode_hash_lock);
581 if (i_pos != MSDOS_I(inode)->i_pos) { 579 if (i_pos != MSDOS_I(inode)->i_pos) {
582 spin_unlock(&sbi->inode_hash_lock); 580 spin_unlock(&sbi->inode_hash_lock);
583 brelse(bh); 581 brelse(bh);
584 unlock_super(sb);
585 goto retry; 582 goto retry;
586 } 583 }
587 584
@@ -607,11 +604,10 @@ retry:
607 } 604 }
608 spin_unlock(&sbi->inode_hash_lock); 605 spin_unlock(&sbi->inode_hash_lock);
609 mark_buffer_dirty(bh); 606 mark_buffer_dirty(bh);
607 err = 0;
610 if (wait) 608 if (wait)
611 err = sync_dirty_buffer(bh); 609 err = sync_dirty_buffer(bh);
612 brelse(bh); 610 brelse(bh);
613out:
614 unlock_super(sb);
615 return err; 611 return err;
616} 612}
617 613
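
The hunk above drops lock_super() from fat_write_inode(); a sketch of the revalidation it relies on instead: sample i_pos, sleep in sb_bread(), then recheck under inode_hash_lock and loop if the entry moved in the meantime.

	retry:
		i_pos = MSDOS_I(inode)->i_pos;		/* sample before sleeping */
		bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
		if (!bh)
			return -EIO;
		spin_lock(&sbi->inode_hash_lock);
		if (i_pos != MSDOS_I(inode)->i_pos) {	/* entry moved meanwhile */
			spin_unlock(&sbi->inode_hash_lock);
			brelse(bh);
			goto retry;
		}
		/* ... update the raw directory entry under the lock ... */
		spin_unlock(&sbi->inode_hash_lock);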
@@ -859,7 +855,7 @@ enum {
859 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, 855 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
860}; 856};
861 857
862static match_table_t fat_tokens = { 858static const match_table_t fat_tokens = {
863 {Opt_check_r, "check=relaxed"}, 859 {Opt_check_r, "check=relaxed"},
864 {Opt_check_s, "check=strict"}, 860 {Opt_check_s, "check=strict"},
865 {Opt_check_n, "check=normal"}, 861 {Opt_check_n, "check=normal"},
@@ -894,14 +890,14 @@ static match_table_t fat_tokens = {
894 {Opt_tz_utc, "tz=UTC"}, 890 {Opt_tz_utc, "tz=UTC"},
895 {Opt_err, NULL}, 891 {Opt_err, NULL},
896}; 892};
897static match_table_t msdos_tokens = { 893static const match_table_t msdos_tokens = {
898 {Opt_nodots, "nodots"}, 894 {Opt_nodots, "nodots"},
899 {Opt_nodots, "dotsOK=no"}, 895 {Opt_nodots, "dotsOK=no"},
900 {Opt_dots, "dots"}, 896 {Opt_dots, "dots"},
901 {Opt_dots, "dotsOK=yes"}, 897 {Opt_dots, "dotsOK=yes"},
902 {Opt_err, NULL} 898 {Opt_err, NULL}
903}; 899};
904static match_table_t vfat_tokens = { 900static const match_table_t vfat_tokens = {
905 {Opt_charset, "iocharset=%s"}, 901 {Opt_charset, "iocharset=%s"},
906 {Opt_shortname_lower, "shortname=lower"}, 902 {Opt_shortname_lower, "shortname=lower"},
907 {Opt_shortname_win95, "shortname=win95"}, 903 {Opt_shortname_win95, "shortname=win95"},
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d2249f174e20..6a84388cacff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -354,7 +354,7 @@ enum {
354 OPT_ERR 354 OPT_ERR
355}; 355};
356 356
357static match_table_t tokens = { 357static const match_table_t tokens = {
358 {OPT_FD, "fd=%u"}, 358 {OPT_FD, "fd=%u"},
359 {OPT_ROOTMODE, "rootmode=%o"}, 359 {OPT_ROOTMODE, "rootmode=%o"},
360 {OPT_USER_ID, "user_id=%u"}, 360 {OPT_USER_ID, "user_id=%u"},
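
A const match_table_t works unchanged with the lib/parser.c helpers. A parsing sketch using the fuse tokens above; opt_string is a hypothetical mutable copy of the mount options.

	substring_t args[MAX_OPT_ARGS];
	char *p;
	int token, fd;

	while ((p = strsep(&opt_string, ",")) != NULL) {
		if (!*p)
			continue;
		token = match_token(p, tokens, args);	/* table may be const */
		switch (token) {
		case OPT_FD:
			if (match_int(&args[0], &fd))
				return 0;		/* malformed number */
			break;
		/* ... remaining options ... */
		}
	}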
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1266 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1267 delay = holdtime - now; 1267 delay = holdtime - now;
1268 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1269 delay = gl->gl_ops->go_min_hold_time;
1268 1270
1269 spin_lock(&gl->gl_spin); 1271 spin_lock(&gl->gl_spin);
1270 handle_callback(gl, state, 1, delay); 1272 handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1578 *p++ = 'a'; 1580 *p++ = 'a';
1579 if (flags & GL_EXACT) 1581 if (flags & GL_EXACT)
1580 *p++ = 'E'; 1582 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE) 1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c'; 1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags)) 1585 if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
1816 if (gl) { 1816 if (gl) {
1817 gi->gl = hlist_entry(gl->gl_list.next, 1817 gi->gl = hlist_entry(gl->gl_list.next,
1818 struct gfs2_glock, gl_list); 1818 struct gfs2_glock, gl_list);
1819 if (gi->gl) 1819 } else {
1820 gfs2_glock_hold(gi->gl); 1820 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1821 struct gfs2_glock, gl_list);
1821 } 1822 }
1823 if (gi->gl)
1824 gfs2_glock_hold(gi->gl);
1822 read_unlock(gl_lock_addr(gi->hash)); 1825 read_unlock(gl_lock_addr(gi->hash));
1823 if (gl) 1826 if (gl)
1824 gfs2_glock_put(gl); 1827 gfs2_glock_put(gl);
1825 if (gl && gi->gl == NULL)
1826 gi->hash++;
1827 while (gi->gl == NULL) { 1828 while (gi->gl == NULL) {
1829 gi->hash++;
1828 if (gi->hash >= GFS2_GL_HASH_SIZE) 1830 if (gi->hash >= GFS2_GL_HASH_SIZE)
1829 return 1; 1831 return 1;
1830 read_lock(gl_lock_addr(gi->hash)); 1832 read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
1833 if (gi->gl) 1835 if (gi->gl)
1834 gfs2_glock_hold(gi->gl); 1836 gfs2_glock_hold(gi->gl);
1835 read_unlock(gl_lock_addr(gi->hash)); 1837 read_unlock(gl_lock_addr(gi->hash));
1836 gi->hash++;
1837 } 1838 }
1838 1839
1839 if (gi->sdp != gi->gl->gl_sbd) 1840 if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
24#define GL_ASYNC 0x00000040 24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 27#define GL_NOCACHE 0x00000400
29 28
30#define GLR_TRYFAILED 13 29#define GLR_TRYFAILED 13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
386#define GFS2_DATA_ORDERED 2 386#define GFS2_DATA_ORDERED 2
387 387
388struct gfs2_args { 388struct gfs2_args {
389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
392 int ar_spectator; /* Don't get a journal because we're always RO */ 392 unsigned int ar_spectator:1; /* Don't get a journal */
393 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ 393 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
394 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ 394 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
395 int ar_localcaching; /* Local-style caching (dangerous on multihost) */ 395 unsigned int ar_localcaching:1; /* Local caching */
396 int ar_debug; /* Oops on errors instead of trying to be graceful */ 396 unsigned int ar_debug:1; /* Oops on errors */
397 int ar_upgrade; /* Upgrade ondisk/multihost format */ 397 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
398 unsigned int ar_num_glockd; /* Number of glockd threads */ 398 unsigned int ar_posix_acl:1; /* Enable posix acls */
399 int ar_posix_acl; /* Enable posix acls */ 399 unsigned int ar_quota:2; /* off/account/on */
400 int ar_quota; /* off/account/on */ 400 unsigned int ar_suiddir:1; /* suiddir support */
401 int ar_suiddir; /* suiddir support */ 401 unsigned int ar_data:2; /* ordered/writeback */
402 int ar_data; /* ordered/writeback */ 402 unsigned int ar_meta:1; /* mount metafs */
403 unsigned int ar_num_glockd; /* Number of glockd threads */
403}; 404};
404 405
405struct gfs2_tune { 406struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
419 unsigned int gt_quota_scale_den; /* Denominator */ 420 unsigned int gt_quota_scale_den; /* Denominator */
420 unsigned int gt_quota_cache_secs; 421 unsigned int gt_quota_cache_secs;
421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 422 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
423 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
425 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
432 SDF_JOURNAL_CHECKED = 0, 432 SDF_JOURNAL_CHECKED = 0,
433 SDF_JOURNAL_LIVE = 1, 433 SDF_JOURNAL_LIVE = 1,
434 SDF_SHUTDOWN = 2, 434 SDF_SHUTDOWN = 2,
435 SDF_NOATIME = 3, 435 SDF_NOBARRIERS = 3,
436}; 436};
437 437
438#define GFS2_FSNAME_LEN 256 438#define GFS2_FSNAME_LEN 256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
461 461
462struct gfs2_sbd { 462struct gfs2_sbd {
463 struct super_block *sd_vfs; 463 struct super_block *sd_vfs;
464 struct super_block *sd_vfs_meta;
465 struct kobject sd_kobj; 464 struct kobject sd_kobj;
466 unsigned long sd_flags; /* SDF_... */ 465 unsigned long sd_flags; /* SDF_... */
467 struct gfs2_sb_host sd_sb; 466 struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
499 498
500 /* Inode Stuff */ 499 /* Inode Stuff */
501 500
502 struct inode *sd_master_dir; 501 struct dentry *sd_master_dir;
502 struct dentry *sd_root_dir;
503
503 struct inode *sd_jindex; 504 struct inode *sd_jindex;
504 struct inode *sd_inum_inode; 505 struct inode *sd_inum_inode;
505 struct inode *sd_statfs_inode; 506 struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
634 /* Debugging crud */ 635 /* Debugging crud */
635 636
636 unsigned long sd_last_warning; 637 unsigned long sd_last_warning;
637 struct vfsmount *sd_gfs2mnt;
638 struct dentry *debugfs_dir; /* debugfs directory */ 638 struct dentry *debugfs_dir; /* debugfs directory */
639 struct dentry *debugfs_dentry_glocks; /* for debugfs */ 639 struct dentry *debugfs_dentry_glocks; /* for debugfs */
640}; 640};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a32948..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h> 19#include <linux/lm_interface.h>
20#include <linux/security.h> 20#include <linux/security.h>
21#include <linux/time.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
249{ 250{
250 struct gfs2_dinode_host *di = &ip->i_di; 251 struct gfs2_dinode_host *di = &ip->i_di;
251 const struct gfs2_dinode *str = buf; 252 const struct gfs2_dinode *str = buf;
253 struct timespec atime;
252 u16 height, depth; 254 u16 height, depth;
253 255
254 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) 256 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
275 di->di_size = be64_to_cpu(str->di_size); 277 di->di_size = be64_to_cpu(str->di_size);
276 i_size_write(&ip->i_inode, di->di_size); 278 i_size_write(&ip->i_inode, di->di_size);
277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
278 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 280 atime.tv_sec = be64_to_cpu(str->di_atime);
279 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
283 ip->i_inode.i_atime = atime;
280 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 284 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
281 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); 285 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
282 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 286 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
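
With atime now maintained by the VFS, reading the dinode must not clobber a newer in-core value; the comparison above only lets the on-disk timestamp win when it is strictly newer. The pattern in isolation:

	struct timespec atime;

	atime.tv_sec = be64_to_cpu(str->di_atime);
	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
	/* timespec_compare() < 0 means the in-core atime is older */
	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
		ip->i_inode.i_atime = atime;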
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1033 1037
1034 if (bh) 1038 if (bh)
1035 brelse(bh); 1039 brelse(bh);
1036 if (!inode)
1037 return ERR_PTR(-ENOMEM);
1038 return inode; 1040 return inode;
1039 1041
1040fail_gunlock2: 1042fail_gunlock2:
1041 gfs2_glock_dq_uninit(ghs + 1); 1043 gfs2_glock_dq_uninit(ghs + 1);
1042 if (inode) 1044 if (inode && !IS_ERR(inode))
1043 iput(inode); 1045 iput(inode);
1044fail_gunlock: 1046fail_gunlock:
1045 gfs2_glock_dq(ghs); 1047 gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1140 return 0; 1142 return 0;
1141} 1143}
1142 1144
1143/*
1144 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1145 * @this: move this
1146 * @to: to here
1147 *
1148 * Follow @to back to the root and make sure we don't encounter @this
1149 * Assumes we already hold the rename lock.
1150 *
1151 * Returns: errno
1152 */
1153
1154int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1155{
1156 struct inode *dir = &to->i_inode;
1157 struct super_block *sb = dir->i_sb;
1158 struct inode *tmp;
1159 struct qstr dotdot;
1160 int error = 0;
1161
1162 gfs2_str2qstr(&dotdot, "..");
1163
1164 igrab(dir);
1165
1166 for (;;) {
1167 if (dir == &this->i_inode) {
1168 error = -EINVAL;
1169 break;
1170 }
1171 if (dir == sb->s_root->d_inode) {
1172 error = 0;
1173 break;
1174 }
1175
1176 tmp = gfs2_lookupi(dir, &dotdot, 1);
1177 if (IS_ERR(tmp)) {
1178 error = PTR_ERR(tmp);
1179 break;
1180 }
1181
1182 iput(dir);
1183 dir = tmp;
1184 }
1185
1186 iput(dir);
1187
1188 return error;
1189}
1190
1191/** 1145/**
1192 * gfs2_readlinki - return the contents of a symlink 1146 * gfs2_readlinki - return the contents of a symlink
1193 * @ip: the symlink's inode 1147 * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1207 unsigned int x; 1161 unsigned int x;
1208 int error; 1162 int error;
1209 1163
1210 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 1164 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1211 error = gfs2_glock_nq_atime(&i_gh); 1165 error = gfs2_glock_nq(&i_gh);
1212 if (error) { 1166 if (error) {
1213 gfs2_holder_uninit(&i_gh); 1167 gfs2_holder_uninit(&i_gh);
1214 return error; 1168 return error;
@@ -1243,101 +1197,6 @@ out:
1243 return error; 1197 return error;
1244} 1198}
1245 1199
1246/**
1247 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1248 * conditionally update the inode's atime
1249 * @gh: the holder to acquire
1250 *
1251 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1252 * Update if the difference between the current time and the inode's current
1253 * atime is greater than an interval specified at mount.
1254 *
1255 * Returns: errno
1256 */
1257
1258int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1259{
1260 struct gfs2_glock *gl = gh->gh_gl;
1261 struct gfs2_sbd *sdp = gl->gl_sbd;
1262 struct gfs2_inode *ip = gl->gl_object;
1263 s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1264 unsigned int state;
1265 int flags;
1266 int error;
1267 struct timespec tv = CURRENT_TIME;
1268
1269 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1270 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1271 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1272 return -EINVAL;
1273
1274 state = gh->gh_state;
1275 flags = gh->gh_flags;
1276
1277 error = gfs2_glock_nq(gh);
1278 if (error)
1279 return error;
1280
1281 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1282 (sdp->sd_vfs->s_flags & MS_RDONLY))
1283 return 0;
1284
1285 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1286 gfs2_glock_dq(gh);
1287 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1288 gh);
1289 error = gfs2_glock_nq(gh);
1290 if (error)
1291 return error;
1292
1293 /* Verify that atime hasn't been updated while we were
1294 trying to get exclusive lock. */
1295
1296 tv = CURRENT_TIME;
1297 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1298 struct buffer_head *dibh;
1299 struct gfs2_dinode *di;
1300
1301 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1302 if (error == -EROFS)
1303 return 0;
1304 if (error)
1305 goto fail;
1306
1307 error = gfs2_meta_inode_buffer(ip, &dibh);
1308 if (error)
1309 goto fail_end_trans;
1310
1311 ip->i_inode.i_atime = tv;
1312
1313 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1314 di = (struct gfs2_dinode *)dibh->b_data;
1315 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1316 di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1317 brelse(dibh);
1318
1319 gfs2_trans_end(sdp);
1320 }
1321
1322 /* If someone else has asked for the glock,
1323 unlock and let them have it. Then reacquire
1324 in the original state. */
1325 if (gfs2_glock_is_blocking(gl)) {
1326 gfs2_glock_dq(gh);
1327 gfs2_holder_reinit(state, flags, gh);
1328 return gfs2_glock_nq(gh);
1329 }
1330 }
1331
1332 return 0;
1333
1334fail_end_trans:
1335 gfs2_trans_end(sdp);
1336fail:
1337 gfs2_glock_dq(gh);
1338 return error;
1339}
1340
1341static int 1200static int
1342__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1201__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1343{ 1202{
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a86..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
92 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask); 93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 94int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
97int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 95int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
98struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 96struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
99void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f48..0c4cbe6c8285 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
144 144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), 145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace, 146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), 147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
148 GDLM_LVB_SIZE); 149 GDLM_LVB_SIZE);
149 if (error) { 150 if (error) {
150 log_error("dlm_new_lockspace error %d", error); 151 log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/bio.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
584 memset(bh->b_data, 0, bh->b_size); 585 memset(bh->b_data, 0, bh->b_size);
585 set_buffer_uptodate(bh); 586 set_buffer_uptodate(bh);
586 clear_buffer_dirty(bh); 587 clear_buffer_dirty(bh);
587 unlock_buffer(bh);
588 588
589 gfs2_ail1_empty(sdp, 0); 589 gfs2_ail1_empty(sdp, 0);
590 tail = current_tail(sdp); 590 tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); 601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
602 lh->lh_hash = cpu_to_be32(hash); 602 lh->lh_hash = cpu_to_be32(hash);
603 603
604 set_buffer_dirty(bh); 604 bh->b_end_io = end_buffer_write_sync;
605 if (sync_dirty_buffer(bh)) 605 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
606 goto skip_barrier;
607 get_bh(bh);
608 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
609 wait_on_buffer(bh);
610 if (buffer_eopnotsupp(bh)) {
611 clear_buffer_eopnotsupp(bh);
612 set_buffer_uptodate(bh);
613 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
614 lock_buffer(bh);
615skip_barrier:
616 get_bh(bh);
617 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
618 wait_on_buffer(bh);
619 }
620 if (!buffer_uptodate(bh))
606 gfs2_io_error_bh(sdp, bh); 621 gfs2_io_error_bh(sdp, bh);
607 brelse(bh); 622 brelse(bh);
608 623
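
The log-header write above probes for barrier support at run time. In isolation the fallback looks like this (same buffer_head API as the hunk; SDF_NOBARRIERS is latched so later log writes skip the probe entirely):

	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh)) {
		/* device refused the barrier: latch and reissue plainly */
		clear_buffer_eopnotsupp(bh);
		set_buffer_uptodate(bh);
		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
		lock_buffer(bh);
		get_bh(bh);
		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
		wait_on_buffer(bh);
	}
	if (!buffer_uptodate(bh))
		gfs2_io_error_bh(sdp, bh);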
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..f96eb90a2cfa 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,10 +42,11 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_meta,
45 Opt_err, 46 Opt_err,
46}; 47};
47 48
48static match_table_t tokens = { 49static const match_table_t tokens = {
49 {Opt_lockproto, "lockproto=%s"}, 50 {Opt_lockproto, "lockproto=%s"},
50 {Opt_locktable, "locktable=%s"}, 51 {Opt_locktable, "locktable=%s"},
51 {Opt_hostdata, "hostdata=%s"}, 52 {Opt_hostdata, "hostdata=%s"},
@@ -66,6 +67,7 @@ static match_table_t tokens = {
66 {Opt_nosuiddir, "nosuiddir"}, 67 {Opt_nosuiddir, "nosuiddir"},
67 {Opt_data_writeback, "data=writeback"}, 68 {Opt_data_writeback, "data=writeback"},
68 {Opt_data_ordered, "data=ordered"}, 69 {Opt_data_ordered, "data=ordered"},
70 {Opt_meta, "meta"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
70}; 72};
71 73
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
239 case Opt_data_ordered: 241 case Opt_data_ordered:
240 args->ar_data = GFS2_DATA_ORDERED; 242 args->ar_data = GFS2_DATA_ORDERED;
241 break; 243 break;
244 case Opt_meta:
245 if (remount && args->ar_meta != 1)
246 goto cant_remount;
247 args->ar_meta = 1;
248 break;
242 case Opt_err: 249 case Opt_err:
243 default: 250 default:
244 fs_info(sdp, "unknown option: %s\n", o); 251 fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
512 int error; 512 int error;
513 513
514 unlock_page(page); 514 unlock_page(page);
515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
516 error = gfs2_glock_nq_atime(&gh); 516 error = gfs2_glock_nq(&gh);
517 if (unlikely(error)) 517 if (unlikely(error))
518 goto out; 518 goto out;
519 error = AOP_TRUNCATED_PAGE; 519 error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
594 struct gfs2_holder gh; 594 struct gfs2_holder gh;
595 int ret; 595 int ret;
596 596
597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 ret = gfs2_glock_nq_atime(&gh); 598 ret = gfs2_glock_nq(&gh);
599 if (unlikely(ret)) 599 if (unlikely(ret))
600 goto out_uninit; 600 goto out_uninit;
601 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
636 unsigned to = from + len; 636 unsigned to = from + len;
637 struct page *page; 637 struct page *page;
638 638
639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); 639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
640 error = gfs2_glock_nq_atime(&ip->i_gh); 640 error = gfs2_glock_nq(&ip->i_gh);
641 if (unlikely(error)) 641 if (unlikely(error))
642 goto out_uninit; 642 goto out_uninit;
643 643
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
975 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
976 return 0; 976 return 0;
977 977
978 if (offset > i_size_read(&ip->i_inode)) 978 if (offset >= i_size_read(&ip->i_inode))
979 return 0; 979 return 0;
980 return 1; 980 return 1;
981} 981}
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1000 * unfortunately have the option of only flushing a range like 1000 * unfortunately have the option of only flushing a range like
1001 * the VFS does. 1001 * the VFS does.
1002 */ 1002 */
1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); 1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
1004 rv = gfs2_glock_nq_atime(&gh); 1004 rv = gfs2_glock_nq(&gh);
1005 if (rv) 1005 if (rv)
1006 return rv; 1006 return rv;
1007 rv = gfs2_ok_for_dio(ip, rw, offset); 1007 rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
89 u64 offset = file->f_pos; 89 u64 offset = file->f_pos;
90 int error; 90 int error;
91 91
92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); 92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
93 error = gfs2_glock_nq_atime(&d_gh); 93 error = gfs2_glock_nq(&d_gh);
94 if (error) { 94 if (error) {
95 gfs2_holder_uninit(&d_gh); 95 gfs2_holder_uninit(&d_gh);
96 return error; 96 return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
153 int error; 153 int error;
154 u32 fsflags; 154 u32 fsflags;
155 155
156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
157 error = gfs2_glock_nq_atime(&gh); 157 error = gfs2_glock_nq(&gh);
158 if (error) 158 if (error)
159 return error; 159 return error;
160 160
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
351 struct gfs2_alloc *al; 351 struct gfs2_alloc *al;
352 int ret; 352 int ret;
353 353
354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); 354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
355 ret = gfs2_glock_nq_atime(&gh); 355 ret = gfs2_glock_nq(&gh);
356 if (ret) 356 if (ret)
357 goto out; 357 goto out;
358 358
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
434 struct gfs2_holder i_gh; 434 struct gfs2_holder i_gh;
435 int error; 435 int error;
436 436
437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
438 error = gfs2_glock_nq_atime(&i_gh); 438 error = gfs2_glock_nq(&i_gh);
439 if (error) { 439 if (error) {
440 gfs2_holder_uninit(&i_gh); 440 gfs2_holder_uninit(&i_gh);
441 return error; 441 return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
40#define DO 0 40#define DO 0
41#define UNDO 1 41#define UNDO 1
42 42
43static const u32 gfs2_old_fs_formats[] = {
44 0
45};
46
47static const u32 gfs2_old_multihost_formats[] = {
48 0
49};
50
51/**
52 * gfs2_tune_init - Fill a gfs2_tune structure with default values
53 * @gt: tune
54 *
55 */
56
57static void gfs2_tune_init(struct gfs2_tune *gt)
58{
59 spin_lock_init(&gt->gt_spin);
60
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1;
70 gt->gt_quota_scale_den = 1;
71 gt->gt_quota_cache_secs = 300;
72 gt->gt_quota_quantum = 60;
73 gt->gt_new_files_jdata = 0;
74 gt->gt_max_readahead = 1 << 18;
75 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10;
77 gt->gt_statfs_quantum = 30;
78 gt->gt_statfs_slow = 0;
79}
80
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 81static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 82{
45 struct gfs2_sbd *sdp; 83 struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
96 return sdp; 134 return sdp;
97} 135}
98 136
99static void init_vfs(struct super_block *sb, unsigned noatime) 137
138/**
139 * gfs2_check_sb - Check superblock
140 * @sdp: the filesystem
141 * @sb: The superblock
142 * @silent: Don't print a message if the check fails
143 *
144 * Checks the version code of the FS is one that we understand how to
145 * read and that the sizes of the various on-disk structures have not
146 * changed.
147 */
148
149static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
100{ 150{
101 struct gfs2_sbd *sdp = sb->s_fs_info; 151 unsigned int x;
102 152
103 sb->s_magic = GFS2_MAGIC; 153 if (sb->sb_magic != GFS2_MAGIC ||
104 sb->s_op = &gfs2_super_ops; 154 sb->sb_type != GFS2_METATYPE_SB) {
105 sb->s_export_op = &gfs2_export_ops; 155 if (!silent)
106 sb->s_time_gran = 1; 156 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
107 sb->s_maxbytes = MAX_LFS_FILESIZE; 157 return -EINVAL;
158 }
159
160 /* If format numbers match exactly, we're done. */
161
162 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
163 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
164 return 0;
165
166 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
167 for (x = 0; gfs2_old_fs_formats[x]; x++)
168 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
169 break;
170
171 if (!gfs2_old_fs_formats[x]) {
172 printk(KERN_WARNING
173 "GFS2: code version (%u, %u) is incompatible "
174 "with ondisk format (%u, %u)\n",
175 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
176 sb->sb_fs_format, sb->sb_multihost_format);
177 printk(KERN_WARNING
178 "GFS2: I don't know how to upgrade this FS\n");
179 return -EINVAL;
180 }
181 }
182
183 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
184 for (x = 0; gfs2_old_multihost_formats[x]; x++)
185 if (gfs2_old_multihost_formats[x] ==
186 sb->sb_multihost_format)
187 break;
188
189 if (!gfs2_old_multihost_formats[x]) {
190 printk(KERN_WARNING
191 "GFS2: code version (%u, %u) is incompatible "
192 "with ondisk format (%u, %u)\n",
193 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
194 sb->sb_fs_format, sb->sb_multihost_format);
195 printk(KERN_WARNING
196 "GFS2: I don't know how to upgrade this FS\n");
197 return -EINVAL;
198 }
199 }
200
201 if (!sdp->sd_args.ar_upgrade) {
202 printk(KERN_WARNING
203 "GFS2: code version (%u, %u) is incompatible "
204 "with ondisk format (%u, %u)\n",
205 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
206 sb->sb_fs_format, sb->sb_multihost_format);
207 printk(KERN_INFO
208 "GFS2: Use the \"upgrade\" mount option to upgrade "
209 "the FS\n");
210 printk(KERN_INFO "GFS2: See the manual for more details\n");
211 return -EINVAL;
212 }
213
214 return 0;
215}
216
217static void end_bio_io_page(struct bio *bio, int error)
218{
219 struct page *page = bio->bi_private;
108 220
109 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) 221 if (!error)
110 set_bit(noatime, &sdp->sd_flags); 222 SetPageUptodate(page);
223 else
224 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
225 unlock_page(page);
226}
227
228static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
229{
230 const struct gfs2_sb *str = buf;
231
232 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
233 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
234 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
235 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
236 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
237 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
238 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
239 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
240 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
241 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
242 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
243
244 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
245 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
246}
247
248/**
249 * gfs2_read_super - Read the gfs2 super block from disk
250 * @sdp: The GFS2 super block
251 * @sector: The location of the super block
252 * @error: The error code to return
253 *
254 * This uses the bio functions to read the super block from disk
255 * because we want to be 100% sure that we never read cached data.
256 * A super block is read only twice during each GFS2 mount and is
257 * never written to by the filesystem. The first time it is read, no
258 * locks are held, and the only details which are looked at are those
259 * relating to the locking protocol. Once locking is up and working,
260 * the sb is read again under the lock to establish the location of
261 * the master directory (contains pointers to journals etc) and the
262 * root directory.
263 *
264 * Returns: 0 on success or error
265 */
266
267static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
268{
269 struct super_block *sb = sdp->sd_vfs;
270 struct gfs2_sb *p;
271 struct page *page;
272 struct bio *bio;
273
274 page = alloc_page(GFP_NOFS);
275 if (unlikely(!page))
276 return -ENOBUFS;
277
278 ClearPageUptodate(page);
279 ClearPageDirty(page);
280 lock_page(page);
281
282 bio = bio_alloc(GFP_NOFS, 1);
283 if (unlikely(!bio)) {
284 __free_page(page);
285 return -ENOBUFS;
286 }
111 287
112 /* Don't let the VFS update atimes. GFS2 handles this itself. */ 288 bio->bi_sector = sector * (sb->s_blocksize >> 9);
113 sb->s_flags |= MS_NOATIME | MS_NODIRATIME; 289 bio->bi_bdev = sb->s_bdev;
290 bio_add_page(bio, page, PAGE_SIZE, 0);
291
292 bio->bi_end_io = end_bio_io_page;
293 bio->bi_private = page;
294 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
295 wait_on_page_locked(page);
296 bio_put(bio);
297 if (!PageUptodate(page)) {
298 __free_page(page);
299 return -EIO;
300 }
301 p = kmap(page);
302 gfs2_sb_in(&sdp->sd_sb, p);
303 kunmap(page);
304 __free_page(page);
305 return 0;
306}
307/**
308 * gfs2_read_sb - Read super block
309 * @sdp: The GFS2 superblock
310 * @gl: the glock for the superblock (assumed to be held)
311 * @silent: Don't print message if mount fails
312 *
313 */
314
315static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
316{
317 u32 hash_blocks, ind_blocks, leaf_blocks;
318 u32 tmp_blocks;
319 unsigned int x;
320 int error;
321
322 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
323 if (error) {
324 if (!silent)
325 fs_err(sdp, "can't read superblock\n");
326 return error;
327 }
328
329 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
330 if (error)
331 return error;
332
333 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
334 GFS2_BASIC_BLOCK_SHIFT;
335 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
336 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
337 sizeof(struct gfs2_dinode)) / sizeof(u64);
338 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
339 sizeof(struct gfs2_meta_header)) / sizeof(u64);
340 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
341 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
342 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
343 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
344 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
345 sizeof(struct gfs2_meta_header)) /
346 sizeof(struct gfs2_quota_change);
347
348	/* Compute maximum reservation required to add an entry to a directory */
349
350 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
351 sdp->sd_jbsize);
352
353 ind_blocks = 0;
354 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
355 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
356 ind_blocks += tmp_blocks;
357 }
358
359 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
360
361 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
362
363 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
364 sizeof(struct gfs2_dinode);
365 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
366 for (x = 2;; x++) {
367 u64 space, d;
368 u32 m;
369
370 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
371 d = space;
372 m = do_div(d, sdp->sd_inptrs);
373
374 if (d != sdp->sd_heightsize[x - 1] || m)
375 break;
376 sdp->sd_heightsize[x] = space;
377 }
378 sdp->sd_max_height = x;
379 sdp->sd_heightsize[x] = ~0;
380 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
381
382 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
383 sizeof(struct gfs2_dinode);
384 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
385 for (x = 2;; x++) {
386 u64 space, d;
387 u32 m;
388
389 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
390 d = space;
391 m = do_div(d, sdp->sd_inptrs);
392
393 if (d != sdp->sd_jheightsize[x - 1] || m)
394 break;
395 sdp->sd_jheightsize[x] = space;
396 }
397 sdp->sd_max_jheight = x;
398 sdp->sd_jheightsize[x] = ~0;
399 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
400
401 return 0;
114} 402}
115 403
116static int init_names(struct gfs2_sbd *sdp, int silent) 404static int init_names(struct gfs2_sbd *sdp, int silent)
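
The geometry loops in gfs2_read_sb() above detect 64-bit multiply overflow by dividing the product back down by the same factor. A sketch of that check, with prev_size and inptrs as hypothetical names for sd_heightsize[x - 1] and sd_inptrs; note that do_div() divides its first argument in place and returns the remainder.

	u64 space = prev_size * inptrs;	/* may wrap at 64 bits */
	u64 d = space;
	u32 m = do_div(d, inptrs);	/* d = space / inptrs, m = remainder */

	if (d != prev_size || m)
		break;			/* wrapped: terminate the height table */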
@@ -224,51 +512,59 @@ fail:
224 return error; 512 return error;
225} 513}
226 514
227static inline struct inode *gfs2_lookup_root(struct super_block *sb, 515static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
228 u64 no_addr) 516 u64 no_addr, const char *name)
229{ 517{
230 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 518 struct gfs2_sbd *sdp = sb->s_fs_info;
519 struct dentry *dentry;
520 struct inode *inode;
521
522 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
523 if (IS_ERR(inode)) {
524 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
525 return PTR_ERR(inode);
526 }
527 dentry = d_alloc_root(inode);
528 if (!dentry) {
529 fs_err(sdp, "can't alloc %s dentry\n", name);
530 iput(inode);
531 return -ENOMEM;
532 }
533 dentry->d_op = &gfs2_dops;
534 *dptr = dentry;
535 return 0;
231} 536}
232 537
233static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 538static int init_sb(struct gfs2_sbd *sdp, int silent)
234{ 539{
235 struct super_block *sb = sdp->sd_vfs; 540 struct super_block *sb = sdp->sd_vfs;
236 struct gfs2_holder sb_gh; 541 struct gfs2_holder sb_gh;
237 u64 no_addr; 542 u64 no_addr;
238 struct inode *inode; 543 int ret;
239 int error = 0;
240 544
241 if (undo) { 545 ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
242 if (sb->s_root) { 546 LM_ST_SHARED, 0, &sb_gh);
243 dput(sb->s_root); 547 if (ret) {
244 sb->s_root = NULL; 548 fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
245 } 549 return ret;
246 return 0;
247 } 550 }
248 551
249 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, 552 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
250 LM_ST_SHARED, 0, &sb_gh); 553 if (ret) {
251 if (error) { 554 fs_err(sdp, "can't read superblock: %d\n", ret);
252 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
253 return error;
254 }
255
256 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
257 if (error) {
258 fs_err(sdp, "can't read superblock: %d\n", error);
259 goto out; 555 goto out;
260 } 556 }
261 557
262 /* Set up the buffer cache and SB for real */ 558 /* Set up the buffer cache and SB for real */
263 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 559 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
264 error = -EINVAL; 560 ret = -EINVAL;
265 fs_err(sdp, "FS block size (%u) is too small for device " 561 fs_err(sdp, "FS block size (%u) is too small for device "
266 "block size (%u)\n", 562 "block size (%u)\n",
267 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 563 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
268 goto out; 564 goto out;
269 } 565 }
270 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 566 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
271 error = -EINVAL; 567 ret = -EINVAL;
272 fs_err(sdp, "FS block size (%u) is too big for machine " 568 fs_err(sdp, "FS block size (%u) is too big for machine "
273 "page size (%u)\n", 569 "page size (%u)\n",
274 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); 570 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
278 574
279 /* Get the root inode */ 575 /* Get the root inode */
280 no_addr = sdp->sd_sb.sb_root_dir.no_addr; 576 no_addr = sdp->sd_sb.sb_root_dir.no_addr;
281 if (sb->s_type == &gfs2meta_fs_type) 577 ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
282 no_addr = sdp->sd_sb.sb_master_dir.no_addr; 578 if (ret)
283 inode = gfs2_lookup_root(sb, no_addr);
284 if (IS_ERR(inode)) {
285 error = PTR_ERR(inode);
286 fs_err(sdp, "can't read in root inode: %d\n", error);
287 goto out; 579 goto out;
288 }
289 580
290 sb->s_root = d_alloc_root(inode); 581 /* Get the master inode */
291 if (!sb->s_root) { 582 no_addr = sdp->sd_sb.sb_master_dir.no_addr;
292 fs_err(sdp, "can't get root dentry\n"); 583 ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
293 error = -ENOMEM; 584 if (ret) {
294 iput(inode); 585 dput(sdp->sd_root_dir);
295 } else 586 goto out;
296 sb->s_root->d_op = &gfs2_dops; 587 }
297 588 sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
298out: 589out:
299 gfs2_glock_dq_uninit(&sb_gh); 590 gfs2_glock_dq_uninit(&sb_gh);
300 return error; 591 return ret;
301} 592}
302 593
303/** 594/**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
372 663
373static int init_journal(struct gfs2_sbd *sdp, int undo) 664static int init_journal(struct gfs2_sbd *sdp, int undo)
374{ 665{
666 struct inode *master = sdp->sd_master_dir->d_inode;
375 struct gfs2_holder ji_gh; 667 struct gfs2_holder ji_gh;
376 struct task_struct *p; 668 struct task_struct *p;
377 struct gfs2_inode *ip; 669 struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
383 goto fail_recoverd; 675 goto fail_recoverd;
384 } 676 }
385 677
386 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); 678 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
387 if (IS_ERR(sdp->sd_jindex)) { 679 if (IS_ERR(sdp->sd_jindex)) {
388 fs_err(sdp, "can't lookup journal index: %d\n", error); 680 fs_err(sdp, "can't lookup journal index: %d\n", error);
389 return PTR_ERR(sdp->sd_jindex); 681 return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
506{ 798{
507 int error = 0; 799 int error = 0;
508 struct gfs2_inode *ip; 800 struct gfs2_inode *ip;
509 struct inode *inode; 801 struct inode *master = sdp->sd_master_dir->d_inode;
510 802
511 if (undo) 803 if (undo)
512 goto fail_qinode; 804 goto fail_qinode;
513 805
514 inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
515 if (IS_ERR(inode)) {
516 error = PTR_ERR(inode);
517 fs_err(sdp, "can't read in master directory: %d\n", error);
518 goto fail;
519 }
520 sdp->sd_master_dir = inode;
521
522 error = init_journal(sdp, undo); 806 error = init_journal(sdp, undo);
523 if (error) 807 if (error)
524 goto fail_master; 808 goto fail;
525 809
526 /* Read in the master inode number inode */ 810 /* Read in the master inode number inode */
527 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); 811 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
528 if (IS_ERR(sdp->sd_inum_inode)) { 812 if (IS_ERR(sdp->sd_inum_inode)) {
529 error = PTR_ERR(sdp->sd_inum_inode); 813 error = PTR_ERR(sdp->sd_inum_inode);
530 fs_err(sdp, "can't read in inum inode: %d\n", error); 814 fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
533 817
534 818
535 /* Read in the master statfs inode */ 819 /* Read in the master statfs inode */
536 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); 820 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
537 if (IS_ERR(sdp->sd_statfs_inode)) { 821 if (IS_ERR(sdp->sd_statfs_inode)) {
538 error = PTR_ERR(sdp->sd_statfs_inode); 822 error = PTR_ERR(sdp->sd_statfs_inode);
539 fs_err(sdp, "can't read in statfs inode: %d\n", error); 823 fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
541 } 825 }
542 826
543 /* Read in the resource index inode */ 827 /* Read in the resource index inode */
544 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); 828 sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
545 if (IS_ERR(sdp->sd_rindex)) { 829 if (IS_ERR(sdp->sd_rindex)) {
546 error = PTR_ERR(sdp->sd_rindex); 830 error = PTR_ERR(sdp->sd_rindex);
547 fs_err(sdp, "can't get resource index inode: %d\n", error); 831 fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
552 sdp->sd_rindex_uptodate = 0; 836 sdp->sd_rindex_uptodate = 0;
553 837
554 /* Read in the quota inode */ 838 /* Read in the quota inode */
555 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); 839 sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
556 if (IS_ERR(sdp->sd_quota_inode)) { 840 if (IS_ERR(sdp->sd_quota_inode)) {
557 error = PTR_ERR(sdp->sd_quota_inode); 841 error = PTR_ERR(sdp->sd_quota_inode);
558 fs_err(sdp, "can't get quota file inode: %d\n", error); 842 fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
571 iput(sdp->sd_inum_inode); 855 iput(sdp->sd_inum_inode);
572fail_journal: 856fail_journal:
573 init_journal(sdp, UNDO); 857 init_journal(sdp, UNDO);
574fail_master:
575 iput(sdp->sd_master_dir);
576fail: 858fail:
577 return error; 859 return error;
578} 860}
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
583 char buf[30]; 865 char buf[30];
584 int error = 0; 866 int error = 0;
585 struct gfs2_inode *ip; 867 struct gfs2_inode *ip;
868 struct inode *master = sdp->sd_master_dir->d_inode;
586 869
587 if (sdp->sd_args.ar_spectator) 870 if (sdp->sd_args.ar_spectator)
588 return 0; 871 return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
590 if (undo) 873 if (undo)
591 goto fail_qc_gh; 874 goto fail_qc_gh;
592 875
593 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); 876 pn = gfs2_lookup_simple(master, "per_node");
594 if (IS_ERR(pn)) { 877 if (IS_ERR(pn)) {
595 error = PTR_ERR(pn); 878 error = PTR_ERR(pn);
596 fs_err(sdp, "can't find per_node directory: %d\n", error); 879 fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
800 goto fail; 1083 goto fail;
801 } 1084 }
802 1085
803 init_vfs(sb, SDF_NOATIME); 1086 sb->s_magic = GFS2_MAGIC;
1087 sb->s_op = &gfs2_super_ops;
1088 sb->s_export_op = &gfs2_export_ops;
1089 sb->s_time_gran = 1;
1090 sb->s_maxbytes = MAX_LFS_FILESIZE;
804 1091
805 /* Set up the buffer cache and fill in some fake block size values 1092 /* Set up the buffer cache and fill in some fake block size values
806 to allow us to read-in the on-disk superblock. */ 1093 to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
828 if (error) 1115 if (error)
829 goto fail_lm; 1116 goto fail_lm;
830 1117
831 error = init_sb(sdp, silent, DO); 1118 error = init_sb(sdp, silent);
832 if (error) 1119 if (error)
833 goto fail_locking; 1120 goto fail_locking;
834 1121
@@ -869,7 +1156,11 @@ fail_per_node:
869fail_inodes: 1156fail_inodes:
870 init_inodes(sdp, UNDO); 1157 init_inodes(sdp, UNDO);
871fail_sb: 1158fail_sb:
872 init_sb(sdp, 0, UNDO); 1159 if (sdp->sd_root_dir)
1160 dput(sdp->sd_root_dir);
1161 if (sdp->sd_master_dir)
1162 dput(sdp->sd_master_dir);
1163 sb->s_root = NULL;
873fail_locking: 1164fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 1165 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 1166fail_lm:
@@ -887,151 +1178,63 @@ fail:
887} 1178}
888 1179
889static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1180static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
890 const char *dev_name, void *data, struct vfsmount *mnt) 1181 const char *dev_name, void *data, struct vfsmount *mnt)
891{ 1182{
892 struct super_block *sb; 1183 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
893 struct gfs2_sbd *sdp;
894 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
895 if (error)
896 goto out;
897 sb = mnt->mnt_sb;
898 sdp = sb->s_fs_info;
899 sdp->sd_gfs2mnt = mnt;
900out:
901 return error;
902} 1184}
903 1185
904static int fill_super_meta(struct super_block *sb, struct super_block *new, 1186static struct super_block *get_gfs2_sb(const char *dev_name)
905 void *data, int silent)
906{ 1187{
907 struct gfs2_sbd *sdp = sb->s_fs_info; 1188 struct super_block *sb;
908 struct inode *inode;
909 int error = 0;
910
911 new->s_fs_info = sdp;
912 sdp->sd_vfs_meta = sb;
913
914 init_vfs(new, SDF_NOATIME);
915
916 /* Get the master inode */
917 inode = igrab(sdp->sd_master_dir);
918
919 new->s_root = d_alloc_root(inode);
920 if (!new->s_root) {
921 fs_err(sdp, "can't get root dentry\n");
922 error = -ENOMEM;
923 iput(inode);
924 } else
925 new->s_root->d_op = &gfs2_dops;
926
927 return error;
928}
929
930static int set_bdev_super(struct super_block *s, void *data)
931{
932 s->s_bdev = data;
933 s->s_dev = s->s_bdev->bd_dev;
934 return 0;
935}
936
937static int test_bdev_super(struct super_block *s, void *data)
938{
939 return s->s_bdev == data;
940}
941
942static struct super_block* get_gfs2_sb(const char *dev_name)
943{
944 struct kstat stat;
945 struct nameidata nd; 1189 struct nameidata nd;
946 struct super_block *sb = NULL, *s;
947 int error; 1190 int error;
948 1191
949 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 1192 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
950 if (error) { 1193 if (error) {
951 printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", 1194 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
952 dev_name); 1195 dev_name, error);
953 goto out; 1196 return NULL;
954 }
955 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
956
957 list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
958 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
959 (S_ISDIR(stat.mode) &&
960 s == nd.path.dentry->d_inode->i_sb)) {
961 sb = s;
962 goto free_nd;
963 }
964 } 1197 }
965 1198 sb = nd.path.dentry->d_inode->i_sb;
966 printk(KERN_WARNING "GFS2: Unrecognized block device or " 1199 if (sb && (sb->s_type == &gfs2_fs_type))
967 "mount point %s\n", dev_name); 1200 atomic_inc(&sb->s_active);
968 1201 else
969free_nd: 1202 sb = NULL;
970 path_put(&nd.path); 1203 path_put(&nd.path);
971out:
972 return sb; 1204 return sb;
973} 1205}
974 1206
975static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1207static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
976 const char *dev_name, void *data, struct vfsmount *mnt) 1208 const char *dev_name, void *data, struct vfsmount *mnt)
977{ 1209{
978 int error = 0; 1210 struct super_block *sb = NULL;
979 struct super_block *sb = NULL, *new;
980 struct gfs2_sbd *sdp; 1211 struct gfs2_sbd *sdp;
981 1212
982 sb = get_gfs2_sb(dev_name); 1213 sb = get_gfs2_sb(dev_name);
983 if (!sb) { 1214 if (!sb) {
984 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1215 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
985 error = -ENOENT; 1216 return -ENOENT;
986 goto error;
987 } 1217 }
988 sdp = sb->s_fs_info; 1218 sdp = sb->s_fs_info;
989 if (sdp->sd_vfs_meta) { 1219 mnt->mnt_sb = sb;
990 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 1220 mnt->mnt_root = dget(sdp->sd_master_dir);
991 error = -EBUSY; 1221 return 0;
992 goto error;
993 }
994 down(&sb->s_bdev->bd_mount_sem);
995 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
996 up(&sb->s_bdev->bd_mount_sem);
997 if (IS_ERR(new)) {
998 error = PTR_ERR(new);
999 goto error;
1000 }
1001 new->s_flags = flags;
1002 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
1003 sb_set_blocksize(new, sb->s_blocksize);
1004 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
1005 if (error) {
1006 up_write(&new->s_umount);
1007 deactivate_super(new);
1008 goto error;
1009 }
1010
1011 new->s_flags |= MS_ACTIVE;
1012
1013 /* Grab a reference to the gfs2 mount point */
1014 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
1015 return simple_set_mnt(mnt, new);
1016error:
1017 return error;
1018} 1222}
1019 1223
1020static void gfs2_kill_sb(struct super_block *sb) 1224static void gfs2_kill_sb(struct super_block *sb)
1021{ 1225{
1022 if (sb->s_fs_info) { 1226 struct gfs2_sbd *sdp = sb->s_fs_info;
1023 gfs2_delete_debugfs_file(sb->s_fs_info); 1227 if (sdp) {
1024 gfs2_meta_syncfs(sb->s_fs_info); 1228 gfs2_meta_syncfs(sdp);
1229 dput(sdp->sd_root_dir);
1230 dput(sdp->sd_master_dir);
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1025 } 1233 }
1234 shrink_dcache_sb(sb);
1026 kill_block_super(sb); 1235 kill_block_super(sb);
1027} 1236 if (sdp)
1028 1237 gfs2_delete_debugfs_file(sdp);
1029static void gfs2_kill_sb_meta(struct super_block *sb)
1030{
1031 struct gfs2_sbd *sdp = sb->s_fs_info;
1032 generic_shutdown_super(sb);
1033 sdp->sd_vfs_meta = NULL;
1034 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
1035} 1238}
1036 1239
1037struct file_system_type gfs2_fs_type = { 1240struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
1046 .name = "gfs2meta", 1249 .name = "gfs2meta",
1047 .fs_flags = FS_REQUIRES_DEV, 1250 .fs_flags = FS_REQUIRES_DEV,
1048 .get_sb = gfs2_get_sb_meta, 1251 .get_sb = gfs2_get_sb_meta,
1049 .kill_sb = gfs2_kill_sb_meta,
1050 .owner = THIS_MODULE, 1252 .owner = THIS_MODULE,
1051}; 1253};
1052 1254
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a778..534e1e2c65ca 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
161 161
162 error = gfs2_glock_nq_m(2, ghs); 162 error = gfs2_glock_nq(ghs); /* parent */
163 if (error) 163 if (error)
164 goto out; 164 goto out_parent;
165
166 error = gfs2_glock_nq(ghs + 1); /* child */
167 if (error)
168 goto out_child;
165 169
166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 170 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 171 if (error)
@@ -245,8 +249,10 @@ out_alloc:
245 if (alloc_required) 249 if (alloc_required)
246 gfs2_alloc_put(dip); 250 gfs2_alloc_put(dip);
247out_gunlock: 251out_gunlock:
248 gfs2_glock_dq_m(2, ghs); 252 gfs2_glock_dq(ghs + 1);
249out: 253out_child:
254 gfs2_glock_dq(ghs);
255out_parent:
250 gfs2_holder_uninit(ghs); 256 gfs2_holder_uninit(ghs);
251 gfs2_holder_uninit(ghs + 1); 257 gfs2_holder_uninit(ghs + 1);
252 if (!error) { 258 if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
302 308
303 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 309 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
304 if (error) 310 if (error)
305 goto out_rgrp; 311 goto out_gunlock;
306 312
307 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 313 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
308 if (error) 314 if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
316 322
317out_end_trans: 323out_end_trans:
318 gfs2_trans_end(sdp); 324 gfs2_trans_end(sdp);
325out_gunlock:
319 gfs2_glock_dq(ghs + 2); 326 gfs2_glock_dq(ghs + 2);
320out_rgrp: 327out_rgrp:
321 gfs2_holder_uninit(ghs + 2); 328 gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
485 struct gfs2_holder ri_gh; 492 struct gfs2_holder ri_gh;
486 int error; 493 int error;
487 494
488
489 error = gfs2_rindex_hold(sdp, &ri_gh); 495 error = gfs2_rindex_hold(sdp, &ri_gh);
490 if (error) 496 if (error)
491 return error; 497 return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
495 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 501 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
496 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 502 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
497 503
498 error = gfs2_glock_nq_m(3, ghs); 504 error = gfs2_glock_nq(ghs); /* parent */
499 if (error) 505 if (error)
500 goto out; 506 goto out_parent;
507
508 error = gfs2_glock_nq(ghs + 1); /* child */
509 if (error)
510 goto out_child;
511
512 error = gfs2_glock_nq(ghs + 2); /* rgrp */
513 if (error)
514 goto out_rgrp;
501 515
502 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 516 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
503 if (error) 517 if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
523 gfs2_trans_end(sdp); 537 gfs2_trans_end(sdp);
524 538
525out_gunlock: 539out_gunlock:
526 gfs2_glock_dq_m(3, ghs); 540 gfs2_glock_dq(ghs + 2);
527out: 541out_rgrp:
528 gfs2_holder_uninit(ghs);
529 gfs2_holder_uninit(ghs + 1);
530 gfs2_holder_uninit(ghs + 2); 542 gfs2_holder_uninit(ghs + 2);
543 gfs2_glock_dq(ghs + 1);
544out_child:
545 gfs2_holder_uninit(ghs + 1);
546 gfs2_glock_dq(ghs);
547out_parent:
548 gfs2_holder_uninit(ghs);
531 gfs2_glock_dq_uninit(&ri_gh); 549 gfs2_glock_dq_uninit(&ri_gh);
532 return error; 550 return error;
533} 551}
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
571 return 0; 589 return 0;
572} 590}
573 591
592/*
593 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
594 * @this: move this
595 * @to: to here
596 *
597 * Follow @to back to the root and make sure we don't encounter @this
598 * Assumes we already hold the rename lock.
599 *
600 * Returns: errno
601 */
602
603static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
604{
605 struct inode *dir = &to->i_inode;
606 struct super_block *sb = dir->i_sb;
607 struct inode *tmp;
608 struct qstr dotdot;
609 int error = 0;
610
611 gfs2_str2qstr(&dotdot, "..");
612
613 igrab(dir);
614
615 for (;;) {
616 if (dir == &this->i_inode) {
617 error = -EINVAL;
618 break;
619 }
620 if (dir == sb->s_root->d_inode) {
621 error = 0;
622 break;
623 }
624
625 tmp = gfs2_lookupi(dir, &dotdot, 1);
626 if (IS_ERR(tmp)) {
627 error = PTR_ERR(tmp);
628 break;
629 }
630
631 iput(dir);
632 dir = tmp;
633 }
634
635 iput(dir);
636
637 return error;
638}
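
The walk in gfs2_ok_to_move() has a direct userspace counterpart: chase ".." upwards from @to and compare (st_dev, st_ino) pairs until the root is reached. A minimal sketch, assuming plain POSIX stat() — purely illustrative, with its own return convention (1 = move allowed, 0 = refused, -1 = error):

#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

static int ok_to_move(const char *this, const char *to)
{
	struct stat s_this, s_dir, s_parent;
	char path[4096];

	if (stat(this, &s_this) || stat(to, &s_dir))
		return -1;
	snprintf(path, sizeof(path), "%s", to);
	for (;;) {
		if (s_dir.st_dev == s_this.st_dev &&
		    s_dir.st_ino == s_this.st_ino)
			return 0;	/* 'to' lies inside 'this': refuse */
		strncat(path, "/..", sizeof(path) - strlen(path) - 1);
		if (stat(path, &s_parent))
			return -1;
		if (s_parent.st_dev == s_dir.st_dev &&
		    s_parent.st_ino == s_dir.st_ino)
			return 1;	/* reached the root: allowed */
		s_dir = s_parent;	/* keep climbing */
	}
}

int main(int argc, char **argv)
{
	if (argc != 3)
		return 2;
	printf("%d\n", ok_to_move(argv[1], argv[2]));
	return 0;
}
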
639
574/** 640/**
575 * gfs2_rename - Rename a file 641 * gfs2_rename - Rename a file
576 * @odir: Parent directory of old file name 642 * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
589 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 655 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
590 struct gfs2_inode *nip = NULL; 656 struct gfs2_inode *nip = NULL;
591 struct gfs2_sbd *sdp = GFS2_SB(odir); 657 struct gfs2_sbd *sdp = GFS2_SB(odir);
592 struct gfs2_holder ghs[5], r_gh; 658 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
593 struct gfs2_rgrpd *nrgd; 659 struct gfs2_rgrpd *nrgd;
594 unsigned int num_gh; 660 unsigned int num_gh;
595 int dir_rename = 0; 661 int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
603 return 0; 669 return 0;
604 } 670 }
605 671
606 /* Make sure we aren't trying to move a directory into its subdir */
607
608 if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
609 dir_rename = 1;
610 672
611 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, 673 if (odip != ndip) {
612 &r_gh); 674 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
675 0, &r_gh);
613 if (error) 676 if (error)
614 goto out; 677 goto out;
615 678
616 error = gfs2_ok_to_move(ip, ndip); 679 if (S_ISDIR(ip->i_inode.i_mode)) {
617 if (error) 680 dir_rename = 1;
618 goto out_gunlock_r; 681 /* don't move a dirctory into it's subdir */
682 error = gfs2_ok_to_move(ip, ndip);
683 if (error)
684 goto out_gunlock_r;
685 }
619 } 686 }
620 687
621 num_gh = 1; 688 num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
639 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 706 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
640 } 707 }
641 708
642 error = gfs2_glock_nq_m(num_gh, ghs); 709 for (x = 0; x < num_gh; x++) {
643 if (error) 710 error = gfs2_glock_nq(ghs + x);
644 goto out_uninit; 711 if (error)
712 goto out_gunlock;
713 }
645 714
646 /* Check out the old directory */ 715 /* Check out the old directory */
647 716
@@ -804,12 +873,12 @@ out_alloc:
804 if (alloc_required) 873 if (alloc_required)
805 gfs2_alloc_put(ndip); 874 gfs2_alloc_put(ndip);
806out_gunlock: 875out_gunlock:
807 gfs2_glock_dq_m(num_gh, ghs); 876 while (x--) {
808out_uninit: 877 gfs2_glock_dq(ghs + x);
809 for (x = 0; x < num_gh; x++)
810 gfs2_holder_uninit(ghs + x); 878 gfs2_holder_uninit(ghs + x);
879 }
811out_gunlock_r: 880out_gunlock_r:
812 if (dir_rename) 881 if (r_gh.gh_gl)
813 gfs2_glock_dq_uninit(&r_gh); 882 gfs2_glock_dq_uninit(&r_gh);
814out: 883out:
815 return error; 884 return error;
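
The rename path now takes its glocks one at a time and, on failure, releases only those already acquired via the `while (x--)` unwind. The same acquire/partial-unwind pattern, sketched with POSIX mutexes rather than glocks:

#include <pthread.h>
#include <stdio.h>

/* Take locks in order; on failure release, in reverse order, only the
 * ones already held — x points just past the last successful lock. */
static int lock_all(pthread_mutex_t **locks, unsigned int num)
{
	unsigned int x;
	int error;

	for (x = 0; x < num; x++) {
		error = pthread_mutex_trylock(locks[x]);
		if (error)
			goto out_unlock;
	}
	return 0;
out_unlock:
	while (x--)
		pthread_mutex_unlock(locks[x]);
	return error;
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t *locks[] = { &a, &b };

	printf("lock_all: %d\n", lock_all(locks, 2));
	return 0;
}
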
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/time.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
25#include "incore.h" 26#include "incore.h"
@@ -38,6 +39,7 @@
38#include "dir.h" 39#include "dir.h"
39#include "eattr.h" 40#include "eattr.h"
40#include "bmap.h" 41#include "bmap.h"
42#include "meta_io.h"
41 43
42/** 44/**
43 * gfs2_write_inode - Make sure the inode is stable on the disk 45 * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
50static int gfs2_write_inode(struct inode *inode, int sync) 52static int gfs2_write_inode(struct inode *inode, int sync)
51{ 53{
52 struct gfs2_inode *ip = GFS2_I(inode); 54 struct gfs2_inode *ip = GFS2_I(inode);
53 55 struct gfs2_sbd *sdp = GFS2_SB(inode);
54 /* Check this is a "normal" inode */ 56 struct gfs2_holder gh;
55 if (test_bit(GIF_USER, &ip->i_flags)) { 57 struct buffer_head *bh;
56 if (current->flags & PF_MEMALLOC) 58 struct timespec atime;
57 return 0; 59 struct gfs2_dinode *di;
58 if (sync) 60 int ret = 0;
59 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 61
62 /* Check this is a "normal" inode, etc */
63 if (!test_bit(GIF_USER, &ip->i_flags) ||
64 (current->flags & PF_MEMALLOC))
65 return 0;
66 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
67 if (ret)
68 goto do_flush;
69 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
70 if (ret)
71 goto do_unlock;
72 ret = gfs2_meta_inode_buffer(ip, &bh);
73 if (ret == 0) {
74 di = (struct gfs2_dinode *)bh->b_data;
75 atime.tv_sec = be64_to_cpu(di->di_atime);
76 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
77 if (timespec_compare(&inode->i_atime, &atime) > 0) {
78 gfs2_trans_add_bh(ip->i_gl, bh, 1);
79 gfs2_dinode_out(ip, bh->b_data);
80 }
81 brelse(bh);
60 } 82 }
83 gfs2_trans_end(sdp);
84do_unlock:
85 gfs2_glock_dq_uninit(&gh);
86do_flush:
87 if (sync != 0)
88 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
89 return ret;
90}
61 91
62 return 0; 92/**
93 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
94 * @sdp: the filesystem
95 *
96 * Returns: errno
97 */
98
99static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
100{
101 struct gfs2_holder t_gh;
102 int error;
103
104 gfs2_quota_sync(sdp);
105 gfs2_statfs_sync(sdp);
106
107 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
108 &t_gh);
109 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
110 return error;
111
112 gfs2_meta_syncfs(sdp);
113 gfs2_log_shutdown(sdp);
114
115 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
116
117 if (t_gh.gh_gl)
118 gfs2_glock_dq_uninit(&t_gh);
119
120 gfs2_quota_cleanup(sdp);
121
122 return error;
63} 123}
64 124
65/** 125/**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
73 struct gfs2_sbd *sdp = sb->s_fs_info; 133 struct gfs2_sbd *sdp = sb->s_fs_info;
74 int error; 134 int error;
75 135
76 if (!sdp)
77 return;
78
79 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
80 return; /* Nothing to do */
81
82 /* Unfreeze the filesystem, if we need to */ 136 /* Unfreeze the filesystem, if we need to */
83 137
84 mutex_lock(&sdp->sd_freeze_lock); 138 mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
101 155
102 /* Release stuff */ 156 /* Release stuff */
103 157
104 iput(sdp->sd_master_dir);
105 iput(sdp->sd_jindex); 158 iput(sdp->sd_jindex);
106 iput(sdp->sd_inum_inode); 159 iput(sdp->sd_inum_inode);
107 iput(sdp->sd_statfs_inode); 160 iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
152 * 205 *
153 * Flushes the log to disk. 206 * Flushes the log to disk.
154 */ 207 */
208
155static int gfs2_sync_fs(struct super_block *sb, int wait) 209static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 210{
157 sb->s_dirt = 0; 211 sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
270 } 324 }
271 } 325 }
272 326
273 if (*flags & (MS_NOATIME | MS_NODIRATIME))
274 set_bit(SDF_NOATIME, &sdp->sd_flags);
275 else
276 clear_bit(SDF_NOATIME, &sdp->sd_flags);
277
278 /* Don't let the VFS update atimes. GFS2 handles this itself. */
279 *flags |= MS_NOATIME | MS_NODIRATIME;
280
281 return error; 327 return error;
282} 328}
283 329
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
295 * inode's blocks, or alternatively pass the baton on to another 341 * inode's blocks, or alternatively pass the baton on to another
296 * node for later deallocation. 342 * node for later deallocation.
297 */ 343 */
344
298static void gfs2_drop_inode(struct inode *inode) 345static void gfs2_drop_inode(struct inode *inode)
299{ 346{
300 struct gfs2_inode *ip = GFS2_I(inode); 347 struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
333 } 380 }
334} 381}
335 382
383static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
384{
385 do {
386 if (d1 == d2)
387 return 1;
388 d1 = d1->d_parent;
389 } while (!IS_ROOT(d1));
390 return 0;
391}
392
336/** 393/**
337 * gfs2_show_options - Show mount options for /proc/mounts 394 * gfs2_show_options - Show mount options for /proc/mounts
338 * @s: seq_file structure 395 * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
346 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 403 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
347 struct gfs2_args *args = &sdp->sd_args; 404 struct gfs2_args *args = &sdp->sd_args;
348 405
406 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
407 seq_printf(s, ",meta");
349 if (args->ar_lockproto[0]) 408 if (args->ar_lockproto[0])
350 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 409 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
351 if (args->ar_locktable[0]) 410 if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
414 * conversion on the iopen lock, but we can change that later. This 473 * conversion on the iopen lock, but we can change that later. This
415 * is safe, just less efficient. 474 * is safe, just less efficient.
416 */ 475 */
476
417static void gfs2_delete_inode(struct inode *inode) 477static void gfs2_delete_inode(struct inode *inode)
418{ 478{
419 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 479 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
478 clear_inode(inode); 538 clear_inode(inode);
479} 539}
480 540
481
482
483static struct inode *gfs2_alloc_inode(struct super_block *sb) 541static struct inode *gfs2_alloc_inode(struct super_block *sb)
484{ 542{
485 struct gfs2_inode *ip; 543 struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc2..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
33#include "trans.h" 33#include "trans.h"
34#include "util.h" 34#include "util.h"
35 35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_demote_secs = 300;
55 gt->gt_incore_log_blocks = 1024;
56 gt->gt_log_flush_secs = 60;
57 gt->gt_recoverd_secs = 60;
58 gt->gt_logd_secs = 1;
59 gt->gt_quotad_secs = 5;
60 gt->gt_quota_simul_sync = 64;
61 gt->gt_quota_warn_period = 10;
62 gt->gt_quota_scale_num = 1;
63 gt->gt_quota_scale_den = 1;
64 gt->gt_quota_cache_secs = 300;
65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_stall_secs = 600;
70 gt->gt_complain_secs = 10;
71 gt->gt_statfs_quantum = 30;
72 gt->gt_statfs_slow = 0;
73}
74
75/**
76 * gfs2_check_sb - Check superblock
77 * @sdp: the filesystem
78 * @sb: The superblock
79 * @silent: Don't print a message if the check fails
80 *
81 * Checks the version code of the FS is one that we understand how to
82 * read and that the sizes of the various on-disk structures have not
83 * changed.
84 */
85
86int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
87{
88 unsigned int x;
89
90 if (sb->sb_magic != GFS2_MAGIC ||
91 sb->sb_type != GFS2_METATYPE_SB) {
92 if (!silent)
93 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
94 return -EINVAL;
95 }
96
97 /* If format numbers match exactly, we're done. */
98
99 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
100 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
101 return 0;
102
103 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
104 for (x = 0; gfs2_old_fs_formats[x]; x++)
105 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
106 break;
107
108 if (!gfs2_old_fs_formats[x]) {
109 printk(KERN_WARNING
110 "GFS2: code version (%u, %u) is incompatible "
111 "with ondisk format (%u, %u)\n",
112 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
113 sb->sb_fs_format, sb->sb_multihost_format);
114 printk(KERN_WARNING
115 "GFS2: I don't know how to upgrade this FS\n");
116 return -EINVAL;
117 }
118 }
119
120 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
121 for (x = 0; gfs2_old_multihost_formats[x]; x++)
122 if (gfs2_old_multihost_formats[x] ==
123 sb->sb_multihost_format)
124 break;
125
126 if (!gfs2_old_multihost_formats[x]) {
127 printk(KERN_WARNING
128 "GFS2: code version (%u, %u) is incompatible "
129 "with ondisk format (%u, %u)\n",
130 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
131 sb->sb_fs_format, sb->sb_multihost_format);
132 printk(KERN_WARNING
133 "GFS2: I don't know how to upgrade this FS\n");
134 return -EINVAL;
135 }
136 }
137
138 if (!sdp->sd_args.ar_upgrade) {
139 printk(KERN_WARNING
140 "GFS2: code version (%u, %u) is incompatible "
141 "with ondisk format (%u, %u)\n",
142 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
143 sb->sb_fs_format, sb->sb_multihost_format);
144 printk(KERN_INFO
145 "GFS2: Use the \"upgrade\" mount option to upgrade "
146 "the FS\n");
147 printk(KERN_INFO "GFS2: See the manual for more details\n");
148 return -EINVAL;
149 }
150
151 return 0;
152}
153
154
155static void end_bio_io_page(struct bio *bio, int error)
156{
157 struct page *page = bio->bi_private;
158
159 if (!error)
160 SetPageUptodate(page);
161 else
162 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
163 unlock_page(page);
164}
165
166static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
167{
168 const struct gfs2_sb *str = buf;
169
170 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
171 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
172 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
173 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
174 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
175 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
176 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
177 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
178 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
179 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
180 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
181
182 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
183 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
184}
185
186/**
187 * gfs2_read_super - Read the gfs2 super block from disk
188 * @sdp: The GFS2 super block
189 * @sector: The location of the super block
190 * @error: The error code to return
191 *
192 * This uses the bio functions to read the super block from disk
193 * because we want to be 100% sure that we never read cached data.
194 * A super block is read only twice during each GFS2 mount and is
195 * never written to by the filesystem. The first time it's read, no
196 * locks are held, and the only details which are looked at are those
197 * relating to the locking protocol. Once locking is up and working,
198 * the sb is read again under the lock to establish the location of
199 * the master directory (contains pointers to journals etc) and the
200 * root directory.
201 *
202 * Returns: 0 on success or error
203 */
204
205int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
206{
207 struct super_block *sb = sdp->sd_vfs;
208 struct gfs2_sb *p;
209 struct page *page;
210 struct bio *bio;
211
212 page = alloc_page(GFP_NOFS);
213 if (unlikely(!page))
214 return -ENOBUFS;
215
216 ClearPageUptodate(page);
217 ClearPageDirty(page);
218 lock_page(page);
219
220 bio = bio_alloc(GFP_NOFS, 1);
221 if (unlikely(!bio)) {
222 __free_page(page);
223 return -ENOBUFS;
224 }
225
226 bio->bi_sector = sector * (sb->s_blocksize >> 9);
227 bio->bi_bdev = sb->s_bdev;
228 bio_add_page(bio, page, PAGE_SIZE, 0);
229
230 bio->bi_end_io = end_bio_io_page;
231 bio->bi_private = page;
232 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
233 wait_on_page_locked(page);
234 bio_put(bio);
235 if (!PageUptodate(page)) {
236 __free_page(page);
237 return -EIO;
238 }
239 p = kmap(page);
240 gfs2_sb_in(&sdp->sd_sb, p);
241 kunmap(page);
242 __free_page(page);
243 return 0;
244}
245
246/**
247 * gfs2_read_sb - Read super block
248 * @sdp: The GFS2 superblock
249 * @gl: the glock for the superblock (assumed to be held)
250 * @silent: Don't print message if mount fails
251 *
252 */
253
254int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
255{
256 u32 hash_blocks, ind_blocks, leaf_blocks;
257 u32 tmp_blocks;
258 unsigned int x;
259 int error;
260
261 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
262 if (error) {
263 if (!silent)
264 fs_err(sdp, "can't read superblock\n");
265 return error;
266 }
267
268 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
269 if (error)
270 return error;
271
272 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
273 GFS2_BASIC_BLOCK_SHIFT;
274 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
275 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
276 sizeof(struct gfs2_dinode)) / sizeof(u64);
277 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
278 sizeof(struct gfs2_meta_header)) / sizeof(u64);
279 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
280 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
281 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
282 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
283 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
284 sizeof(struct gfs2_meta_header)) /
285 sizeof(struct gfs2_quota_change);
286
287 /* Compute maximum reservation required to add an entry to a directory */
288
289 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
290 sdp->sd_jbsize);
291
292 ind_blocks = 0;
293 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
294 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
295 ind_blocks += tmp_blocks;
296 }
297
298 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
299
300 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
301
302 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
303 sizeof(struct gfs2_dinode);
304 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
305 for (x = 2;; x++) {
306 u64 space, d;
307 u32 m;
308
309 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
310 d = space;
311 m = do_div(d, sdp->sd_inptrs);
312
313 if (d != sdp->sd_heightsize[x - 1] || m)
314 break;
315 sdp->sd_heightsize[x] = space;
316 }
317 sdp->sd_max_height = x;
318 sdp->sd_heightsize[x] = ~0;
319 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
320
321 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
322 sizeof(struct gfs2_dinode);
323 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
324 for (x = 2;; x++) {
325 u64 space, d;
326 u32 m;
327
328 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
329 d = space;
330 m = do_div(d, sdp->sd_inptrs);
331
332 if (d != sdp->sd_jheightsize[x - 1] || m)
333 break;
334 sdp->sd_jheightsize[x] = space;
335 }
336 sdp->sd_max_jheight = x;
337 sdp->sd_jheightsize[x] = ~0;
338 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
339
340 return 0;
341}
342
343/** 36/**
344 * gfs2_jindex_hold - Grab a lock on the jindex 37 * gfs2_jindex_hold - Grab a lock on the jindex
345 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
581 return error; 274 return error;
582} 275}
583 276
584/**
585 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
586 * @sdp: the filesystem
587 *
588 * Returns: errno
589 */
590
591int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
592{
593 struct gfs2_holder t_gh;
594 int error;
595
596 gfs2_quota_sync(sdp);
597 gfs2_statfs_sync(sdp);
598
599 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
600 &t_gh);
601 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
602 return error;
603
604 gfs2_meta_syncfs(sdp);
605 gfs2_log_shutdown(sdp);
606
607 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
608
609 if (t_gh.gh_gl)
610 gfs2_glock_dq_uninit(&t_gh);
611
612 gfs2_quota_cleanup(sdp);
613
614 return error;
615}
616
617static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 277static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
618{ 278{
619 const struct gfs2_statfs_change *str = buf; 279 const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20void gfs2_lm_unmount(struct gfs2_sbd *sdp); 15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
21 16
22static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 17static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
40 struct gfs2_inode **ipp); 35 struct gfs2_inode **ipp);
41 36
42int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 37int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
43int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
44 38
45int gfs2_statfs_init(struct gfs2_sbd *sdp); 39int gfs2_statfs_init(struct gfs2_sbd *sdp);
46void gfs2_statfs_change(struct gfs2_sbd *sdp, 40void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 269ARGS_ATTR(suiddir, "%d\n");
270ARGS_ATTR(data, "%d\n"); 270ARGS_ATTR(data, "%d\n");
271 271
272/* one oddball doesn't fit the macro mold */
273static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
274{
275 return snprintf(buf, PAGE_SIZE, "%d\n",
276 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
277}
278static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
279
280static struct attribute *args_attrs[] = { 272static struct attribute *args_attrs[] = {
281 &args_attr_lockproto.attr, 273 &args_attr_lockproto.attr,
282 &args_attr_locktable.attr, 274 &args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
292 &args_attr_quota.attr, 284 &args_attr_quota.attr,
293 &args_attr_suiddir.attr, 285 &args_attr_suiddir.attr,
294 &args_attr_data.attr, 286 &args_attr_data.attr,
295 &args_attr_noatime.attr,
296 NULL, 287 NULL,
297}; 288};
298 289
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
407TUNE_ATTR(log_flush_secs, 0); 398TUNE_ATTR(log_flush_secs, 0);
408TUNE_ATTR(quota_warn_period, 0); 399TUNE_ATTR(quota_warn_period, 0);
409TUNE_ATTR(quota_quantum, 0); 400TUNE_ATTR(quota_quantum, 0);
410TUNE_ATTR(atime_quantum, 0);
411TUNE_ATTR(max_readahead, 0); 401TUNE_ATTR(max_readahead, 0);
412TUNE_ATTR(complain_secs, 0); 402TUNE_ATTR(complain_secs, 0);
413TUNE_ATTR(statfs_slow, 0); 403TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
427 &tune_attr_log_flush_secs.attr, 417 &tune_attr_log_flush_secs.attr,
428 &tune_attr_quota_warn_period.attr, 418 &tune_attr_quota_warn_period.attr,
429 &tune_attr_quota_quantum.attr, 419 &tune_attr_quota_quantum.attr,
430 &tune_attr_atime_quantum.attr,
431 &tune_attr_max_readahead.attr, 420 &tune_attr_max_readahead.attr,
432 &tune_attr_complain_secs.attr, 421 &tune_attr_complain_secs.attr,
433 &tune_attr_statfs_slow.attr, 422 &tune_attr_statfs_slow.attr,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4abb1047c689..3c7c7637719c 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -173,7 +173,7 @@ enum {
173 opt_err 173 opt_err
174}; 174};
175 175
176static match_table_t tokens = { 176static const match_table_t tokens = {
177 { opt_uid, "uid=%u" }, 177 { opt_uid, "uid=%u" },
178 { opt_gid, "gid=%u" }, 178 { opt_gid, "gid=%u" },
179 { opt_umask, "umask=%o" }, 179 { opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9997cbf8beb5..9699c56d323f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -25,7 +25,7 @@ enum {
25 opt_force, opt_err 25 opt_force, opt_err
26}; 26};
27 27
28static match_table_t tokens = { 28static const match_table_t tokens = {
29 { opt_creator, "creator=%s" }, 29 { opt_creator, "creator=%s" },
30 { opt_type, "type=%s" }, 30 { opt_type, "type=%s" },
31 { opt_umask, "umask=%o" }, 31 { opt_umask, "umask=%o" },
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b8ae9c90ada0..29ad461d568f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -215,7 +215,7 @@ enum {
215 Opt_timeshift, Opt_err, 215 Opt_timeshift, Opt_err,
216}; 216};
217 217
218static match_table_t tokens = { 218static const match_table_t tokens = {
219 {Opt_help, "help"}, 219 {Opt_help, "help"},
220 {Opt_uid, "uid=%u"}, 220 {Opt_uid, "uid=%u"},
221 {Opt_gid, "gid=%u"}, 221 {Opt_gid, "gid=%u"},
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3f58923fb39b..61edc701b0e6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -57,7 +57,7 @@ enum {
57 Opt_err, 57 Opt_err,
58}; 58};
59 59
60static match_table_t tokens = { 60static const match_table_t tokens = {
61 {Opt_size, "size=%s"}, 61 {Opt_size, "size=%s"},
62 {Opt_nr_inodes, "nr_inodes=%s"}, 62 {Opt_nr_inodes, "nr_inodes=%s"},
63 {Opt_mode, "mode=%o"}, 63 {Opt_mode, "mode=%o"},
diff --git a/fs/inode.c b/fs/inode.c
index b6726f644530..0487ddba1397 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
167 mapping->assoc_mapping = NULL; 167 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 168 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0;
169 170
170 /* 171 /*
171 * If the block_device provides a backing_dev_info for client 172 * If the block_device provides a backing_dev_info for client
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a253..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
323} 323}
324 324
325/* 325/*
326 * remove_kevent - cleans up and ultimately frees the given kevent 326 * remove_kevent - cleans up the given kevent
327 * 327 *
328 * Caller must hold dev->ev_mutex. 328 * Caller must hold dev->ev_mutex.
329 */ 329 */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
334 334
335 dev->event_count--; 335 dev->event_count--;
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
337}
337 338
339/*
340 * free_kevent - frees the given kevent.
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
338 kfree(kevent->name); 344 kfree(kevent->name);
339 kmem_cache_free(event_cachep, kevent); 345 kmem_cache_free(event_cachep, kevent);
340} 346}
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
350 struct inotify_kernel_event *kevent; 356 struct inotify_kernel_event *kevent;
351 kevent = inotify_dev_get_event(dev); 357 kevent = inotify_dev_get_event(dev);
352 remove_kevent(dev, kevent); 358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
353 } 360 }
354} 361}
355 362
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
433 dev = file->private_data; 440 dev = file->private_data;
434 441
435 while (1) { 442 while (1) {
436 int events;
437 443
438 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
439 445
440 mutex_lock(&dev->ev_mutex); 446 mutex_lock(&dev->ev_mutex);
441 events = !list_empty(&dev->events); 447 if (!list_empty(&dev->events)) {
442 mutex_unlock(&dev->ev_mutex);
443 if (events) {
444 ret = 0; 448 ret = 0;
445 break; 449 break;
446 } 450 }
451 mutex_unlock(&dev->ev_mutex);
447 452
448 if (file->f_flags & O_NONBLOCK) { 453 if (file->f_flags & O_NONBLOCK) {
449 ret = -EAGAIN; 454 ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
462 if (ret) 467 if (ret)
463 return ret; 468 return ret;
464 469
465 mutex_lock(&dev->ev_mutex);
466 while (1) { 470 while (1) {
467 struct inotify_kernel_event *kevent; 471 struct inotify_kernel_event *kevent;
468 472
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
481 } 485 }
482 break; 486 break;
483 } 487 }
488 remove_kevent(dev, kevent);
489
490 /*
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
484 495
485 if (copy_to_user(buf, &kevent->event, event_size)) { 496 if (copy_to_user(buf, &kevent->event, event_size)) {
486 ret = -EFAULT; 497 ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
498 count -= kevent->event.len; 509 count -= kevent->event.len;
499 } 510 }
500 511
501 remove_kevent(dev, kevent); 512 free_kevent(kevent);
513
514 mutex_lock(&dev->ev_mutex);
502 } 515 }
503 mutex_unlock(&dev->ev_mutex); 516 mutex_unlock(&dev->ev_mutex);
504 517
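
The reordering above exists because copy_to_user() can fault and take mmap_sem, so it must not run under ev_mutex; events are therefore detached while the lock is held and copied (then freed) after it is dropped. A simplified userspace sketch of that detach-then-copy discipline — the queue and event layout are stand-ins, not the inotify structures:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct event { struct event *next; char payload[32]; };

static pthread_mutex_t ev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct event *queue;

static int read_one(char *buf, size_t len)
{
	struct event *ev;

	pthread_mutex_lock(&ev_mutex);
	ev = queue;
	if (ev)
		queue = ev->next;	/* detach under the lock */
	pthread_mutex_unlock(&ev_mutex);
	if (!ev)
		return 0;
	/* the copy runs with no queue lock held, mirroring how
	 * copy_to_user is kept outside ev_mutex above */
	strncpy(buf, ev->payload, len);
	free(ev);
	return 1;
}

int main(void)
{
	struct event *ev = calloc(1, sizeof(*ev));
	char buf[32];

	strcpy(ev->payload, "hello");
	queue = ev;
	if (read_one(buf, sizeof(buf)))
		printf("%s\n", buf);
	return 0;
}
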
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d3..d152856c371b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h>
17#include <linux/buffer_head.h>
16 18
17#include <asm/ioctls.h> 19#include <asm/ioctls.h>
18 20
21/* So that the fiemap access checks can't overflow on 32 bit machines. */
22#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
23
19/** 24/**
20 * vfs_ioctl - call filesystem specific ioctl methods 25 * vfs_ioctl - call filesystem specific ioctl methods
21 * @filp: open file to invoke ioctl method on 26 * @filp: open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
71 return put_user(res, p); 76 return put_user(res, p);
72} 77}
73 78
79/**
80 * fiemap_fill_next_extent - Fiemap helper function
81 * @fieinfo: Fiemap context passed into ->fiemap
82 * @logical: Extent logical start offset, in bytes
83 * @phys: Extent physical start offset, in bytes
84 * @len: Extent length, in bytes
85 * @flags: FIEMAP_EXTENT flags that describe this extent
86 *
87 * Called from file system ->fiemap callback. Will populate extent
88 * info as passed in via arguments and copy to user memory. On
89 * success, extent count on fieinfo is incremented.
90 *
91 * Returns 0 on success, -errno on error, 1 if this was the last
92 * extent that will fit in user array.
93 */
94#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
95#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
96#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
97int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
98 u64 phys, u64 len, u32 flags)
99{
100 struct fiemap_extent extent;
101 struct fiemap_extent *dest = fieinfo->fi_extents_start;
102
103 /* only count the extents */
104 if (fieinfo->fi_extents_max == 0) {
105 fieinfo->fi_extents_mapped++;
106 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
107 }
108
109 if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
110 return 1;
111
112 if (flags & SET_UNKNOWN_FLAGS)
113 flags |= FIEMAP_EXTENT_UNKNOWN;
114 if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
115 flags |= FIEMAP_EXTENT_ENCODED;
116 if (flags & SET_NOT_ALIGNED_FLAGS)
117 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
118
119 memset(&extent, 0, sizeof(extent));
120 extent.fe_logical = logical;
121 extent.fe_physical = phys;
122 extent.fe_length = len;
123 extent.fe_flags = flags;
124
125 dest += fieinfo->fi_extents_mapped;
126 if (copy_to_user(dest, &extent, sizeof(extent)))
127 return -EFAULT;
128
129 fieinfo->fi_extents_mapped++;
130 if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
131 return 1;
132 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
133}
134EXPORT_SYMBOL(fiemap_fill_next_extent);
135
136/**
137 * fiemap_check_flags - check validity of requested flags for fiemap
138 * @fieinfo: Fiemap context passed into ->fiemap
139 * @fs_flags: Set of fiemap flags that the file system understands
140 *
141 * Called from file system ->fiemap callback. This will compute the
142 * intersection of valid fiemap flags and those that the fs supports. That
143 * value is then compared against the user supplied flags. In case of bad user
144 * flags, the invalid values will be written into the fieinfo structure, and
145 * -EBADR is returned, which tells ioctl_fiemap() to return those values to
146 * userspace. For this reason, a return code of -EBADR should be preserved.
147 *
148 * Returns 0 on success, -EBADR on bad flags.
149 */
150int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
151{
152 u32 incompat_flags;
153
154 incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
155 if (incompat_flags) {
156 fieinfo->fi_flags = incompat_flags;
157 return -EBADR;
158 }
159 return 0;
160}
161EXPORT_SYMBOL(fiemap_check_flags);
162
163static int fiemap_check_ranges(struct super_block *sb,
164 u64 start, u64 len, u64 *new_len)
165{
166 *new_len = len;
167
168 if (len == 0)
169 return -EINVAL;
170
171 if (start > sb->s_maxbytes)
172 return -EFBIG;
173
174 /*
175 * Shrink request scope to what the fs can actually handle.
176 */
177 if ((len > sb->s_maxbytes) ||
178 (sb->s_maxbytes - len) < start)
179 *new_len = sb->s_maxbytes - start;
180
181 return 0;
182}
183
184static int ioctl_fiemap(struct file *filp, unsigned long arg)
185{
186 struct fiemap fiemap;
187 struct fiemap_extent_info fieinfo = { 0, };
188 struct inode *inode = filp->f_path.dentry->d_inode;
189 struct super_block *sb = inode->i_sb;
190 u64 len;
191 int error;
192
193 if (!inode->i_op->fiemap)
194 return -EOPNOTSUPP;
195
196 if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
197 sizeof(struct fiemap)))
198 return -EFAULT;
199
200 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
201 return -EINVAL;
202
203 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
204 &len);
205 if (error)
206 return error;
207
208 fieinfo.fi_flags = fiemap.fm_flags;
209 fieinfo.fi_extents_max = fiemap.fm_extent_count;
210 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
211
212 if (fiemap.fm_extent_count != 0 &&
213 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
214 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
215 return -EFAULT;
216
217 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
218 filemap_write_and_wait(inode->i_mapping);
219
220 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
221 fiemap.fm_flags = fieinfo.fi_flags;
222 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
223 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
224 error = -EFAULT;
225
226 return error;
227}
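From userspace the call mirrors this layout: allocate a struct fiemap with room for fm_extent_count extents directly behind it, fill in the request window, and read fm_mapped_extents back. A hedged sketch (error handling trimmed; the extent budget of 32 is an arbitrary choice, and the uapi headers must be new enough to carry linux/fiemap.h):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static int dump_extents(int fd)
{
	unsigned int i, count = 32;		/* arbitrary extent budget */
	struct fiemap *fm;

	/* the extent array lives immediately after the header */
	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		free(fm);
		return -1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu phys %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	return 0;
}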
228
229#ifdef CONFIG_BLOCK
230
231#define blk_to_logical(inode, blk) ((blk) << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) ((offset) >> (inode)->i_blkbits)
233
234/*
235 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function
238 *
239 * This does FIEMAP for block-based inodes. Basically it will just loop
240 * through get_block until we hit the number of extents we want to map, or we
241 * go past the end of the file and hit a hole.
242 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size,
244 * then please do not use this function; it will stop at the first unmapped
245 * block beyond i_size.
246 */
247int generic_block_fiemap(struct inode *inode,
248 struct fiemap_extent_info *fieinfo, u64 start,
249 u64 len, get_block_t *get_block)
250{
251 struct buffer_head tmp;
252 unsigned int start_blk;
253 long long length = 0, map_len = 0;
254 u64 logical = 0, phys = 0, size = 0;
255 u32 flags = FIEMAP_EXTENT_MERGED;
256 int ret = 0;
257
258 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
259 return ret;
260
261 start_blk = logical_to_blk(inode, start);
262
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length;
268
269 do {
270 /*
271 * we set b_size to the total size we want so it will map as
272 * many contiguous blocks as possible at once
273 */
274 memset(&tmp, 0, sizeof(struct buffer_head));
275 tmp.b_size = map_len;
276
277 ret = get_block(inode, start_blk, &tmp, 0);
278 if (ret)
279 break;
280
281 /* HOLE */
282 if (!buffer_mapped(&tmp)) {
283 /*
284 * first hole after going past the EOF, this is our
285 * last extent
286 */
287 if (length <= 0) {
288 flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
289 ret = fiemap_fill_next_extent(fieinfo, logical,
290 phys, size,
291 flags);
292 break;
293 }
294
295 length -= blk_to_logical(inode, 1);
296
297 /* if we have holes up to/past EOF then we're done */
298 if (length <= 0)
299 break;
300
301 start_blk++;
302 } else {
303 if (length <= 0 && size) {
304 ret = fiemap_fill_next_extent(fieinfo, logical,
305 phys, size,
306 flags);
307 if (ret)
308 break;
309 }
310
311 logical = blk_to_logical(inode, start_blk);
312 phys = blk_to_logical(inode, tmp.b_blocknr);
313 size = tmp.b_size;
314 flags = FIEMAP_EXTENT_MERGED;
315
316 length -= tmp.b_size;
317 start_blk += logical_to_blk(inode, size);
318
319 /*
320 * if we are past the EOF we need to loop again to see
321 * if there is a hole so we can mark this extent as the
322 * last one, and if not keep mapping things until we
323 * find a hole, or we run out of slots in the extent
324 * array
325 */
326 if (length <= 0)
327 continue;
328
329 ret = fiemap_fill_next_extent(fieinfo, logical, phys,
330 size, flags);
331 if (ret)
332 break;
333 }
334 cond_resched();
335 } while (1);
336
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1)
341 ret = 0;
342
343 return ret;
344}
345EXPORT_SYMBOL(generic_block_fiemap);
346
347#endif /* CONFIG_BLOCK */
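A filesystem with a conventional get_block routine can then implement ->fiemap as a thin wrapper (the ext2/ext3 conversions of this era do essentially this); the "myfs" names below are placeholders:

static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}

const struct inode_operations myfs_file_inode_operations = {
	.fiemap		= myfs_fiemap,
};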
348
74static int file_ioctl(struct file *filp, unsigned int cmd, 349static int file_ioctl(struct file *filp, unsigned int cmd,
75 unsigned long arg) 350 unsigned long arg)
76{ 351{
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
80 switch (cmd) { 355 switch (cmd) {
81 case FIBMAP: 356 case FIBMAP:
82 return ioctl_fibmap(filp, p); 357 return ioctl_fibmap(filp, p);
358 case FS_IOC_FIEMAP:
359 return ioctl_fiemap(filp, arg);
83 case FIGETBSZ: 360 case FIGETBSZ:
84 return put_user(inode->i_sb->s_blocksize, p); 361 return put_user(inode->i_sb->s_blocksize, p);
85 case FIONREAD: 362 case FIONREAD:
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c4a1c3c65aac..da3cc460d4df 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
115 pgrp = task_pgrp(current); 115 pgrp = task_pgrp(current);
116 else 116 else
117 pgrp = find_vpid(who); 117 pgrp = find_vpid(who);
118 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 118 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
119 ret = set_task_ioprio(p, ioprio); 119 ret = set_task_ioprio(p, ioprio);
120 if (ret) 120 if (ret)
121 break; 121 break;
122 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 122 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
123 break; 123 break;
124 case IOPRIO_WHO_USER: 124 case IOPRIO_WHO_USER:
125 if (!who) 125 if (!who)
@@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
204 pgrp = task_pgrp(current); 204 pgrp = task_pgrp(current);
205 else 205 else
206 pgrp = find_vpid(who); 206 pgrp = find_vpid(who);
207 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 207 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
208 tmpio = get_task_ioprio(p); 208 tmpio = get_task_ioprio(p);
209 if (tmpio < 0) 209 if (tmpio < 0)
210 continue; 210 continue;
@@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
212 ret = tmpio; 212 ret = tmpio;
213 else 213 else
214 ret = ioprio_best(ret, tmpio); 214 ret = ioprio_best(ret, tmpio);
215 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 215 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
216 break; 216 break;
217 case IOPRIO_WHO_USER: 217 case IOPRIO_WHO_USER:
218 if (!who) 218 if (!who)
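With do_each_pid_thread() the IOPRIO_WHO_PGRP case now covers every thread in the process group, not just each group leader. For reference, this path is driven from userspace through the raw syscalls (glibc ships no wrapper in this era); a hedged sketch with the ABI constants spelled out:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data) \
	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PGRP		2	/* from include/linux/ioprio.h */
#define IOPRIO_CLASS_BE		2	/* best-effort scheduling class */

int main(void)
{
	/* best-effort class, priority 4, for our own process group */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PGRP, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4)) < 0)
		perror("ioprio_set");
	return 0;
}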
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 26948a6033b6..3f8af0f1505b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -310,7 +310,7 @@ enum {
310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, 310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
311}; 311};
312 312
313static match_table_t tokens = { 313static const match_table_t tokens = {
314 {Opt_norock, "norock"}, 314 {Opt_norock, "norock"},
315 {Opt_nojoliet, "nojoliet"}, 315 {Opt_nojoliet, "nojoliet"},
316 {Opt_unhide, "unhide"}, 316 {Opt_unhide, "unhide"},
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 8dee32007500..0540ca27a446 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
291 goto out; 291 goto out;
292 } 292 }
293 293
294 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_map_acquire(&handle->h_lockdep_map);
295 295
296out: 296out:
297 return handle; 297 return handle;
@@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle)
1448 spin_unlock(&journal->j_state_lock); 1448 spin_unlock(&journal->j_state_lock);
1449 } 1449 }
1450 1450
1451 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); 1451 lock_map_release(&handle->h_lockdep_map);
1452 1452
1453 jbd_free_handle(handle); 1453 jbd_free_handle(handle);
1454 return err; 1454 return err;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8a..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25 26
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 94 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
95 96
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
126 128
127 /* 129 /*
128 * Test again, another process may have checkpointed while we 130 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 131 * were waiting for the checkpoint lock. If there are no
132 * outstanding transactions there is nothing to checkpoint and
133 * we can't make progress. Abort the journal in this case.
130 */ 134 */
131 spin_lock(&journal->j_state_lock); 135 spin_lock(&journal->j_state_lock);
136 spin_lock(&journal->j_list_lock);
132 nblocks = jbd_space_needed(journal); 137 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 138 if (__jbd2_log_space_left(journal) < nblocks) {
139 int chkpt = journal->j_checkpoint_transactions != NULL;
140
141 spin_unlock(&journal->j_list_lock);
134 spin_unlock(&journal->j_state_lock); 142 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 143 if (chkpt) {
144 jbd2_log_do_checkpoint(journal);
145 } else {
146 printk(KERN_ERR "%s: no transactions\n",
147 __func__);
148 jbd2_journal_abort(journal, 0);
149 }
150
136 spin_lock(&journal->j_state_lock); 151 spin_lock(&journal->j_state_lock);
152 } else {
153 spin_unlock(&journal->j_list_lock);
137 } 154 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 155 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 156 }
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
162 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
163 * Called with j_list_lock held. 183 * Called with j_list_lock held.
164 */ 184 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 186{
167 struct journal_head *jh; 187 struct journal_head *jh;
168 struct buffer_head *bh; 188 struct buffer_head *bh;
169 tid_t this_tid; 189 tid_t this_tid;
170 int released = 0; 190 int released = 0;
191 int ret = 0;
171 192
172 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
173restart: 194restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
177 return; 198 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
195 goto restart; 216 goto restart;
196 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
197 /* 221 /*
198 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
203 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh); 228 __brelse(bh);
205 } 229 }
230
231 return ret;
206} 232}
207 233
208#define NR_BATCH 64 234#define NR_BATCH 64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
227 * 253 *
228 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
230 * 257 *
231 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
258 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
259 ret = 1; 286 ret = 1;
260 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
262 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
263 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
265 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
266 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
267 __brelse(bh); 297 __brelse(bh);
268 ret = 1;
269 } else { 298 } else {
270 /* 299 /*
271 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
298 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
299 * 328 *
300 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
301 */ 331 */
302int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
303{ 333{
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
313 * journal straight away. 343 * journal straight away.
314 */ 344 */
315 result = jbd2_cleanup_journal_tail(journal); 345 result = jbd2_cleanup_journal_tail(journal);
346 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
347 journal->j_devname, result);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 348 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 349 if (result <= 0)
318 return result; 350 return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
321 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
322 * and write it. 354 * and write it.
323 */ 355 */
356 result = 0;
324 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
325 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
326 goto out; 359 goto out;
@@ -339,7 +372,7 @@ restart:
339 int batch_count = 0; 372 int batch_count = 0;
340 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
341 struct journal_head *jh; 374 struct journal_head *jh;
342 int retry = 0; 375 int retry = 0, err;
343 376
344 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
345 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
353 } 386 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
356 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
358 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
377 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
378 * list. Let's clean up the second one 413 * list. Let's clean up the second one
379 */ 414 */
380 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
381 } 418 }
382out: 419out:
383 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
384 result = jbd2_cleanup_journal_tail(journal);
385 if (result < 0) 421 if (result < 0)
386 return result; 422 jbd2_journal_abort(journal, result);
387 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
388} 427}
389 428
390/* 429/*
@@ -400,8 +439,9 @@ out:
400 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
401 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
402 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
403 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
404 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written back to the filesystem.
405 */ 445 */
406 446
407int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
410 tid_t first_tid; 450 tid_t first_tid;
411 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
412 452
453 if (is_journal_aborted(journal))
454 return 1;
455
413 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
414 * the log block it starts at. 457 * the log block it starts at.
415 * 458 *
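Note the error plumbing these checkpoint hunks introduce: the first failure is latched into result and later successes are not allowed to overwrite it. A hedged distillation of the idiom, with hypothetical placeholder step functions:

/* illustration only; the step functions are placeholders */
static int do_step_one(void) { return -5; /* pretend -EIO */ }
static int do_step_two(void) { return 0; }

static int run_steps(void)
{
	int result = 0, err;

	err = do_step_one();
	if (err < 0 && !result)
		result = err;	/* remember the first failure */

	err = do_step_two();	/* still runs, but cannot mask it */
	if (err < 0 && !result)
		result = err;

	return result;		/* -EIO, not 0 */
}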
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95ec..0abe02c4242a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
126 127
127 JBUFFER_TRACE(descriptor, "submit commit block"); 128 JBUFFER_TRACE(descriptor, "submit commit block");
128 lock_buffer(bh); 129 lock_buffer(bh);
129 get_bh(bh); 130 clear_buffer_dirty(bh);
130 set_buffer_dirty(bh);
131 set_buffer_uptodate(bh); 131 set_buffer_uptodate(bh);
132 bh->b_end_io = journal_end_buffer_io_sync; 132 bh->b_end_io = journal_end_buffer_io_sync;
133 133
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
147 * to remember if we sent a barrier request 147 * to remember if we sent a barrier request
148 */ 148 */
149 if (ret == -EOPNOTSUPP && barrier_done) { 149 if (ret == -EOPNOTSUPP && barrier_done) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING 150 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 151 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", 152 "disabling barriers\n", journal->j_devname);
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock); 153 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JBD2_BARRIER; 154 journal->j_flags &= ~JBD2_BARRIER;
158 spin_unlock(&journal->j_state_lock); 155 spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
160 /* And try again, without the barrier */ 157 /* And try again, without the barrier */
161 lock_buffer(bh); 158 lock_buffer(bh);
162 set_buffer_uptodate(bh); 159 set_buffer_uptodate(bh);
163 set_buffer_dirty(bh); 160 clear_buffer_dirty(bh);
164 ret = submit_bh(WRITE, bh); 161 ret = submit_bh(WRITE, bh);
165 } 162 }
166 *cbh = bh; 163 *cbh = bh;
@@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
371 commit_transaction = journal->j_running_transaction; 368 commit_transaction = journal->j_running_transaction;
372 J_ASSERT(commit_transaction->t_state == T_RUNNING); 369 J_ASSERT(commit_transaction->t_state == T_RUNNING);
373 370
371 trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 journal->j_devname, commit_transaction->t_tid);
374 jbd_debug(1, "JBD: starting commit of transaction %d\n", 373 jbd_debug(1, "JBD: starting commit of transaction %d\n",
375 commit_transaction->t_tid); 374 commit_transaction->t_tid);
376 375
@@ -505,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
505 jh = commit_transaction->t_buffers; 504 jh = commit_transaction->t_buffers;
506 505
507 /* If we're in abort mode, we just un-journal the buffer and 506 /* If we're in abort mode, we just un-journal the buffer and
508 release it for background writing. */ 507 release it. */
509 508
510 if (is_journal_aborted(journal)) { 509 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 511 JBUFFER_TRACE(jh, "journal is aborting: refile");
512 jbd2_journal_refile_buffer(journal, jh); 512 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 513 /* If that was the last one, we need to clean up
@@ -681,11 +681,11 @@ start_journal_io:
681 */ 681 */
682 err = journal_finish_inode_data_buffers(journal, commit_transaction); 682 err = journal_finish_inode_data_buffers(journal, commit_transaction);
683 if (err) { 683 if (err) {
684 char b[BDEVNAME_SIZE];
685
686 printk(KERN_WARNING 684 printk(KERN_WARNING
687 "JBD2: Detected IO errors while flushing file data " 685 "JBD2: Detected IO errors while flushing file data "
688 "on %s\n", bdevname(journal->j_fs_dev, b)); 686 "on %s\n", journal->j_devname);
687 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
688 jbd2_journal_abort(journal, err);
689 err = 0; 689 err = 0;
690 } 690 }
691 691
@@ -786,6 +786,9 @@ wait_for_iobuf:
786 /* AKPM: bforget here */ 786 /* AKPM: bforget here */
787 } 787 }
788 788
789 if (err)
790 jbd2_journal_abort(journal, err);
791
789 jbd_debug(3, "JBD: commit phase 5\n"); 792 jbd_debug(3, "JBD: commit phase 5\n");
790 793
791 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 794 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -884,6 +887,8 @@ restart_loop:
884 if (buffer_jbddirty(bh)) { 887 if (buffer_jbddirty(bh)) {
885 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 888 JBUFFER_TRACE(jh, "add to new checkpointing trans");
886 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 889 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
890 if (is_journal_aborted(journal))
891 clear_buffer_jbddirty(bh);
887 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 892 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
888 __jbd2_journal_refile_buffer(jh); 893 __jbd2_journal_refile_buffer(jh);
889 jbd_unlock_bh_state(bh); 894 jbd_unlock_bh_state(bh);
@@ -990,6 +995,9 @@ restart_loop:
990 } 995 }
991 spin_unlock(&journal->j_list_lock); 996 spin_unlock(&journal->j_list_lock);
992 997
998 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
999 journal->j_devname, commit_transaction->t_tid,
1000 journal->j_tail_sequence);
993 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1001 jbd_debug(1, "JBD: commit %d complete, head %d\n",
994 journal->j_commit_sequence, journal->j_tail_sequence); 1002 journal->j_commit_sequence, journal->j_tail_sequence);
995 1003
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4edb..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
597 if (ret) 597 if (ret)
598 *retp = ret; 598 *retp = ret;
599 else { 599 else {
600 char b[BDEVNAME_SIZE];
601
602 printk(KERN_ALERT "%s: journal block not found " 600 printk(KERN_ALERT "%s: journal block not found "
603 "at offset %lu on %s\n", 601 "at offset %lu on %s\n",
604 __func__, 602 __func__, blocknr, journal->j_devname);
605 blocknr,
606 bdevname(journal->j_dev, b));
607 err = -EIO; 603 err = -EIO;
608 __journal_abort_soft(journal, err); 604 __journal_abort_soft(journal, err);
609 } 605 }
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
901 897
902static void jbd2_stats_proc_init(journal_t *journal) 898static void jbd2_stats_proc_init(journal_t *journal)
903{ 899{
904 char name[BDEVNAME_SIZE]; 900 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
905
906 bdevname(journal->j_dev, name);
907 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
908 if (journal->j_proc_entry) { 901 if (journal->j_proc_entry) {
909 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 902 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
910 &jbd2_seq_history_fops, journal); 903 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
915 908
916static void jbd2_stats_proc_exit(journal_t *journal) 909static void jbd2_stats_proc_exit(journal_t *journal)
917{ 910{
918 char name[BDEVNAME_SIZE];
919
920 bdevname(journal->j_dev, name);
921 remove_proc_entry("info", journal->j_proc_entry); 911 remove_proc_entry("info", journal->j_proc_entry);
922 remove_proc_entry("history", journal->j_proc_entry); 912 remove_proc_entry("history", journal->j_proc_entry);
923 remove_proc_entry(name, proc_jbd2_stats); 913 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
924} 914}
925 915
926static void journal_init_stats(journal_t *journal) 916static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1018{ 1008{
1019 journal_t *journal = journal_init_common(); 1009 journal_t *journal = journal_init_common();
1020 struct buffer_head *bh; 1010 struct buffer_head *bh;
1011 char *p;
1021 int n; 1012 int n;
1022 1013
1023 if (!journal) 1014 if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1039 journal->j_fs_dev = fs_dev; 1030 journal->j_fs_dev = fs_dev;
1040 journal->j_blk_offset = start; 1031 journal->j_blk_offset = start;
1041 journal->j_maxlen = len; 1032 journal->j_maxlen = len;
1033 bdevname(journal->j_dev, journal->j_devname);
1034 p = journal->j_devname;
1035 while ((p = strchr(p, '/')))
1036 *p = '!';
1042 jbd2_stats_proc_init(journal); 1037 jbd2_stats_proc_init(journal);
1043 1038
1044 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1061{ 1056{
1062 struct buffer_head *bh; 1057 struct buffer_head *bh;
1063 journal_t *journal = journal_init_common(); 1058 journal_t *journal = journal_init_common();
1059 char *p;
1064 int err; 1060 int err;
1065 int n; 1061 int n;
1066 unsigned long long blocknr; 1062 unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1070 1066
1071 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1067 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
1072 journal->j_inode = inode; 1068 journal->j_inode = inode;
1069 bdevname(journal->j_dev, journal->j_devname);
1070 p = journal->j_devname;
1071 while ((p = strchr(p, '/')))
1072 *p = '!';
1073 p = journal->j_devname + strlen(journal->j_devname);
1074 sprintf(p, ":%lu", journal->j_inode->i_ino);
1073 jbd_debug(1, 1075 jbd_debug(1,
1074 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1076 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1075 journal, inode->i_sb->s_id, inode->i_ino, 1077 journal, inode->i_sb->s_id, inode->i_ino,
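The '/'-to-'!' rewrite exists because j_devname is handed straight to proc_mkdir(), and a procfs entry name cannot contain a slash. A hedged userspace illustration of the transform:

#include <stdio.h>
#include <string.h>

/* mirrors the j_devname sanitization above */
static void sanitize_devname(char *name)
{
	char *p = name;

	while ((p = strchr(p, '/')))
		*p = '!';
}

int main(void)
{
	char name[32] = "cciss/c0d0p1";	/* example device name */

	sanitize_devname(name);
	printf("%s\n", name);		/* prints "cciss!c0d0p1" */
	return 0;
}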
@@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1253 goto out; 1255 goto out;
1254 } 1256 }
1255 1257
1258 if (buffer_write_io_error(bh)) {
1259 /*
1260 * Oh, dear. A previous attempt to write the journal
1261 * superblock failed. This could happen because the
1262 * USB device was yanked out. Or it could happen to
1263 * be a transient write error and maybe the block will
1264 * be remapped. Nothing we can do but to retry the
1265 * write and hope for the best.
1266 */
1267 printk(KERN_ERR "JBD2: previous I/O error detected "
1268 "for journal superblock update for %s.\n",
1269 journal->j_devname);
1270 clear_buffer_write_io_error(bh);
1271 set_buffer_uptodate(bh);
1272 }
1273
1256 spin_lock(&journal->j_state_lock); 1274 spin_lock(&journal->j_state_lock);
1257 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1275 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1258 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1276 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
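The superblock path above follows the buffer_head convention for retrying a failed write: the failure latches the write_io_error flag and clears the uptodate bit, so both must be reset before rewriting. A hedged distillation (rewrite_block is an illustrative helper, not kernel API; bh is assumed to still hold valid in-memory contents):

static int rewrite_block(struct buffer_head *bh)
{
	if (buffer_write_io_error(bh)) {
		clear_buffer_write_io_error(bh);	/* drop the latched error */
		set_buffer_uptodate(bh);		/* in-memory copy is still good */
	}
	mark_buffer_dirty(bh);
	sync_dirty_buffer(bh);				/* retry the write */
	if (buffer_write_io_error(bh))
		return -EIO;				/* it failed again */
	return 0;
}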
@@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1264 1282
1265 BUFFER_TRACE(bh, "marking dirty"); 1283 BUFFER_TRACE(bh, "marking dirty");
1266 mark_buffer_dirty(bh); 1284 mark_buffer_dirty(bh);
1267 if (wait) 1285 if (wait) {
1268 sync_dirty_buffer(bh); 1286 sync_dirty_buffer(bh);
1269 else 1287 if (buffer_write_io_error(bh)) {
1288 printk(KERN_ERR "JBD2: I/O error detected "
1289 "when updating journal superblock for %s.\n",
1290 journal->j_devname);
1291 clear_buffer_write_io_error(bh);
1292 set_buffer_uptodate(bh);
1293 }
1294 } else
1270 ll_rw_block(SWRITE, 1, &bh); 1295 ll_rw_block(SWRITE, 1, &bh);
1271 1296
1272out: 1297out:
@@ -1426,9 +1451,12 @@ recovery_error:
1426 * 1451 *
1427 * Release a journal_t structure once it is no longer in use by the 1452 * Release a journal_t structure once it is no longer in use by the
1428 * journaled object. 1453 * journaled object.
1454 * Return <0 if we couldn't clean up the journal.
1429 */ 1455 */
1430void jbd2_journal_destroy(journal_t *journal) 1456int jbd2_journal_destroy(journal_t *journal)
1431{ 1457{
1458 int err = 0;
1459
1432 /* Wait for the commit thread to wake up and die. */ 1460 /* Wait for the commit thread to wake up and die. */
1433 journal_kill_thread(journal); 1461 journal_kill_thread(journal);
1434 1462
@@ -1451,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
1451 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1479 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1452 spin_unlock(&journal->j_list_lock); 1480 spin_unlock(&journal->j_list_lock);
1453 1481
1454 /* We can now mark the journal as empty. */
1455 journal->j_tail = 0;
1456 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1457 if (journal->j_sb_buffer) { 1482 if (journal->j_sb_buffer) {
1458 jbd2_journal_update_superblock(journal, 1); 1483 if (!is_journal_aborted(journal)) {
1484 /* We can now mark the journal as empty. */
1485 journal->j_tail = 0;
1486 journal->j_tail_sequence =
1487 ++journal->j_transaction_sequence;
1488 jbd2_journal_update_superblock(journal, 1);
1489 } else {
1490 err = -EIO;
1491 }
1459 brelse(journal->j_sb_buffer); 1492 brelse(journal->j_sb_buffer);
1460 } 1493 }
1461 1494
@@ -1467,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
1467 jbd2_journal_destroy_revoke(journal); 1500 jbd2_journal_destroy_revoke(journal);
1468 kfree(journal->j_wbuf); 1501 kfree(journal->j_wbuf);
1469 kfree(journal); 1502 kfree(journal);
1503
1504 return err;
1470} 1505}
1471 1506
1472 1507
@@ -1692,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
1692 spin_lock(&journal->j_list_lock); 1727 spin_lock(&journal->j_list_lock);
1693 while (!err && journal->j_checkpoint_transactions != NULL) { 1728 while (!err && journal->j_checkpoint_transactions != NULL) {
1694 spin_unlock(&journal->j_list_lock); 1729 spin_unlock(&journal->j_list_lock);
1730 mutex_lock(&journal->j_checkpoint_mutex);
1695 err = jbd2_log_do_checkpoint(journal); 1731 err = jbd2_log_do_checkpoint(journal);
1732 mutex_unlock(&journal->j_checkpoint_mutex);
1696 spin_lock(&journal->j_list_lock); 1733 spin_lock(&journal->j_list_lock);
1697 } 1734 }
1698 spin_unlock(&journal->j_list_lock); 1735 spin_unlock(&journal->j_list_lock);
1736
1737 if (is_journal_aborted(journal))
1738 return -EIO;
1739
1699 jbd2_cleanup_journal_tail(journal); 1740 jbd2_cleanup_journal_tail(journal);
1700 1741
1701 /* Finally, mark the journal as really needing no recovery. 1742 /* Finally, mark the journal as really needing no recovery.
@@ -1717,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
1717 J_ASSERT(journal->j_head == journal->j_tail); 1758 J_ASSERT(journal->j_head == journal->j_tail);
1718 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1759 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1719 spin_unlock(&journal->j_state_lock); 1760 spin_unlock(&journal->j_state_lock);
1720 return err; 1761 return 0;
1721} 1762}
1722 1763
1723/** 1764/**
@@ -1761,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1761} 1802}
1762 1803
1763/* 1804/*
1764 * journal_dev_name: format a character string to describe on what
1765 * device this journal is present.
1766 */
1767
1768static const char *journal_dev_name(journal_t *journal, char *buffer)
1769{
1770 struct block_device *bdev;
1771
1772 if (journal->j_inode)
1773 bdev = journal->j_inode->i_sb->s_bdev;
1774 else
1775 bdev = journal->j_dev;
1776
1777 return bdevname(bdev, buffer);
1778}
1779
1780/*
1781 * Journal abort has very specific semantics, which we describe 1805 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1806 * for journal abort.
1783 * 1807 *
@@ -1793,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1793void __jbd2_journal_abort_hard(journal_t *journal) 1817void __jbd2_journal_abort_hard(journal_t *journal)
1794{ 1818{
1795 transaction_t *transaction; 1819 transaction_t *transaction;
1796 char b[BDEVNAME_SIZE];
1797 1820
1798 if (journal->j_flags & JBD2_ABORT) 1821 if (journal->j_flags & JBD2_ABORT)
1799 return; 1822 return;
1800 1823
1801 printk(KERN_ERR "Aborting journal on device %s.\n", 1824 printk(KERN_ERR "Aborting journal on device %s.\n",
1802 journal_dev_name(journal, b)); 1825 journal->j_devname);
1803 1826
1804 spin_lock(&journal->j_state_lock); 1827 spin_lock(&journal->j_state_lock);
1805 journal->j_flags |= JBD2_ABORT; 1828 journal->j_flags |= JBD2_ABORT;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
225 */ 225 */
226int jbd2_journal_recover(journal_t *journal) 226int jbd2_journal_recover(journal_t *journal)
227{ 227{
228 int err; 228 int err, err2;
229 journal_superblock_t * sb; 229 journal_superblock_t * sb;
230 230
231 struct recovery_info info; 231 struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
263 journal->j_transaction_sequence = ++info.end_transaction; 263 journal->j_transaction_sequence = ++info.end_transaction;
264 264
265 jbd2_journal_clear_revoke(journal); 265 jbd2_journal_clear_revoke(journal);
266 sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
267 if (!err)
268 err = err2;
269
267 return err; 270 return err;
268} 271}
269 272
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f7cadbb19fa..e5d540588fa9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
301 goto out; 301 goto out;
302 } 302 }
303 303
304 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 304 lock_map_acquire(&handle->h_lockdep_map);
305out: 305out:
306 return handle; 306 return handle;
307} 307}
@@ -1279,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle)
1279 spin_unlock(&journal->j_state_lock); 1279 spin_unlock(&journal->j_state_lock);
1280 } 1280 }
1281 1281
1282 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); 1282 lock_map_release(&handle->h_lockdep_map);
1283 1283
1284 jbd2_free_handle(handle); 1284 jbd2_free_handle(handle);
1285 return err; 1285 return err;
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdde..4c41db91eaa4 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
12#ifndef _JFFS2_FS_I 12#ifndef _JFFS2_FS_I
13#define _JFFS2_FS_I 13#define _JFFS2_FS_I
14 14
15#include <linux/version.h>
16#include <linux/rbtree.h> 15#include <linux/rbtree.h>
17#include <linux/posix_acl.h> 16#include <linux/posix_acl.h>
18#include <linux/mutex.h> 17#include <linux/mutex.h>
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3630718be395..0dae345e481b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -199,7 +199,7 @@ enum {
199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask 199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
200}; 200};
201 201
202static match_table_t tokens = { 202static const match_table_t tokens = {
203 {Opt_integrity, "integrity"}, 203 {Opt_integrity, "integrity"},
204 {Opt_nointegrity, "nointegrity"}, 204 {Opt_nointegrity, "nointegrity"},
205 {Opt_iocharset, "iocharset=%s"}, 205 {Opt_iocharset, "iocharset=%s"},
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
8 svcproc.o svcsubs.o mon.o xdr.o 8 svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; 54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
55 int status; 55 int status;
56 56
57 status = lockd_up(nlm_init->protocol); 57 status = lockd_up();
58 if (status < 0) 58 if (status < 0)
59 return ERR_PTR(status); 59 return ERR_PTR(status);
60 60
61 host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, 61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 62 nlm_init->protocol, nlm_version,
63 nlm_init->hostname, 63 nlm_init->hostname);
64 strlen(nlm_init->hostname));
65 if (host == NULL) { 64 if (host == NULL) {
66 lockd_down(); 65 lockd_down();
67 return ERR_PTR(-ENOLCK); 66 return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
142/* 141/*
143 * The server lockd has called us back to tell us the lock was granted 142 * The server lockd has called us back to tell us the lock was granted
144 */ 143 */
145__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) 144__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
146{ 145{
147 const struct file_lock *fl = &lock->fl; 146 const struct file_lock *fl = &lock->fl;
148 const struct nfs_fh *fh = &lock->fh; 147 const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
166 */ 165 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 166 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 167 continue;
169 if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) 168 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
170 continue; 169 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 170 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 171 continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
216 /* This one ensures that our parent doesn't terminate while the 215 /* This one ensures that our parent doesn't terminate while the
217 * reclaim is in progress */ 216 * reclaim is in progress */
218 lock_kernel(); 217 lock_kernel();
219 lockd_up(0); /* note: this cannot fail as lockd is already running */ 218 lockd_up(); /* note: this cannot fail as lockd is already running */
220 219
221 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 220 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
222 221
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
1/*
2 * Common code for control of lockd and nfsv4 grace periods.
3 */
4
5#include <linux/module.h>
6#include <linux/lockd/bind.h>
7
8static LIST_HEAD(grace_list);
9static DEFINE_SPINLOCK(grace_lock);
10
11/**
12 * locks_start_grace
13 * @lm: who this grace period is for
14 *
15 * A grace period is a period during which locks should not be given
16 * out. Currently grace periods are only enforced by the two lock
17 * managers (lockd and nfsd), using the locks_in_grace() function to
18 * check when they are in a grace period.
19 *
20 * This function is called to start a grace period.
21 */
22void locks_start_grace(struct lock_manager *lm)
23{
24 spin_lock(&grace_lock);
25 list_add(&lm->list, &grace_list);
26 spin_unlock(&grace_lock);
27}
28EXPORT_SYMBOL_GPL(locks_start_grace);
29
30/**
31 * locks_end_grace
32 * @lm: who this grace period is for
33 *
34 * Call this function to state that the given lock manager is ready to
35 * resume regular locking. The grace period will not end until all lock
36 * managers that called locks_start_grace() also call locks_end_grace().
37 * Note that callers count on it being safe to call this more than once,
38 * and the second call should be a no-op.
39 */
40void locks_end_grace(struct lock_manager *lm)
41{
42 spin_lock(&grace_lock);
43 list_del_init(&lm->list);
44 spin_unlock(&grace_lock);
45}
46EXPORT_SYMBOL_GPL(locks_end_grace);
47
48/**
49 * locks_in_grace
50 *
51 * Lock managers call this function to determine when it is OK for them
52 * to answer ordinary lock requests, and when they should accept only
53 * lock reclaims.
54 */
55int locks_in_grace(void)
56{
57 return !list_empty(&grace_list);
58}
59EXPORT_SYMBOL_GPL(locks_in_grace);
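To make the intended usage concrete, a hedged sketch of how a lock manager brackets its grace period with this API (the "my_" names are illustrative; the lockd and nfsd callers elsewhere in this series follow this shape):

static struct lock_manager my_manager;	/* one per lock manager instance */

static void my_startup(void)
{
	locks_start_grace(&my_manager);	/* reclaim-only window begins */
}

static __be32 my_lock_request(int reclaim)
{
	/* while any manager is in grace, only reclaims may be granted */
	if (locks_in_grace() && !reclaim)
		return nlm_lck_denied_grace_period;
	/* ... grant the lock ... */
	return nlm_granted;
}

static void my_grace_timeout(void)
{
	locks_end_grace(&my_manager);	/* safe to call more than once */
}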
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/in6.h>
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
17#include <linux/lockd/sm_inter.h> 18#include <linux/lockd/sm_inter.h>
18#include <linux/mutex.h> 19#include <linux/mutex.h>
19 20
21#include <net/ipv6.h>
20 22
21#define NLMDBG_FACILITY NLMDBG_HOSTCACHE 23#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
22#define NLM_HOST_NRHASH 32 24#define NLM_HOST_NRHASH 32
23#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
24#define NLM_HOST_REBIND (60 * HZ) 25#define NLM_HOST_REBIND (60 * HZ)
25#define NLM_HOST_EXPIRE (300 * HZ) 26#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 27#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
30static int nrhosts; 31static int nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 32static DEFINE_MUTEX(nlm_host_mutex);
32 33
33
34static void nlm_gc_hosts(void); 34static void nlm_gc_hosts(void);
35static struct nsm_handle * __nsm_find(const struct sockaddr_in *, 35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const char *, unsigned int, int); 36 const size_t salen,
37static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, 37 const char *hostname,
38 const char *hostname, 38 const size_t hostname_len,
39 unsigned int hostname_len); 39 const int create);
40
41struct nlm_lookup_host_info {
42 const int server; /* search for server|client */
43 const struct sockaddr *sap; /* address to search for */
44 const size_t salen; /* its length */
45 const unsigned short protocol; /* transport to search for */
46 const u32 version; /* NLM version to search for */
47 const char *hostname; /* remote's hostname */
48 const size_t hostname_len; /* its length */
49 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* its length */
51};
52
53/*
54 * Hash function must work well on big- and little-endian platforms
55 */
56static unsigned int __nlm_hash32(const __be32 n)
57{
58 unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
59 return hash ^ (hash >> 8);
60}
61
62static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
63{
64 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
65 return __nlm_hash32(sin->sin_addr.s_addr);
66}
67
68static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
69{
70 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
71 const struct in6_addr addr = sin6->sin6_addr;
72 return __nlm_hash32(addr.s6_addr32[0]) ^
73 __nlm_hash32(addr.s6_addr32[1]) ^
74 __nlm_hash32(addr.s6_addr32[2]) ^
75 __nlm_hash32(addr.s6_addr32[3]);
76}
77
78static unsigned int nlm_hash_address(const struct sockaddr *sap)
79{
80 unsigned int hash;
81
82 switch (sap->sa_family) {
83 case AF_INET:
84 hash = __nlm_hash_addr4(sap);
85 break;
86 case AF_INET6:
87 hash = __nlm_hash_addr6(sap);
88 break;
89 default:
90 hash = 0;
91 }
92 return hash & (NLM_HOST_NRHASH - 1);
93}
94
95static void nlm_clear_port(struct sockaddr *sap)
96{
97 switch (sap->sa_family) {
98 case AF_INET:
99 ((struct sockaddr_in *)sap)->sin_port = 0;
100 break;
101 case AF_INET6:
102 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
103 break;
104 }
105}
106
107static void nlm_display_address(const struct sockaddr *sap,
108 char *buf, const size_t len)
109{
110 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
111 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
112
113 switch (sap->sa_family) {
114 case AF_UNSPEC:
115 snprintf(buf, len, "unspecified");
116 break;
117 case AF_INET:
118 snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
119 break;
120 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, NIPQUAD_FMT,
123 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
124 else
125 snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
126 break;
127 default:
128 snprintf(buf, len, "unsupported address family");
129 break;
130 }
131}
40 132
41/* 133/*
42 * Common host lookup routine for server & client 134 * Common host lookup routine for server & client
43 */ 135 */
44static struct nlm_host *nlm_lookup_host(int server, 136static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
45 const struct sockaddr_in *sin,
46 int proto, u32 version,
47 const char *hostname,
48 unsigned int hostname_len,
49 const struct sockaddr_in *ssin)
50{ 137{
51 struct hlist_head *chain; 138 struct hlist_head *chain;
52 struct hlist_node *pos; 139 struct hlist_node *pos;
53 struct nlm_host *host; 140 struct nlm_host *host;
54 struct nsm_handle *nsm = NULL; 141 struct nsm_handle *nsm = NULL;
55 int hash;
56
57 dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
58 ", p=%d, v=%u, my role=%s, name=%.*s)\n",
59 NIPQUAD(ssin->sin_addr.s_addr),
60 NIPQUAD(sin->sin_addr.s_addr), proto, version,
61 server? "server" : "client",
62 hostname_len,
63 hostname? hostname : "<none>");
64 142
65
66 hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
67
68 /* Lock hash table */
69 mutex_lock(&nlm_host_mutex); 143 mutex_lock(&nlm_host_mutex);
70 144
71 if (time_after_eq(jiffies, next_gc)) 145 if (time_after_eq(jiffies, next_gc))
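The XOR-fold in __nlm_hash32() is what makes the bucket choice stable across byte orders: the low eight bits of the folded value are the XOR of all four bytes of the word, so masking with NLM_HOST_NRHASH-1 (31) yields the same bucket however the network-order address was loaded. A hedged userspace illustration (assumes 32 buckets; the constant is 192.168.1.10 in its two byte orders):

#include <stdint.h>
#include <stdio.h>

#define NRHASH 32	/* mirrors NLM_HOST_NRHASH */

/* mirrors __nlm_hash32() plus the bucket mask */
static unsigned int fold32(uint32_t n)
{
	unsigned int hash = n ^ (n >> 16);

	return (hash ^ (hash >> 8)) & (NRHASH - 1);
}

int main(void)
{
	/* both prints give bucket 3: XOR folding is byte-order blind */
	printf("bucket %u\n", fold32(0xc0a8010aU));
	printf("bucket %u\n", fold32(0x0a01a8c0U));	/* byte-swapped */
	return 0;
}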
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
78 * different NLM rpc_clients into one single nlm_host object. 152 * different NLM rpc_clients into one single nlm_host object.
79 * This would allow us to have one nlm_host per address. 153 * This would allow us to have one nlm_host per address.
80 */ 154 */
81 chain = &nlm_hosts[hash]; 155 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
82 hlist_for_each_entry(host, pos, chain, h_hash) { 156 hlist_for_each_entry(host, pos, chain, h_hash) {
83 if (!nlm_cmp_addr(&host->h_addr, sin)) 157 if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
84 continue; 158 continue;
85 159
86 /* See if we have an NSM handle for this client */ 160 /* See if we have an NSM handle for this client */
87 if (!nsm) 161 if (!nsm)
88 nsm = host->h_nsmhandle; 162 nsm = host->h_nsmhandle;
89 163
90 if (host->h_proto != proto) 164 if (host->h_proto != ni->protocol)
91 continue; 165 continue;
92 if (host->h_version != version) 166 if (host->h_version != ni->version)
93 continue; 167 continue;
94 if (host->h_server != server) 168 if (host->h_server != ni->server)
95 continue; 169 continue;
96 if (!nlm_cmp_addr(&host->h_saddr, ssin)) 170 if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
97 continue; 171 continue;
98 172
99 /* Move to head of hash chain. */ 173 /* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
101 hlist_add_head(&host->h_hash, chain); 175 hlist_add_head(&host->h_hash, chain);
102 176
103 nlm_get_host(host); 177 nlm_get_host(host);
178 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
179 host->h_name, host->h_addrbuf);
104 goto out; 180 goto out;
105 } 181 }
106 if (nsm)
107 atomic_inc(&nsm->sm_count);
108
109 host = NULL;
110 182
111 /* Sadly, the host isn't in our hash table yet. See if 183 /*
112 * we have an NSM handle for it. If not, create one. 184 * The host wasn't in our hash table. If we don't
185 * have an NSM handle for it yet, create one.
113 */ 186 */
114 if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) 187 if (nsm)
115 goto out; 188 atomic_inc(&nsm->sm_count);
189 else {
190 host = NULL;
191 nsm = nsm_find(ni->sap, ni->salen,
192 ni->hostname, ni->hostname_len, 1);
193 if (!nsm) {
194 dprintk("lockd: nlm_lookup_host failed; "
195 "no nsm handle\n");
196 goto out;
197 }
198 }
116 199
117 host = kzalloc(sizeof(*host), GFP_KERNEL); 200 host = kzalloc(sizeof(*host), GFP_KERNEL);
118 if (!host) { 201 if (!host) {
119 nsm_release(nsm); 202 nsm_release(nsm);
203 dprintk("lockd: nlm_lookup_host failed; no memory\n");
120 goto out; 204 goto out;
121 } 205 }
122 host->h_name = nsm->sm_name; 206 host->h_name = nsm->sm_name;
123 host->h_addr = *sin; 207 memcpy(nlm_addr(host), ni->sap, ni->salen);
124 host->h_addr.sin_port = 0; /* ouch! */ 208 host->h_addrlen = ni->salen;
125 host->h_saddr = *ssin; 209 nlm_clear_port(nlm_addr(host));
126 host->h_version = version; 210 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
127 host->h_proto = proto; 211 host->h_version = ni->version;
212 host->h_proto = ni->protocol;
128 host->h_rpcclnt = NULL; 213 host->h_rpcclnt = NULL;
129 mutex_init(&host->h_mutex); 214 mutex_init(&host->h_mutex);
130 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 215 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
135 host->h_state = 0; /* pseudo NSM state */ 220 host->h_state = 0; /* pseudo NSM state */
136 host->h_nsmstate = 0; /* real NSM state */ 221 host->h_nsmstate = 0; /* real NSM state */
137 host->h_nsmhandle = nsm; 222 host->h_nsmhandle = nsm;
138 host->h_server = server; 223 host->h_server = ni->server;
139 hlist_add_head(&host->h_hash, chain); 224 hlist_add_head(&host->h_hash, chain);
140 INIT_LIST_HEAD(&host->h_lockowners); 225 INIT_LIST_HEAD(&host->h_lockowners);
141 spin_lock_init(&host->h_lock); 226 spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
143 INIT_LIST_HEAD(&host->h_reclaim); 228 INIT_LIST_HEAD(&host->h_reclaim);
144 229
145 nrhosts++; 230 nrhosts++;
231
232 nlm_display_address((struct sockaddr *)&host->h_addr,
233 host->h_addrbuf, sizeof(host->h_addrbuf));
234 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
235 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
236
237 dprintk("lockd: nlm_lookup_host created host %s\n",
238 host->h_name);
239
146out: 240out:
147 mutex_unlock(&nlm_host_mutex); 241 mutex_unlock(&nlm_host_mutex);
148 return host; 242 return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
170 kfree(host); 264 kfree(host);
171} 265}
172 266
173/* 267/**
174 * Find an NLM server handle in the cache. If there is none, create it. 268 * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
269 * @sap: network address of server
270 * @salen: length of server address
271 * @protocol: transport protocol to use
272 * @version: NLM protocol version
273 * @hostname: '\0'-terminated hostname of server
274 *
275 * Returns an nlm_host structure that matches the passed-in
276 * [server address, transport protocol, NLM version, server hostname].
277 * If one doesn't already exist in the host cache, a new handle is
278 * created and returned.
175 */ 279 */
176struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, 280struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
177 int proto, u32 version, 281 const size_t salen,
178 const char *hostname, 282 const unsigned short protocol,
179 unsigned int hostname_len) 283 const u32 version, const char *hostname)
180{ 284{
181 struct sockaddr_in ssin = {0}; 285 const struct sockaddr source = {
182 286 .sa_family = AF_UNSPEC,
183 return nlm_lookup_host(0, sin, proto, version, 287 };
184 hostname, hostname_len, &ssin); 288 struct nlm_lookup_host_info ni = {
289 .server = 0,
290 .sap = sap,
291 .salen = salen,
292 .protocol = protocol,
293 .version = version,
294 .hostname = hostname,
295 .hostname_len = strlen(hostname),
296 .src_sap = &source,
297 .src_len = sizeof(source),
298 };
299
300 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
301 (hostname ? hostname : "<none>"), version,
302 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
303
304 return nlm_lookup_host(&ni);
185} 305}
186 306
187/* 307/**
188 * Find an NLM client handle in the cache. If there is none, create it. 308 * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
309 * @rqstp: incoming NLM request
310 * @hostname: name of client host
311 * @hostname_len: length of client hostname
312 *
313 * Returns an nlm_host structure that matches the [client address,
314 * transport protocol, NLM version, client hostname] of the passed-in
315 * NLM request. If one doesn't already exist in the host cache, a
316 * new handle is created and returned.
317 *
318 * Before possibly creating a new nlm_host, construct a sockaddr
319 * for a specific source address in case the local system has
320 * multiple network addresses. The family of the address in
321 * rq_daddr is guaranteed to be the same as the family of the
322 * address in rq_addr, so it's safe to use the same family for
323 * the source address.
189 */ 324 */
190struct nlm_host * 325struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
191nlmsvc_lookup_host(struct svc_rqst *rqstp, 326 const char *hostname,
192 const char *hostname, unsigned int hostname_len) 327 const size_t hostname_len)
193{ 328{
194 struct sockaddr_in ssin = {0}; 329 struct sockaddr_in sin = {
330 .sin_family = AF_INET,
331 };
332 struct sockaddr_in6 sin6 = {
333 .sin6_family = AF_INET6,
334 };
335 struct nlm_lookup_host_info ni = {
336 .server = 1,
337 .sap = svc_addr(rqstp),
338 .salen = rqstp->rq_addrlen,
339 .protocol = rqstp->rq_prot,
340 .version = rqstp->rq_vers,
341 .hostname = hostname,
342 .hostname_len = hostname_len,
343 .src_len = rqstp->rq_addrlen,
344 };
345
346 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
347 (int)hostname_len, hostname, rqstp->rq_vers,
348 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
349
350 switch (ni.sap->sa_family) {
351 case AF_INET:
352 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
353 ni.src_sap = (struct sockaddr *)&sin;
354 break;
355 case AF_INET6:
356 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
357 ni.src_sap = (struct sockaddr *)&sin6;
358 break;
359 default:
360 return NULL;
361 }
195 362
196 ssin.sin_addr = rqstp->rq_daddr.addr; 363 return nlm_lookup_host(&ni);
197 return nlm_lookup_host(1, svc_addr_in(rqstp),
198 rqstp->rq_prot, rqstp->rq_vers,
199 hostname, hostname_len, &ssin);
200} 364}
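
On the server side the request itself carries everything the lookup needs, so a hypothetical NLM procedure resolves its peer in one call. Sketch (assuming struct nlm_lock's caller/len fields hold the client hostname as sent on the wire):

        static struct nlm_host *example_peer(struct svc_rqst *rqstp,
                                             const struct nlm_lock *lock)
        {
                /* caller/len: client-supplied hostname and its length */
                return nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
        }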
201 365
202/* 366/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
207{ 371{
208 struct rpc_clnt *clnt; 372 struct rpc_clnt *clnt;
209 373
210 dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", 374 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
211 NIPQUAD(host->h_saddr.sin_addr), 375 host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
212 NIPQUAD(host->h_addr.sin_addr));
213 376
214 /* Lock host handle */ 377 /* Lock host handle */
215 mutex_lock(&host->h_mutex); 378 mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
221 if (time_after_eq(jiffies, host->h_nextrebind)) { 384 if (time_after_eq(jiffies, host->h_nextrebind)) {
222 rpc_force_rebind(clnt); 385 rpc_force_rebind(clnt);
223 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 386 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
224 dprintk("lockd: next rebind in %ld jiffies\n", 387 dprintk("lockd: next rebind in %lu jiffies\n",
225 host->h_nextrebind - jiffies); 388 host->h_nextrebind - jiffies);
226 } 389 }
227 } else { 390 } else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
234 }; 397 };
235 struct rpc_create_args args = { 398 struct rpc_create_args args = {
236 .protocol = host->h_proto, 399 .protocol = host->h_proto,
237 .address = (struct sockaddr *)&host->h_addr, 400 .address = nlm_addr(host),
238 .addrsize = sizeof(host->h_addr), 401 .addrsize = host->h_addrlen,
239 .saddress = (struct sockaddr *)&host->h_saddr, 402 .saddress = nlm_srcaddr(host),
240 .timeout = &timeparms, 403 .timeout = &timeparms,
241 .servername = host->h_name, 404 .servername = host->h_name,
242 .program = &nlm_program, 405 .program = &nlm_program,
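
The nlm_addr() and nlm_srcaddr() accessors are new and not shown in this diff; presumably they just cast the host's generic address storage back to struct sockaddr, roughly as follows (an assumption -- the h_addr/h_srcaddr field names and the sockaddr_storage type are guesses, not confirmed by this patch):

        static inline struct sockaddr *nlm_addr(struct nlm_host *host)
        {
                /* assumed: h_addr is now a struct sockaddr_storage */
                return (struct sockaddr *)&host->h_addr;
        }

        static inline struct sockaddr *nlm_srcaddr(struct nlm_host *host)
        {
                /* assumed: the source address got the same treatment */
                return (struct sockaddr *)&host->h_srcaddr;
        }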
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
324 struct nsm_handle *nsm; 487 struct nsm_handle *nsm;
325 struct nlm_host *host; 488 struct nlm_host *host;
326 489
327 dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", 490 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
328 hostname, NIPQUAD(sin->sin_addr)); 491 hostname, hostname_len, 0);
329 492 if (nsm == NULL) {
330 /* Find the NSM handle for this peer */ 493 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
331 if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) 494 hostname_len, hostname);
332 return; 495 return;
496 }
497
498 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
499 hostname_len, hostname, nsm->sm_addrbuf);
333 500
334 /* When reclaiming locks on this peer, make sure that 501 /* When reclaiming locks on this peer, make sure that
335 * we set up a new notification */ 502 * we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
461static LIST_HEAD(nsm_handles); 628static LIST_HEAD(nsm_handles);
462static DEFINE_SPINLOCK(nsm_lock); 629static DEFINE_SPINLOCK(nsm_lock);
463 630
464static struct nsm_handle * 631static struct nsm_handle *nsm_find(const struct sockaddr *sap,
465__nsm_find(const struct sockaddr_in *sin, 632 const size_t salen,
466 const char *hostname, unsigned int hostname_len, 633 const char *hostname,
467 int create) 634 const size_t hostname_len,
635 const int create)
468{ 636{
469 struct nsm_handle *nsm = NULL; 637 struct nsm_handle *nsm = NULL;
470 struct nsm_handle *pos; 638 struct nsm_handle *pos;
471 639
472 if (!sin) 640 if (!sap)
473 return NULL; 641 return NULL;
474 642
475 if (hostname && memchr(hostname, '/', hostname_len) != NULL) { 643 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
476 if (printk_ratelimit()) { 644 if (printk_ratelimit()) {
477 printk(KERN_WARNING "Invalid hostname \"%.*s\" " 645 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
478 "in NFS lock request\n", 646 "in NFS lock request\n",
479 hostname_len, hostname); 647 (int)hostname_len, hostname);
480 } 648 }
481 return NULL; 649 return NULL;
482 } 650 }
@@ -489,7 +657,7 @@ retry:
489 if (strlen(pos->sm_name) != hostname_len 657 if (strlen(pos->sm_name) != hostname_len
490 || memcmp(pos->sm_name, hostname, hostname_len)) 658 || memcmp(pos->sm_name, hostname, hostname_len))
491 continue; 659 continue;
492 } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) 660 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
493 continue; 661 continue;
494 atomic_inc(&pos->sm_count); 662 atomic_inc(&pos->sm_count);
495 kfree(nsm); 663 kfree(nsm);
@@ -509,10 +677,13 @@ retry:
509 if (nsm == NULL) 677 if (nsm == NULL)
510 return NULL; 678 return NULL;
511 679
512 nsm->sm_addr = *sin; 680 memcpy(nsm_addr(nsm), sap, salen);
681 nsm->sm_addrlen = salen;
513 nsm->sm_name = (char *) (nsm + 1); 682 nsm->sm_name = (char *) (nsm + 1);
514 memcpy(nsm->sm_name, hostname, hostname_len); 683 memcpy(nsm->sm_name, hostname, hostname_len);
515 nsm->sm_name[hostname_len] = '\0'; 684 nsm->sm_name[hostname_len] = '\0';
685 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
686 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
516 atomic_set(&nsm->sm_count, 1); 687 atomic_set(&nsm->sm_count, 1);
517 goto retry; 688 goto retry;
518 689
@@ -521,13 +692,6 @@ found:
521 return nsm; 692 return nsm;
522} 693}
523 694
524static struct nsm_handle *
525nsm_find(const struct sockaddr_in *sin, const char *hostname,
526 unsigned int hostname_len)
527{
528 return __nsm_find(sin, hostname, hostname_len, 1);
529}
530
531/* 695/*
532 * Release an NSM handle 696 * Release an NSM handle
533 */ 697 */
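
The rewritten nsm_find() keeps lockd's optimistic-allocation idiom: search under nsm_lock; if the handle is missing, drop the lock, allocate, and retry the search, so a concurrent creator wins and the loser's allocation is freed. The same pattern in miniature (illustrative, with a made-up item type):

        struct example_item {
                struct list_head link;
                atomic_t count;
                int key;
        };

        static LIST_HEAD(example_items);
        static DEFINE_SPINLOCK(example_lock);

        static struct example_item *example_find(int key)
        {
                struct example_item *new = NULL;
                struct example_item *pos;

        retry:
                spin_lock(&example_lock);
                list_for_each_entry(pos, &example_items, link) {
                        if (pos->key != key)
                                continue;
                        atomic_inc(&pos->count);  /* found: take a reference */
                        kfree(new);               /* racing copy, if any */
                        spin_unlock(&example_lock);
                        return pos;
                }
                if (new != NULL) {
                        /* second pass: nobody beat us, insert our copy */
                        list_add(&new->link, &example_items);
                        spin_unlock(&example_lock);
                        return new;
                }
                spin_unlock(&example_lock);

                new = kzalloc(sizeof(*new), GFP_KERNEL); /* allocate unlocked */
                if (new == NULL)
                        return NULL;
                new->key = key;
                atomic_set(&new->count, 1);
                goto retry;
        }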
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
51 51
52 memset(&args, 0, sizeof(args)); 52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name; 53 args.mon_name = nsm->sm_name;
54 args.addr = nsm->sm_addr.sin_addr.s_addr; 54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM; 55 args.prog = NLM_PROGRAM;
56 args.vers = 3; 56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY; 57 args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9df..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 51static unsigned int nlmsvc_users;
52static struct task_struct *nlmsvc_task; 52static struct task_struct *nlmsvc_task;
53static struct svc_rqst *nlmsvc_rqst; 53static struct svc_rqst *nlmsvc_rqst;
54int nlmsvc_grace_period;
55unsigned long nlmsvc_timeout; 54unsigned long nlmsvc_timeout;
56 55
57/* 56/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
85 return nlm_timeout * 5 * HZ; 84 return nlm_timeout * 5 * HZ;
86} 85}
87 86
88unsigned long get_nfs_grace_period(void) 87static struct lock_manager lockd_manager = {
89{ 88};
90 unsigned long lockdgrace = get_lockd_grace_period();
91 unsigned long nfsdgrace = 0;
92
93 if (nlmsvc_ops)
94 nfsdgrace = nlmsvc_ops->get_grace_period();
95
96 return max(lockdgrace, nfsdgrace);
97}
98EXPORT_SYMBOL(get_nfs_grace_period);
99 89
100static unsigned long set_grace_period(void) 90static void grace_ender(struct work_struct *not_used)
101{ 91{
102 nlmsvc_grace_period = 1; 92 locks_end_grace(&lockd_manager);
103 return get_nfs_grace_period() + jiffies;
104} 93}
105 94
106static inline void clear_grace_period(void) 95static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
96
97static void set_grace_period(void)
107{ 98{
108 nlmsvc_grace_period = 0; 99 unsigned long grace_period = get_lockd_grace_period();
100
101 locks_start_grace(&lockd_manager);
102 cancel_delayed_work_sync(&grace_period_end);
103 schedule_delayed_work(&grace_period_end, grace_period);
109} 104}
110 105
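
Grace-period expiry is now event-driven instead of being polled in lockd's main loop: set_grace_period() arms a delayed work item, and grace_ender() runs when it fires. The same pattern in isolation (sketch with illustrative names):

        static void example_timeout(struct work_struct *unused)
        {
                /* runs in process context once the delay elapses */
        }

        static DECLARE_DELAYED_WORK(example_work, example_timeout);

        static void example_arm(unsigned long delay_jiffies)
        {
                /* cancel_delayed_work_sync() makes re-arming safe: any
                 * previously queued instance has finished before the new
                 * one is queued */
                cancel_delayed_work_sync(&example_work);
                schedule_delayed_work(&example_work, delay_jiffies);
        }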
111/* 106/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
116{ 111{
117 int err = 0, preverr = 0; 112 int err = 0, preverr = 0;
118 struct svc_rqst *rqstp = vrqstp; 113 struct svc_rqst *rqstp = vrqstp;
119 unsigned long grace_period_expire;
120 114
121 /* try_to_freeze() is called from svc_recv() */ 115 /* try_to_freeze() is called from svc_recv() */
122 set_freezable(); 116 set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
139 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
140 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
141 135
142 grace_period_expire = set_grace_period(); 136 set_grace_period();
143 137
144 /* 138 /*
145 * The main request loop. We don't terminate until the last 139 * The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
153 flush_signals(current); 147 flush_signals(current);
154 if (nlmsvc_ops) { 148 if (nlmsvc_ops) {
155 nlmsvc_invalidate_all(); 149 nlmsvc_invalidate_all();
156 grace_period_expire = set_grace_period(); 150 set_grace_period();
157 } 151 }
158 continue; 152 continue;
159 } 153 }
160 154
161 /* 155 timeout = nlmsvc_retry_blocked();
162 * Retry any blocked locks that have been notified by
163 * the VFS. Don't do this during grace period.
164 * (Theoretically, there shouldn't even be blocked locks
165 * during grace period).
166 */
167 if (!nlmsvc_grace_period) {
168 timeout = nlmsvc_retry_blocked();
169 } else if (time_before(grace_period_expire, jiffies))
170 clear_grace_period();
171 156
172 /* 157 /*
173 * Find a socket with data available and call its 158 * Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
195 svc_process(rqstp); 180 svc_process(rqstp);
196 } 181 }
197 flush_signals(current); 182 flush_signals(current);
183 cancel_delayed_work_sync(&grace_period_end);
198 if (nlmsvc_ops) 184 if (nlmsvc_ops)
199 nlmsvc_invalidate_all(); 185 nlmsvc_invalidate_all();
200 nlm_shutdown_hosts(); 186 nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
203} 189}
204 190
205/* 191/*
206 * Make any sockets that are needed but not present. 192 * Ensure there are active UDP and TCP listeners for lockd.
207 * If nlm_udpport or nlm_tcpport were set as module 193 *
208 * options, make those sockets unconditionally 194 * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
195 * local services (such as rpc.statd) still require UDP, and
196 * some NFS servers do not yet support NLM over TCP.
197 *
198 * Returns zero if all listeners are available; otherwise a
199 * negative errno value is returned.
209 */ 200 */
210static int make_socks(struct svc_serv *serv, int proto) 201static int make_socks(struct svc_serv *serv)
211{ 202{
212 static int warned; 203 static int warned;
213 struct svc_xprt *xprt; 204 struct svc_xprt *xprt;
214 int err = 0; 205 int err = 0;
215 206
216 if (proto == IPPROTO_UDP || nlm_udpport) { 207 xprt = svc_find_xprt(serv, "udp", 0, 0);
217 xprt = svc_find_xprt(serv, "udp", 0, 0); 208 if (!xprt)
218 if (!xprt) 209 err = svc_create_xprt(serv, "udp", nlm_udpport,
219 err = svc_create_xprt(serv, "udp", nlm_udpport, 210 SVC_SOCK_DEFAULTS);
220 SVC_SOCK_DEFAULTS); 211 else
221 else 212 svc_xprt_put(xprt);
222 svc_xprt_put(xprt); 213 if (err >= 0) {
223 }
224 if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
225 xprt = svc_find_xprt(serv, "tcp", 0, 0); 214 xprt = svc_find_xprt(serv, "tcp", 0, 0);
226 if (!xprt) 215 if (!xprt)
227 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 216 err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
241/* 230/*
242 * Bring up the lockd process if it's not already up. 231 * Bring up the lockd process if it's not already up.
243 */ 232 */
244int 233int lockd_up(void)
245lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
246{ 234{
247 struct svc_serv *serv; 235 struct svc_serv *serv;
248 int error = 0; 236 int error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
251 /* 239 /*
252 * Check whether we're already up and running. 240 * Check whether we're already up and running.
253 */ 241 */
254 if (nlmsvc_rqst) { 242 if (nlmsvc_rqst)
255 if (proto)
256 error = make_socks(nlmsvc_rqst->rq_server, proto);
257 goto out; 243 goto out;
258 }
259 244
260 /* 245 /*
261 * Sanity check: if there's no pid, 246 * Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
266 "lockd_up: no pid, %d users??\n", nlmsvc_users); 251 "lockd_up: no pid, %d users??\n", nlmsvc_users);
267 252
268 error = -ENOMEM; 253 error = -ENOMEM;
269 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); 254 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
270 if (!serv) { 255 if (!serv) {
271 printk(KERN_WARNING "lockd_up: create service failed\n"); 256 printk(KERN_WARNING "lockd_up: create service failed\n");
272 goto out; 257 goto out;
273 } 258 }
274 259
275 if ((error = make_socks(serv, proto)) < 0) 260 error = make_socks(serv);
261 if (error < 0)
276 goto destroy_and_out; 262 goto destroy_and_out;
277 263
278 /* 264 /*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 399444639337..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -83,17 +83,11 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
83{ 83{
84 struct nlm_host *host; 84 struct nlm_host *host;
85 struct nlm_file *file; 85 struct nlm_file *file;
86 int rc = rpc_success; 86 __be32 rc = rpc_success;
87 87
88 dprintk("lockd: TEST4 called\n"); 88 dprintk("lockd: TEST4 called\n");
89 resp->cookie = argp->cookie; 89 resp->cookie = argp->cookie;
90 90
91 /* Don't accept test requests during grace period */
92 if (nlmsvc_grace_period) {
93 resp->status = nlm_lck_denied_grace_period;
94 return rc;
95 }
96
97 /* Obtain client and file */ 91 /* Obtain client and file */
98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 92 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
99 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 93 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -116,18 +110,12 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
116{ 110{
117 struct nlm_host *host; 111 struct nlm_host *host;
118 struct nlm_file *file; 112 struct nlm_file *file;
119 int rc = rpc_success; 113 __be32 rc = rpc_success;
120 114
121 dprintk("lockd: LOCK called\n"); 115 dprintk("lockd: LOCK called\n");
122 116
123 resp->cookie = argp->cookie; 117 resp->cookie = argp->cookie;
124 118
125 /* Don't accept new lock requests during grace period */
126 if (nlmsvc_grace_period && !argp->reclaim) {
127 resp->status = nlm_lck_denied_grace_period;
128 return rc;
129 }
130
131 /* Obtain client and file */ 119 /* Obtain client and file */
132 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 120 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
133 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 121 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
146 134
147 /* Now try to lock the file */ 135 /* Now try to lock the file */
148 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, 136 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
149 argp->block, &argp->cookie); 137 argp->block, &argp->cookie,
138 argp->reclaim);
150 if (resp->status == nlm_drop_reply) 139 if (resp->status == nlm_drop_reply)
151 rc = rpc_drop_reply; 140 rc = rpc_drop_reply;
152 else 141 else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
169 resp->cookie = argp->cookie; 158 resp->cookie = argp->cookie;
170 159
171 /* Don't accept requests during grace period */ 160 /* Don't accept requests during grace period */
172 if (nlmsvc_grace_period) { 161 if (locks_in_grace()) {
173 resp->status = nlm_lck_denied_grace_period; 162 resp->status = nlm_lck_denied_grace_period;
174 return rpc_success; 163 return rpc_success;
175 } 164 }
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
202 resp->cookie = argp->cookie; 191 resp->cookie = argp->cookie;
203 192
204 /* Don't accept new lock requests during grace period */ 193 /* Don't accept new lock requests during grace period */
205 if (nlmsvc_grace_period) { 194 if (locks_in_grace()) {
206 resp->status = nlm_lck_denied_grace_period; 195 resp->status = nlm_lck_denied_grace_period;
207 return rpc_success; 196 return rpc_success;
208 } 197 }
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
231 resp->cookie = argp->cookie; 220 resp->cookie = argp->cookie;
232 221
233 dprintk("lockd: GRANTED called\n"); 222 dprintk("lockd: GRANTED called\n");
234 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 223 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
235 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 224 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
236 return rpc_success; 225 return rpc_success;
237} 226}
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
341 resp->cookie = argp->cookie; 330 resp->cookie = argp->cookie;
342 331
343 /* Don't accept new lock requests during grace period */ 332 /* Don't accept new lock requests during grace period */
344 if (nlmsvc_grace_period && !argp->reclaim) { 333 if (locks_in_grace() && !argp->reclaim) {
345 resp->status = nlm_lck_denied_grace_period; 334 resp->status = nlm_lck_denied_grace_period;
346 return rpc_success; 335 return rpc_success;
347 } 336 }
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
374 resp->cookie = argp->cookie; 363 resp->cookie = argp->cookie;
375 364
376 /* Don't accept requests during grace period */ 365 /* Don't accept requests during grace period */
377 if (nlmsvc_grace_period) { 366 if (locks_in_grace()) {
378 resp->status = nlm_lck_denied_grace_period; 367 resp->status = nlm_lck_denied_grace_period;
379 return rpc_success; 368 return rpc_success;
380 } 369 }
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
432{ 421{
433 struct sockaddr_in saddr; 422 struct sockaddr_in saddr;
434 423
435 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
436
437 dprintk("lockd: SM_NOTIFY called\n"); 424 dprintk("lockd: SM_NOTIFY called\n");
438 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 425
439 || ntohs(saddr.sin_port) >= 1024) { 426 if (!nlm_privileged_requester(rqstp)) {
440 char buf[RPC_MAX_ADDRBUFLEN]; 427 char buf[RPC_MAX_ADDRBUFLEN];
441 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 428 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
442 svc_print_addr(rqstp, buf, sizeof(buf))); 429 svc_print_addr(rqstp, buf, sizeof(buf)));
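
nlm_privileged_requester() is new and not part of this diff; presumably it generalizes the old IPv4-only test (loopback source, privileged port) to both address families, along these lines (assumed implementation, not confirmed by this patch):

        /* Assumed shape; PROT_SOCK is the privileged-port bound (1024) */
        static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
        {
                const struct sockaddr *sap = svc_addr(rqstp);

                switch (sap->sa_family) {
                case AF_INET: {
                        const struct sockaddr_in *sin =
                                        (const struct sockaddr_in *)sap;
                        if (ntohs(sin->sin_port) >= PROT_SOCK)
                                return 0;
                        return ipv4_is_loopback(sin->sin_addr.s_addr);
                }
                case AF_INET6: {
                        const struct sockaddr_in6 *sin6 =
                                        (const struct sockaddr_in6 *)sap;
                        if (ntohs(sin6->sin6_port) >= PROT_SOCK)
                                return 0;
                        return ipv6_addr_type(&sin6->sin6_addr) &
                                        IPV6_ADDR_LOOPBACK;
                }
                default:
                        return 0;
                }
        }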
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318d..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
360__be32 360__be32
361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, 361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
362 struct nlm_host *host, struct nlm_lock *lock, int wait, 362 struct nlm_host *host, struct nlm_lock *lock, int wait,
363 struct nlm_cookie *cookie) 363 struct nlm_cookie *cookie, int reclaim)
364{ 364{
365 struct nlm_block *block = NULL; 365 struct nlm_block *block = NULL;
366 int error; 366 int error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
406 goto out; 406 goto out;
407 } 407 }
408 408
409 if (locks_in_grace() && !reclaim) {
410 ret = nlm_lck_denied_grace_period;
411 goto out;
412 }
413 if (reclaim && !locks_in_grace()) {
414 ret = nlm_lck_denied_grace_period;
415 goto out;
416 }
417
409 if (!wait) 418 if (!wait)
410 lock->fl.fl_flags &= ~FL_SLEEP; 419 lock->fl.fl_flags &= ~FL_SLEEP;
411 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); 420 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
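
The two new checks in nlmsvc_lock() encode a single rule: the reclaim flag must agree with the grace state. Ordinary requests are refused during grace, and reclaims are refused once it ends. Equivalently (illustrative helper, not in the patch):

        /* A lock request is acceptable iff "reclaim" matches the grace state */
        static inline int example_grace_permits(int reclaim)
        {
                if (locks_in_grace())
                        return reclaim;
                return !reclaim;
        }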
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
502 goto out; 511 goto out;
503 } 512 }
504 513
514 if (locks_in_grace()) {
515 ret = nlm_lck_denied_grace_period;
516 goto out;
517 }
505 error = vfs_test_lock(file->f_file, &lock->fl); 518 error = vfs_test_lock(file->f_file, &lock->fl);
506 if (error == FILE_LOCK_DEFERRED) { 519 if (error == FILE_LOCK_DEFERRED) {
507 ret = nlmsvc_defer_lock_rqst(rqstp, block); 520 ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
582 (long long)lock->fl.fl_start, 595 (long long)lock->fl.fl_start,
583 (long long)lock->fl.fl_end); 596 (long long)lock->fl.fl_end);
584 597
598 if (locks_in_grace())
599 return nlm_lck_denied_grace_period;
600
585 mutex_lock(&file->f_mutex); 601 mutex_lock(&file->f_mutex);
586 block = nlmsvc_lookup_block(file, lock); 602 block = nlmsvc_lookup_block(file, lock);
587 mutex_unlock(&file->f_mutex); 603 mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76019d2ff72d..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -112,17 +112,11 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
112{ 112{
113 struct nlm_host *host; 113 struct nlm_host *host;
114 struct nlm_file *file; 114 struct nlm_file *file;
115 int rc = rpc_success; 115 __be32 rc = rpc_success;
116 116
117 dprintk("lockd: TEST called\n"); 117 dprintk("lockd: TEST called\n");
118 resp->cookie = argp->cookie; 118 resp->cookie = argp->cookie;
119 119
120 /* Don't accept test requests during grace period */
121 if (nlmsvc_grace_period) {
122 resp->status = nlm_lck_denied_grace_period;
123 return rc;
124 }
125
126 /* Obtain client and file */ 120 /* Obtain client and file */
127 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 121 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
128 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 122 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,18 +140,12 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
146{ 140{
147 struct nlm_host *host; 141 struct nlm_host *host;
148 struct nlm_file *file; 142 struct nlm_file *file;
149 int rc = rpc_success; 143 __be32 rc = rpc_success;
150 144
151 dprintk("lockd: LOCK called\n"); 145 dprintk("lockd: LOCK called\n");
152 146
153 resp->cookie = argp->cookie; 147 resp->cookie = argp->cookie;
154 148
155 /* Don't accept new lock requests during grace period */
156 if (nlmsvc_grace_period && !argp->reclaim) {
157 resp->status = nlm_lck_denied_grace_period;
158 return rc;
159 }
160
161 /* Obtain client and file */ 149 /* Obtain client and file */
162 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 150 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
163 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 151 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
176 164
177 /* Now try to lock the file */ 165 /* Now try to lock the file */
178 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, 166 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
179 argp->block, &argp->cookie)); 167 argp->block, &argp->cookie,
168 argp->reclaim));
180 if (resp->status == nlm_drop_reply) 169 if (resp->status == nlm_drop_reply)
181 rc = rpc_drop_reply; 170 rc = rpc_drop_reply;
182 else 171 else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
199 resp->cookie = argp->cookie; 188 resp->cookie = argp->cookie;
200 189
201 /* Don't accept requests during grace period */ 190 /* Don't accept requests during grace period */
202 if (nlmsvc_grace_period) { 191 if (locks_in_grace()) {
203 resp->status = nlm_lck_denied_grace_period; 192 resp->status = nlm_lck_denied_grace_period;
204 return rpc_success; 193 return rpc_success;
205 } 194 }
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
232 resp->cookie = argp->cookie; 221 resp->cookie = argp->cookie;
233 222
234 /* Don't accept new lock requests during grace period */ 223 /* Don't accept new lock requests during grace period */
235 if (nlmsvc_grace_period) { 224 if (locks_in_grace()) {
236 resp->status = nlm_lck_denied_grace_period; 225 resp->status = nlm_lck_denied_grace_period;
237 return rpc_success; 226 return rpc_success;
238 } 227 }
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
261 resp->cookie = argp->cookie; 250 resp->cookie = argp->cookie;
262 251
263 dprintk("lockd: GRANTED called\n"); 252 dprintk("lockd: GRANTED called\n");
264 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 253 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
265 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 254 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
266 return rpc_success; 255 return rpc_success;
267} 256}
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
373 resp->cookie = argp->cookie; 362 resp->cookie = argp->cookie;
374 363
375 /* Don't accept new lock requests during grace period */ 364 /* Don't accept new lock requests during grace period */
376 if (nlmsvc_grace_period && !argp->reclaim) { 365 if (locks_in_grace() && !argp->reclaim) {
377 resp->status = nlm_lck_denied_grace_period; 366 resp->status = nlm_lck_denied_grace_period;
378 return rpc_success; 367 return rpc_success;
379 } 368 }
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
406 resp->cookie = argp->cookie; 395 resp->cookie = argp->cookie;
407 396
408 /* Don't accept requests during grace period */ 397 /* Don't accept requests during grace period */
409 if (nlmsvc_grace_period) { 398 if (locks_in_grace()) {
410 resp->status = nlm_lck_denied_grace_period; 399 resp->status = nlm_lck_denied_grace_period;
411 return rpc_success; 400 return rpc_success;
412 } 401 }
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
464{ 453{
465 struct sockaddr_in saddr; 454 struct sockaddr_in saddr;
466 455
467 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
468
469 dprintk("lockd: SM_NOTIFY called\n"); 456 dprintk("lockd: SM_NOTIFY called\n");
470 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 457
471 || ntohs(saddr.sin_port) >= 1024) { 458 if (!nlm_privileged_requester(rqstp)) {
472 char buf[RPC_MAX_ADDRBUFLEN]; 459 char buf[RPC_MAX_ADDRBUFLEN];
473 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 460 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
474 svc_print_addr(rqstp, buf, sizeof(buf))); 461 svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b373..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
418static int 418static int
419nlmsvc_match_ip(void *datap, struct nlm_host *host) 419nlmsvc_match_ip(void *datap, struct nlm_host *host)
420{ 420{
421 return nlm_cmp_addr(&host->h_saddr, datap); 421 return nlm_cmp_addr(nlm_srcaddr(host), datap);
422} 422}
423 423
424/** 424/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
351 argp->state = ntohl(*p++); 351 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 352 /* Preserve the address in network byte order */
353 argp->addr = *p++; 353 argp->addr = *p++;
354 argp->vers = *p++;
355 argp->proto = *p++;
356 return xdr_argsize_check(rqstp, p); 354 return xdr_argsize_check(rqstp, p);
357} 355}
358 356
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
358 argp->state = ntohl(*p++); 358 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 359 /* Preserve the address in network byte order */
360 argp->addr = *p++; 360 argp->addr = *p++;
361 argp->vers = *p++;
362 argp->proto = *p++;
363 return xdr_argsize_check(rqstp, p); 361 return xdr_argsize_check(rqstp, p);
364} 362}
365 363
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..6a09760c5960 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
105 mutex_lock(&nfs_callback_mutex); 105 mutex_lock(&nfs_callback_mutex);
106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
107 goto out; 107 goto out;
108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); 108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
109 AF_INET, NULL);
109 ret = -ENOMEM; 110 ret = -ENOMEM;
110 if (!serv) 111 if (!serv)
111 goto out_err; 112 goto out_err;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 46763d1cd397..8478fc25daee 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -127,7 +127,7 @@ enum {
127 Opt_err 127 Opt_err
128}; 128};
129 129
130static match_table_t __initdata tokens = { 130static match_table_t __initconst tokens = {
131 {Opt_port, "port=%u"}, 131 {Opt_port, "port=%u"},
132 {Opt_rsize, "rsize=%u"}, 132 {Opt_rsize, "rsize=%u"},
133 {Opt_wsize, "wsize=%u"}, 133 {Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f7..ffb697416cb1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -98,7 +98,7 @@ enum {
98 Opt_err 98 Opt_err
99}; 99};
100 100
101static match_table_t nfs_mount_option_tokens = { 101static const match_table_t nfs_mount_option_tokens = {
102 { Opt_userspace, "bg" }, 102 { Opt_userspace, "bg" },
103 { Opt_userspace, "fg" }, 103 { Opt_userspace, "fg" },
104 { Opt_userspace, "retry=%s" }, 104 { Opt_userspace, "retry=%s" },
@@ -163,7 +163,7 @@ enum {
163 Opt_xprt_err 163 Opt_xprt_err
164}; 164};
165 165
166static match_table_t nfs_xprt_protocol_tokens = { 166static const match_table_t nfs_xprt_protocol_tokens = {
167 { Opt_xprt_udp, "udp" }, 167 { Opt_xprt_udp, "udp" },
168 { Opt_xprt_tcp, "tcp" }, 168 { Opt_xprt_tcp, "tcp" },
169 { Opt_xprt_rdma, "rdma" }, 169 { Opt_xprt_rdma, "rdma" },
@@ -180,7 +180,7 @@ enum {
180 Opt_sec_err 180 Opt_sec_err
181}; 181};
182 182
183static match_table_t nfs_secflavor_tokens = { 183static const match_table_t nfs_secflavor_tokens = {
184 { Opt_sec_none, "none" }, 184 { Opt_sec_none, "none" },
185 { Opt_sec_none, "null" }, 185 { Opt_sec_none, "null" },
186 { Opt_sec_sys, "sys" }, 186 { Opt_sec_sys, "sys" },
@@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw,
1279 } 1279 }
1280 } 1280 }
1281 1281
1282 if (errors > 0) {
1283 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
1284 errors, (errors == 1 ? "" : "s"));
1285 if (!sloppy)
1286 return 0;
1287 }
1282 return 1; 1288 return 1;
1283 1289
1284out_nomem: 1290out_nomem:
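
The new error accounting gives "sloppy" its documented meaning: bad options are tolerated only when the user asked for that. In effect (sketch):

        /* errors counts invalid or unrecognized options seen while parsing */
        static int example_options_ok(int errors, int sloppy)
        {
                if (errors > 0 && !sloppy)
                        return 0;       /* reject the mount outright */
                return 1;               /* proceed, ignoring bad options */
        }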
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 33bfcf09db46..9dc036f18356 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1023,7 +1023,7 @@ exp_export(struct nfsctl_export *nxp)
1023 /* Look up the dentry */ 1023 /* Look up the dentry */
1024 err = path_lookup(nxp->ex_path, 0, &nd); 1024 err = path_lookup(nxp->ex_path, 0, &nd);
1025 if (err) 1025 if (err)
1026 goto out_unlock; 1026 goto out_put_clp;
1027 err = -EINVAL; 1027 err = -EINVAL;
1028 1028
1029 exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); 1029 exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL);
@@ -1090,9 +1090,9 @@ finish:
1090 exp_put(exp); 1090 exp_put(exp);
1091 if (fsid_key && !IS_ERR(fsid_key)) 1091 if (fsid_key && !IS_ERR(fsid_key))
1092 cache_put(&fsid_key->h, &svc_expkey_cache); 1092 cache_put(&fsid_key->h, &svc_expkey_cache);
1093 if (clp)
1094 auth_domain_put(clp);
1095 path_put(&nd.path); 1093 path_put(&nd.path);
1094out_put_clp:
1095 auth_domain_put(clp);
1096out_unlock: 1096out_unlock:
1097 exp_writeunlock(); 1097 exp_writeunlock();
1098out: 1098out:
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77c..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
70static struct nlmsvc_binding nfsd_nlm_ops = { 70static struct nlmsvc_binding nfsd_nlm_ops = {
71 .fopen = nlm_fopen, /* open file for locking */ 71 .fopen = nlm_fopen, /* open file for locking */
72 .fclose = nlm_fclose, /* close file */ 72 .fclose = nlm_fclose, /* close file */
73 .get_grace_period = get_nfs4_grace_period,
74}; 73};
75 74
76void 75void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cfc..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
63 SVCFH_fmt(&argp->fh)); 63 SVCFH_fmt(&argp->fh));
64 64
65 fh_copy(&resp->fh, &argp->fh); 65 fh_copy(&resp->fh, &argp->fh);
66 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 66 nfserr = fh_verify(rqstp, &resp->fh, 0,
67 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
67 if (nfserr) 68 if (nfserr)
68 RETURN_STATUS(nfserr); 69 RETURN_STATUS(nfserr);
69 70
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
530 dprintk("nfsd: FSSTAT(3) %s\n", 531 dprintk("nfsd: FSSTAT(3) %s\n",
531 SVCFH_fmt(&argp->fh)); 532 SVCFH_fmt(&argp->fh));
532 533
533 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 534 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
534 fh_put(&argp->fh); 535 fh_put(&argp->fh);
535 RETURN_STATUS(nfserr); 536 RETURN_STATUS(nfserr);
536} 537}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
558 resp->f_maxfilesize = ~(u32) 0; 559 resp->f_maxfilesize = ~(u32) 0;
559 resp->f_properties = NFS3_FSF_DEFAULT; 560 resp->f_properties = NFS3_FSF_DEFAULT;
560 561
561 nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); 562 nfserr = fh_verify(rqstp, &argp->fh, 0,
563 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
562 564
563 /* Check special features of the file system. May request 565 /* Check special features of the file system. May request
564 * different read/write sizes for file systems known to have 566 * different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab8..54b8b4140c8f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
443 * enough space for either: 443 * enough space for either:
444 */ 444 */
445 alloc = sizeof(struct posix_ace_state_array) 445 alloc = sizeof(struct posix_ace_state_array)
446 + cnt*sizeof(struct posix_ace_state); 446 + cnt*sizeof(struct posix_user_ace_state);
447 state->users = kzalloc(alloc, GFP_KERNEL); 447 state->users = kzalloc(alloc, GFP_KERNEL);
448 if (!state->users) 448 if (!state->users)
449 return -ENOMEM; 449 return -ENOMEM;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
225 225
226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
227 WRITE32(OP_CB_RECALL); 227 WRITE32(OP_CB_RECALL);
228 WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); 228 WRITE32(cb_rec->cbr_stateid.si_generation);
229 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
229 WRITE32(cb_rec->cbr_trunc); 230 WRITE32(cb_rec->cbr_trunc);
230 WRITE32(len); 231 WRITE32(len);
231 WRITEMEM(cb_rec->cbr_fhval, len); 232 WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
379 .addrsize = sizeof(addr), 380 .addrsize = sizeof(addr),
380 .timeout = &timeparms, 381 .timeout = &timeparms,
381 .program = &cb_program, 382 .program = &cb_program,
383 .prognumber = cb->cb_prog,
382 .version = nfs_cb_version[1]->number, 384 .version = nfs_cb_version[1]->number,
383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
396 addr.sin_port = htons(cb->cb_port); 398 addr.sin_port = htons(cb->cb_port);
397 addr.sin_addr.s_addr = htonl(cb->cb_addr); 399 addr.sin_addr.s_addr = htonl(cb->cb_addr);
398 400
399 /* Initialize rpc_stat */
400 memset(args.program->stats, 0, sizeof(struct rpc_stat));
401
402 /* Create RPC client */ 401 /* Create RPC client */
403 client = rpc_create(&args); 402 client = rpc_create(&args);
404 if (IS_ERR(client)) { 403 if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2e51adac65de..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
201 /* Openowner is now set, so sequence id will get bumped. Now we need 201 /* Openowner is now set, so sequence id will get bumped. Now we need
202 * these checks before we do any creates: */ 202 * these checks before we do any creates: */
203 status = nfserr_grace; 203 status = nfserr_grace;
204 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 204 if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
205 goto out; 205 goto out;
206 status = nfserr_no_grace; 206 status = nfserr_no_grace;
207 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 207 if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
208 goto out; 208 goto out;
209 209
210 switch (open->op_claim_type) { 210 switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
575{ 575{
576 __be32 status; 576 __be32 status;
577 577
578 if (nfs4_in_grace()) 578 if (locks_in_grace())
579 return nfserr_grace; 579 return nfserr_grace;
580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0, 580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
581 remove->rm_name, remove->rm_namelen); 581 remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
596 596
597 if (!cstate->save_fh.fh_dentry) 597 if (!cstate->save_fh.fh_dentry)
598 return status; 598 return status;
599 if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags 599 if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
600 & NFSEXP_NOSUBTREECHECK)) 600 & NFSEXP_NOSUBTREECHECK))
601 return nfserr_grace; 601 return nfserr_grace;
602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, 602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
867 int slack_bytes; 867 int slack_bytes;
868 __be32 status; 868 __be32 status;
869 869
870 status = nfserr_resource;
871 cstate = cstate_alloc();
872 if (cstate == NULL)
873 goto out;
874
875 resp->xbuf = &rqstp->rq_res; 870 resp->xbuf = &rqstp->rq_res;
876 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
877 resp->tagp = resp->p; 872 resp->tagp = resp->p;
@@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
890 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
891 goto out; 886 goto out;
892 887
888 status = nfserr_resource;
889 cstate = cstate_alloc();
890 if (cstate == NULL)
891 goto out;
892
893 status = nfs_ok; 893 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 894 while (!status && resp->opcnt < args->opcnt) {
895 op = &args->ops[resp->opcnt++]; 895 op = &args->ops[resp->opcnt++];
@@ -957,9 +957,9 @@ encode_op:
957 nfsd4_increment_op_stats(op->opnum); 957 nfsd4_increment_op_stats(op->opnum);
958 } 958 }
959 959
960 cstate_free(cstate);
960out: 961out:
961 nfsd4_release_compoundargs(args); 962 nfsd4_release_compoundargs(args);
962 cstate_free(cstate);
963 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 963 dprintk("nfsv4 compound returned %d\n", ntohl(status));
964 return status; 964 return status;
965} 965}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667e..0cc7ff5d5ab5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
61static time_t lease_time = 90; /* default lease time */ 61static time_t lease_time = 90; /* default lease time */
62static time_t user_lease_time = 90; 62static time_t user_lease_time = 90;
63static time_t boot_time; 63static time_t boot_time;
64static int in_grace = 1;
65static u32 current_ownerid = 1; 64static u32 current_ownerid = 1;
66static u32 current_fileid = 1; 65static u32 current_fileid = 1;
67static u32 current_delegid = 1; 66static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
1640 case NFS4_OPEN_CLAIM_NULL: 1639 case NFS4_OPEN_CLAIM_NULL:
1641 /* Let's not give out any delegations till everyone's 1640 /* Let's not give out any delegations till everyone's
1642 * had the chance to reclaim theirs.... */ 1641 * had the chance to reclaim theirs.... */
1643 if (nfs4_in_grace()) 1642 if (locks_in_grace())
1644 goto out; 1643 goto out;
1645 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) 1644 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
1646 goto out; 1645 goto out;
@@ -1816,12 +1815,15 @@ out:
1816 return status; 1815 return status;
1817} 1816}
1818 1817
1818struct lock_manager nfsd4_manager = {
1819};
1820
1819static void 1821static void
1820end_grace(void) 1822nfsd4_end_grace(void)
1821{ 1823{
1822 dprintk("NFSD: end of grace period\n"); 1824 dprintk("NFSD: end of grace period\n");
1823 nfsd4_recdir_purge_old(); 1825 nfsd4_recdir_purge_old();
1824 in_grace = 0; 1826 locks_end_grace(&nfsd4_manager);
1825} 1827}
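
struct lock_manager and locks_start_grace()/locks_end_grace() come from the generic file-locking code: each lock service registers its own manager, and locks_in_grace() reports true while any registered manager's grace period is still open. Minimal usage sketch (illustrative names):

        static struct lock_manager example_manager;

        static void example_begin_grace(void)
        {
                locks_start_grace(&example_manager); /* locks_in_grace() -> true */
        }

        static void example_finish_grace(void)
        {
                locks_end_grace(&example_manager);
        }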
1826 1828
1827static time_t 1829static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
1838 nfs4_lock_state(); 1840 nfs4_lock_state();
1839 1841
1840 dprintk("NFSD: laundromat service - starting\n"); 1842 dprintk("NFSD: laundromat service - starting\n");
1841 if (in_grace) 1843 if (locks_in_grace())
1842 end_grace(); 1844 nfsd4_end_grace();
1843 list_for_each_safe(pos, next, &client_lru) { 1845 list_for_each_safe(pos, next, &client_lru) {
1844 clp = list_entry(pos, struct nfs4_client, cl_lru); 1846 clp = list_entry(pos, struct nfs4_client, cl_lru);
1845 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 1847 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1974 return nfserr_bad_stateid; 1976 return nfserr_bad_stateid;
1975 else if (ONE_STATEID(stateid) && (flags & RD_STATE)) 1977 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1976 return nfs_ok; 1978 return nfs_ok;
1977 else if (nfs4_in_grace()) { 1979 else if (locks_in_grace()) {
1978 /* Answer in remaining cases depends on existence of 1980
1979 * conflicting state; so we must wait out the grace period. */ 1981 * conflicting state; so we must wait out the grace period. */
1980 return nfserr_grace; 1982 return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1993static inline int 1995static inline int
1994io_during_grace_disallowed(struct inode *inode, int flags) 1996io_during_grace_disallowed(struct inode *inode, int flags)
1995{ 1997{
1996 return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) 1998 return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
1997 && mandatory_lock(inode); 1999 && mandatory_lock(inode);
1998} 2000}
1999 2001
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2693 filp = lock_stp->st_vfs_file; 2695 filp = lock_stp->st_vfs_file;
2694 2696
2695 status = nfserr_grace; 2697 status = nfserr_grace;
2696 if (nfs4_in_grace() && !lock->lk_reclaim) 2698 if (locks_in_grace() && !lock->lk_reclaim)
2697 goto out; 2699 goto out;
2698 status = nfserr_no_grace; 2700 status = nfserr_no_grace;
2699 if (!nfs4_in_grace() && lock->lk_reclaim) 2701 if (!locks_in_grace() && lock->lk_reclaim)
2700 goto out; 2702 goto out;
2701 2703
2702 locks_init_lock(&file_lock); 2704 locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2779 int error; 2781 int error;
2780 __be32 status; 2782 __be32 status;
2781 2783
2782 if (nfs4_in_grace()) 2784 if (locks_in_grace())
2783 return nfserr_grace; 2785 return nfserr_grace;
2784 2786
2785 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 2787 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
3192 unsigned long grace_time; 3194 unsigned long grace_time;
3193 3195
3194 boot_time = get_seconds(); 3196 boot_time = get_seconds();
3195 grace_time = get_nfs_grace_period(); 3197 grace_time = get_nfs4_grace_period();
3196 lease_time = user_lease_time; 3198 lease_time = user_lease_time;
3197 in_grace = 1; 3199 locks_start_grace(&nfsd4_manager);
3198 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 3200 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
3199 grace_time/HZ); 3201 grace_time/HZ);
3200 laundry_wq = create_singlethread_workqueue("nfsd4"); 3202 laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
3213 return; 3215 return;
3214} 3216}
3215 3217
3216int
3217nfs4_in_grace(void)
3218{
3219 return in_grace;
3220}
3221
3222time_t 3218time_t
3223nfs4_lease_time(void) 3219nfs4_lease_time(void)
3224{ 3220{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b2859..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
413} 413}
414 414
415static __be32 415static __be32
416nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
417{
418 DECODE_HEAD;
419
420 READ_BUF(sizeof(stateid_t));
421 READ32(sid->si_generation);
422 COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
423
424 DECODE_TAIL;
425}
426
427static __be32
416nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) 428nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
417{ 429{
418 DECODE_HEAD; 430 DECODE_HEAD;
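
nfsd4_decode_stateid() fixes the generation-then-opaque split in one place; the per-op decoders converted below call it and then READ_BUF only the bytes that remain. A hypothetical new operation would follow the same shape (struct nfsd4_frob and fr_flags are made up for illustration; DECODE_HEAD/READ_BUF/READ32/DECODE_TAIL are this file's local XDR macros):

        static __be32
        nfsd4_decode_frob(struct nfsd4_compoundargs *argp,
                          struct nfsd4_frob *frob)
        {
                DECODE_HEAD;

                status = nfsd4_decode_stateid(argp, &frob->fr_stateid);
                if (status)
                        return status;
                READ_BUF(4);                    /* only the non-stateid bytes */
                READ32(frob->fr_flags);

                DECODE_TAIL;
        }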
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
429 DECODE_HEAD; 441 DECODE_HEAD;
430 442
431 close->cl_stateowner = NULL; 443 close->cl_stateowner = NULL;
432 READ_BUF(4 + sizeof(stateid_t)); 444 READ_BUF(4);
433 READ32(close->cl_seqid); 445 READ32(close->cl_seqid);
434 READ32(close->cl_stateid.si_generation); 446 return nfsd4_decode_stateid(argp, &close->cl_stateid);
435 COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
436 447
437 DECODE_TAIL; 448 DECODE_TAIL;
438} 449}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493static inline __be32 504static inline __be32
494nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) 505nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
495{ 506{
496 DECODE_HEAD; 507 return nfsd4_decode_stateid(argp, &dr->dr_stateid);
497
498 READ_BUF(sizeof(stateid_t));
499 READ32(dr->dr_stateid.si_generation);
500 COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
501
502 DECODE_TAIL;
503} 508}
504 509
505static inline __be32 510static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
542 READ32(lock->lk_is_new); 547 READ32(lock->lk_is_new);
543 548
544 if (lock->lk_is_new) { 549 if (lock->lk_is_new) {
545 READ_BUF(36); 550 READ_BUF(4);
546 READ32(lock->lk_new_open_seqid); 551 READ32(lock->lk_new_open_seqid);
547 READ32(lock->lk_new_open_stateid.si_generation); 552 status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
548 553 if (status)
549 COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); 554 return status;
555 READ_BUF(8 + sizeof(clientid_t));
550 READ32(lock->lk_new_lock_seqid); 556 READ32(lock->lk_new_lock_seqid);
551 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); 557 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
552 READ32(lock->lk_new_owner.len); 558 READ32(lock->lk_new_owner.len);
553 READ_BUF(lock->lk_new_owner.len); 559 READ_BUF(lock->lk_new_owner.len);
554 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); 560 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
555 } else { 561 } else {
556 READ_BUF(20); 562 status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
557 READ32(lock->lk_old_lock_stateid.si_generation); 563 if (status)
558 COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); 564 return status;
565 READ_BUF(4);
559 READ32(lock->lk_old_lock_seqid); 566 READ32(lock->lk_old_lock_seqid);
560 } 567 }
561 568
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
587 DECODE_HEAD; 594 DECODE_HEAD;
588 595
589 locku->lu_stateowner = NULL; 596 locku->lu_stateowner = NULL;
590 READ_BUF(24 + sizeof(stateid_t)); 597 READ_BUF(8);
591 READ32(locku->lu_type); 598 READ32(locku->lu_type);
592 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) 599 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
593 goto xdr_error; 600 goto xdr_error;
594 READ32(locku->lu_seqid); 601 READ32(locku->lu_seqid);
595 READ32(locku->lu_stateid.si_generation); 602 status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
596 COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); 603 if (status)
604 return status;
605 READ_BUF(16);
597 READ64(locku->lu_offset); 606 READ64(locku->lu_offset);
598 READ64(locku->lu_length); 607 READ64(locku->lu_length);
599 608
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
678 READ32(open->op_delegate_type); 687 READ32(open->op_delegate_type);
679 break; 688 break;
680 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 689 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
681 READ_BUF(sizeof(stateid_t) + 4); 690 status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
682 COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 691 if (status)
692 return status;
693 READ_BUF(4);
683 READ32(open->op_fname.len); 694 READ32(open->op_fname.len);
684 READ_BUF(open->op_fname.len); 695 READ_BUF(open->op_fname.len);
685 SAVEMEM(open->op_fname.data, open->op_fname.len); 696 SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
699 DECODE_HEAD; 710 DECODE_HEAD;
700 711
701 open_conf->oc_stateowner = NULL; 712 open_conf->oc_stateowner = NULL;
702 READ_BUF(4 + sizeof(stateid_t)); 713 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
703 READ32(open_conf->oc_req_stateid.si_generation); 714 if (status)
704 COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); 715 return status;
716 READ_BUF(4);
705 READ32(open_conf->oc_seqid); 717 READ32(open_conf->oc_seqid);
706 718
707 DECODE_TAIL; 719 DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
713 DECODE_HEAD; 725 DECODE_HEAD;
714 726
715 open_down->od_stateowner = NULL; 727 open_down->od_stateowner = NULL;
716 READ_BUF(12 + sizeof(stateid_t)); 728 status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
717 READ32(open_down->od_stateid.si_generation); 729 if (status)
718 COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); 730 return status;
731 READ_BUF(12);
719 READ32(open_down->od_seqid); 732 READ32(open_down->od_seqid);
720 READ32(open_down->od_share_access); 733 READ32(open_down->od_share_access);
721 READ32(open_down->od_share_deny); 734 READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
743{ 756{
744 DECODE_HEAD; 757 DECODE_HEAD;
745 758
746 READ_BUF(sizeof(stateid_t) + 12); 759 status = nfsd4_decode_stateid(argp, &read->rd_stateid);
747 READ32(read->rd_stateid.si_generation); 760 if (status)
748 COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); 761 return status;
762 READ_BUF(12);
749 READ64(read->rd_offset); 763 READ64(read->rd_offset);
750 READ32(read->rd_length); 764 READ32(read->rd_length);
751 765
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
834static __be32 848static __be32
835nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 849nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
836{ 850{
837 DECODE_HEAD; 851 __be32 status;
838
839 READ_BUF(sizeof(stateid_t));
840 READ32(setattr->sa_stateid.si_generation);
841 COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
842 if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
843 goto out;
844 852
845 DECODE_TAIL; 853 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
854 if (status)
855 return status;
856 return nfsd4_decode_fattr(argp, setattr->sa_bmval,
857 &setattr->sa_iattr, &setattr->sa_acl);
846} 858}
847 859
848static __be32 860static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
927 int len; 939 int len;
928 DECODE_HEAD; 940 DECODE_HEAD;
929 941
930 READ_BUF(sizeof(stateid_opaque_t) + 20); 942 status = nfsd4_decode_stateid(argp, &write->wr_stateid);
931 READ32(write->wr_stateid.si_generation); 943 if (status)
932 COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); 944 return status;
945 READ_BUF(16);
933 READ64(write->wr_offset); 946 READ64(write->wr_offset);
934 READ32(write->wr_stable_how); 947 READ32(write->wr_stable_how);
935 if (write->wr_stable_how > 2) 948 if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1183 * Header routine to setup seqid operation replay cache 1196 * Header routine to setup seqid operation replay cache
1184 */ 1197 */
1185#define ENCODE_SEQID_OP_HEAD \ 1198#define ENCODE_SEQID_OP_HEAD \
1186 __be32 *p; \
1187 __be32 *save; \ 1199 __be32 *save; \
1188 \ 1200 \
1189 save = resp->p; 1201 save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
1950 return -EINVAL; 1962 return -EINVAL;
1951} 1963}
1952 1964
1965static void
1966nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
1967{
1968 ENCODE_HEAD;
1969
1970 RESERVE_SPACE(sizeof(stateid_t));
1971 WRITE32(sid->si_generation);
1972 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
1973 ADJUST_ARGS();
1974}
1975
1953static __be32 1976static __be32
1954nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) 1977nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
1955{ 1978{
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
1969{ 1992{
1970 ENCODE_SEQID_OP_HEAD; 1993 ENCODE_SEQID_OP_HEAD;
1971 1994
1972 if (!nfserr) { 1995 if (!nfserr)
1973 RESERVE_SPACE(sizeof(stateid_t)); 1996 nfsd4_encode_stateid(resp, &close->cl_stateid);
1974 WRITE32(close->cl_stateid.si_generation); 1997
1975 WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
1976 ADJUST_ARGS();
1977 }
1978 ENCODE_SEQID_OP_TAIL(close->cl_stateowner); 1998 ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
1979 return nfserr; 1999 return nfserr;
1980} 2000}
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
2074{ 2094{
2075 ENCODE_SEQID_OP_HEAD; 2095 ENCODE_SEQID_OP_HEAD;
2076 2096
2077 if (!nfserr) { 2097 if (!nfserr)
2078 RESERVE_SPACE(4 + sizeof(stateid_t)); 2098 nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
2079 WRITE32(lock->lk_resp_stateid.si_generation); 2099 else if (nfserr == nfserr_denied)
2080 WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2081 ADJUST_ARGS();
2082 } else if (nfserr == nfserr_denied)
2083 nfsd4_encode_lock_denied(resp, &lock->lk_denied); 2100 nfsd4_encode_lock_denied(resp, &lock->lk_denied);
2084 2101
2085 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); 2102 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2099{ 2116{
2100 ENCODE_SEQID_OP_HEAD; 2117 ENCODE_SEQID_OP_HEAD;
2101 2118
2102 if (!nfserr) { 2119 if (!nfserr)
2103 RESERVE_SPACE(sizeof(stateid_t)); 2120 nfsd4_encode_stateid(resp, &locku->lu_stateid);
2104 WRITE32(locku->lu_stateid.si_generation); 2121
2105 WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
2106 ADJUST_ARGS();
2107 }
2108
2109 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); 2122 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
2110 return nfserr; 2123 return nfserr;
2111} 2124}
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
2128static __be32 2141static __be32
2129nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 2142nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
2130{ 2143{
2144 ENCODE_HEAD;
2131 ENCODE_SEQID_OP_HEAD; 2145 ENCODE_SEQID_OP_HEAD;
2132 2146
2133 if (nfserr) 2147 if (nfserr)
2134 goto out; 2148 goto out;
2135 2149
2136 RESERVE_SPACE(36 + sizeof(stateid_t)); 2150 nfsd4_encode_stateid(resp, &open->op_stateid);
2137 WRITE32(open->op_stateid.si_generation); 2151 RESERVE_SPACE(40);
2138 WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
2139 WRITECINFO(open->op_cinfo); 2152 WRITECINFO(open->op_cinfo);
2140 WRITE32(open->op_rflags); 2153 WRITE32(open->op_rflags);
2141 WRITE32(2); 2154 WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2148 case NFS4_OPEN_DELEGATE_NONE: 2161 case NFS4_OPEN_DELEGATE_NONE:
2149 break; 2162 break;
2150 case NFS4_OPEN_DELEGATE_READ: 2163 case NFS4_OPEN_DELEGATE_READ:
2151 RESERVE_SPACE(20 + sizeof(stateid_t)); 2164 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2152 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2165 RESERVE_SPACE(20);
2153 WRITE32(open->op_recall); 2166 WRITE32(open->op_recall);
2154 2167
2155 /* 2168 /*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2162 ADJUST_ARGS(); 2175 ADJUST_ARGS();
2163 break; 2176 break;
2164 case NFS4_OPEN_DELEGATE_WRITE: 2177 case NFS4_OPEN_DELEGATE_WRITE:
2165 RESERVE_SPACE(32 + sizeof(stateid_t)); 2178 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2166 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2179 RESERVE_SPACE(32);
2167 WRITE32(0); 2180 WRITE32(0);
2168 2181
2169 /* 2182 /*
@@ -2195,13 +2208,9 @@ static __be32
2195nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) 2208nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
2196{ 2209{
2197 ENCODE_SEQID_OP_HEAD; 2210 ENCODE_SEQID_OP_HEAD;
2198 2211
2199 if (!nfserr) { 2212 if (!nfserr)
2200 RESERVE_SPACE(sizeof(stateid_t)); 2213 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
2201 WRITE32(oc->oc_resp_stateid.si_generation);
2202 WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2203 ADJUST_ARGS();
2204 }
2205 2214
2206 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); 2215 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
2207 return nfserr; 2216 return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
2211nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) 2220nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
2212{ 2221{
2213 ENCODE_SEQID_OP_HEAD; 2222 ENCODE_SEQID_OP_HEAD;
2214 2223
2215 if (!nfserr) { 2224 if (!nfserr)
2216 RESERVE_SPACE(sizeof(stateid_t)); 2225 nfsd4_encode_stateid(resp, &od->od_stateid);
2217 WRITE32(od->od_stateid.si_generation);
2218 WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
2219 ADJUST_ARGS();
2220 }
2221 2226
2222 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2227 ENCODE_SEQID_OP_TAIL(od->od_stateowner);
2223 return nfserr; 2228 return nfserr;
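The new nfsd4_decode_stateid() helper itself falls outside these hunks; judging from its call sites and the open-coded sequence it replaces (see the removed lines in nfsd4_decode_setattr above), it presumably wraps the same three steps:

    static __be32
    nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
    {
    	DECODE_HEAD;

    	READ_BUF(sizeof(stateid_t));
    	READ32(sid->si_generation);
    	COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));

    	DECODE_TAIL;
    }

With decode and encode both funneled through one helper apiece, the open-coded READ_BUF/RESERVE_SPACE size arithmetic disappears from a dozen operations.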
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a2..97543df58242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
614 return -EINVAL; 614 return -EINVAL;
615 err = nfsd_create_serv(); 615 err = nfsd_create_serv();
616 if (!err) { 616 if (!err) {
617 int proto = 0; 617 err = svc_addsock(nfsd_serv, fd, buf);
618 err = svc_addsock(nfsd_serv, fd, buf, &proto);
619 if (err >= 0) { 618 if (err >= 0) {
620 err = lockd_up(proto); 619 err = lockd_up();
621 if (err < 0) 620 if (err < 0)
622 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); 621 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
623 } 622 }
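The interface change in this hunk, with the prototypes sketched from the call sites (assumed, not quoted from headers): svc_addsock() no longer reports the transport protocol back through an out-parameter, because lockd_up() no longer needs one; lockd now brings up its listeners for both protocols rather than per-protocol:

    int svc_addsock(struct svc_serv *serv, int fd, char *name_return);  /* was: ..., int *proto */
    int lockd_up(void);                                                 /* was: int lockd_up(int proto) */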
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f0445..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
302 if (error) 302 if (error)
303 goto out; 303 goto out;
304 304
305 if (!(access & NFSD_MAY_LOCK)) { 305 /*
306 /* 306 * pseudoflavor restrictions are not enforced on NLM,
307 * pseudoflavor restrictions are not enforced on NLM, 307 * which clients virtually always use auth_sys for,
308 * which clients virtually always use auth_sys for, 308 * even while using RPCSEC_GSS for NFS.
309 * even while using RPCSEC_GSS for NFS. 309 */
310 */ 310 if (access & NFSD_MAY_LOCK)
311 error = check_nfsd_access(exp, rqstp); 311 goto skip_pseudoflavor_check;
312 if (error) 312 /*
313 goto out; 313 * Clients may expect to be able to use auth_sys during mount,
314 } 314 * even if they use gss for everything else; see section 2.3.2
315 * of rfc 2623.
316 */
317 if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
318 && exp->ex_path.dentry == dentry)
319 goto skip_pseudoflavor_check;
320
321 error = check_nfsd_access(exp, rqstp);
322 if (error)
323 goto out;
315 324
325skip_pseudoflavor_check:
316 /* Finally, check access permissions. */ 326 /* Finally, check access permissions. */
317 error = nfsd_permission(rqstp, exp, dentry, access); 327 error = nfsd_permission(rqstp, exp, dentry, access);
318 328
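Flattened, the new fh_verify() ordering reads roughly like this (a condensed sketch of the goto ladder above, not the literal source):

    /* 1. NLM requests skip the pseudoflavor check entirely.
     * 2. Requests on the export root may skip it when the caller passed
     *    NFSD_MAY_BYPASS_GSS_ON_ROOT (mounts may use auth_sys even on
     *    gss-protected exports; rfc 2623, section 2.3.2).
     * 3. Everything else goes through check_nfsd_access(). */
    if (!(access & NFSD_MAY_LOCK) &&
        !((access & NFSD_MAY_BYPASS_GSS_ON_ROOT) &&
          exp->ex_path.dentry == dentry)) {
    	error = check_nfsd_access(exp, rqstp);
    	if (error)
    		goto out;
    }

The nfsproc.c hunks that follow are the callers that actually set the new bypass flag: GETATTR and STATFS, the two operations a client issues while mounting.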
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236a..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); 65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
66 66
67 fh_copy(&resp->fh, &argp->fh); 67 fh_copy(&resp->fh, &argp->fh);
68 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 68 nfserr = fh_verify(rqstp, &resp->fh, 0,
69 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
69 return nfsd_return_attrs(nfserr, resp); 70 return nfsd_return_attrs(nfserr, resp);
70} 71}
71 72
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
521 522
522 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); 523 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
523 524
524 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 525 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
526 NFSD_MAY_BYPASS_GSS_ON_ROOT);
525 fh_put(&argp->fh); 527 fh_put(&argp->fh);
526 return nfserr; 528 return nfserr;
527} 529}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e924..59eeb46f82c5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
232 nfsd_last_thread, nfsd, THIS_MODULE); 233 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 234 if (nfsd_serv == NULL)
234 err = -ENOMEM; 235 err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
243 if (!list_empty(&nfsd_serv->sv_permsocks)) 244 if (!list_empty(&nfsd_serv->sv_permsocks))
244 return 0; 245 return 0;
245 246
246 error = lockd_up(IPPROTO_UDP); 247 error = svc_create_xprt(nfsd_serv, "udp", port,
247 if (error >= 0) {
248 error = svc_create_xprt(nfsd_serv, "udp", port,
249 SVC_SOCK_DEFAULTS); 248 SVC_SOCK_DEFAULTS);
250 if (error < 0)
251 lockd_down();
252 }
253 if (error < 0) 249 if (error < 0)
254 return error; 250 return error;
255 251
256 error = lockd_up(IPPROTO_TCP); 252 error = svc_create_xprt(nfsd_serv, "tcp", port,
257 if (error >= 0) {
258 error = svc_create_xprt(nfsd_serv, "tcp", port,
259 SVC_SOCK_DEFAULTS); 253 SVC_SOCK_DEFAULTS);
260 if (error < 0)
261 lockd_down();
262 }
263 if (error < 0) 254 if (error < 0)
264 return error; 255 return error;
256
257 error = lockd_up();
258 if (error < 0)
259 return error;
260
265 return 0; 261 return 0;
266} 262}
267 263
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed5267..aa1d0d6489a1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
83 spinlock_t pb_lock; 83 spinlock_t pb_lock;
84} ____cacheline_aligned_in_smp; 84} ____cacheline_aligned_in_smp;
85 85
86static struct raparms * raparml;
87#define RAPARM_HASH_BITS 4 86#define RAPARM_HASH_BITS 4
88#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 87#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 88#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
1866 * N.B. After this call fhp needs an fh_put 1865 * N.B. After this call fhp needs an fh_put
1867 */ 1866 */
1868__be32 1867__be32
1869nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1868nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
1870{ 1869{
1871 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); 1870 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
1872 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1871 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1873 err = nfserr_io; 1872 err = nfserr_io;
1874 return err; 1873 return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1966void 1965void
1967nfsd_racache_shutdown(void) 1966nfsd_racache_shutdown(void)
1968{ 1967{
1969 if (!raparml) 1968 struct raparms *raparm, *last_raparm;
1970 return; 1969 unsigned int i;
1970
1971 dprintk("nfsd: freeing readahead buffers.\n"); 1971 dprintk("nfsd: freeing readahead buffers.\n");
1972 kfree(raparml); 1972
1973 raparml = NULL; 1973 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
1974 raparm = raparm_hash[i].pb_head;
1975 while(raparm) {
1976 last_raparm = raparm;
1977 raparm = raparm->p_next;
1978 kfree(last_raparm);
1979 }
1980 raparm_hash[i].pb_head = NULL;
1981 }
1974} 1982}
1975/* 1983/*
1976 * Initialize readahead param cache 1984 * Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
1981 int i; 1989 int i;
1982 int j = 0; 1990 int j = 0;
1983 int nperbucket; 1991 int nperbucket;
1992 struct raparms **raparm = NULL;
1984 1993
1985 1994
1986 if (raparml) 1995 if (raparm_hash[0].pb_head)
1987 return 0; 1996 return 0;
1988 if (cache_size < 2*RAPARM_HASH_SIZE) 1997 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
1989 cache_size = 2*RAPARM_HASH_SIZE; 1998 if (nperbucket < 2)
1990 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); 1999 nperbucket = 2;
1991 2000 cache_size = nperbucket * RAPARM_HASH_SIZE;
1992 if (!raparml) {
1993 printk(KERN_WARNING
1994 "nfsd: Could not allocate memory read-ahead cache.\n");
1995 return -ENOMEM;
1996 }
1997 2001
1998 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 2002 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1999 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { 2003
2000 raparm_hash[i].pb_head = NULL; 2004 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
2001 spin_lock_init(&raparm_hash[i].pb_lock); 2005 spin_lock_init(&raparm_hash[i].pb_lock);
2002 } 2006
2003 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); 2007 raparm = &raparm_hash[i].pb_head;
2004 for (i = 0; i < cache_size - 1; i++) { 2008 for (j = 0; j < nperbucket; j++) {
2005 if (i % nperbucket == 0) 2009 *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
2006 raparm_hash[j++].pb_head = raparml + i; 2010 if (!*raparm)
2007 if (i % nperbucket < nperbucket-1) 2011 goto out_nomem;
2008 raparml[i].p_next = raparml + i + 1; 2012 raparm = &(*raparm)->p_next;
2013 }
2014 *raparm = NULL;
2009 } 2015 }
2010 2016
2011 nfsdstats.ra_size = cache_size; 2017 nfsdstats.ra_size = cache_size;
2012 return 0; 2018 return 0;
2019
2020out_nomem:
2021 	dprintk("nfsd: kzalloc failed, freeing readahead buffers\n");
2022 nfsd_racache_shutdown();
2023 return -ENOMEM;
2013} 2024}
2014 2025
2015#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 2026#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
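Two details of the readahead-cache rewrite are worth spelling out. First, the sizing arithmetic: with RAPARM_HASH_SIZE at 16 (RAPARM_HASH_BITS = 4), a requested cache_size of 50 gives nperbucket = DIV_ROUND_UP(50, 16) = 4, so the effective cache_size is rounded up to 64; the old 2*RAPARM_HASH_SIZE floor survives as the nperbucket < 2 clamp. Second, each bucket list is built through a pointer-to-pointer tail cursor, which is what lets out_nomem hand a half-built table straight to nfsd_racache_shutdown(). A userspace sketch of the same idiom, with a hypothetical node type standing in for struct raparms:

    #include <stdlib.h>

    struct node { struct node *next; };

    static struct node *build_list(int n)
    {
    	struct node *head = NULL, **tail = &head;

    	for (int i = 0; i < n; i++) {
    		*tail = calloc(1, sizeof(**tail));  /* kzalloc() in the kernel code */
    		if (!*tail)
    			return head;                /* partial list; caller frees it */
    		tail = &(*tail)->next;              /* advance cursor to the new tail */
    	}
    	return head;
    }

Because the cursor always points at the link to fill next, there is no special case for the empty list and no trailing-node bookkeeping.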
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e1781c8b1650..9e8a95be7a1e 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
174 // TODO: Consider moving this lot to a separate function! (AIA) 174 // TODO: Consider moving this lot to a separate function! (AIA)
175handle_name: 175handle_name:
176 { 176 {
177 struct dentry *real_dent, *new_dent;
178 MFT_RECORD *m; 177 MFT_RECORD *m;
179 ntfs_attr_search_ctx *ctx; 178 ntfs_attr_search_ctx *ctx;
180 ntfs_inode *ni = NTFS_I(dent_inode); 179 ntfs_inode *ni = NTFS_I(dent_inode);
@@ -255,93 +254,9 @@ handle_name:
255 } 254 }
256 nls_name.hash = full_name_hash(nls_name.name, nls_name.len); 255 nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
257 256
258 /* 257 dent = d_add_ci(dent, dent_inode, &nls_name);
259 * Note: No need for dent->d_lock lock as i_mutex is held on the
260 * parent inode.
261 */
262
263 /* Does a dentry matching the nls_name exist already? */
264 real_dent = d_lookup(dent->d_parent, &nls_name);
265 /* If not, create it now. */
266 if (!real_dent) {
267 real_dent = d_alloc(dent->d_parent, &nls_name);
268 kfree(nls_name.name);
269 if (!real_dent) {
270 err = -ENOMEM;
271 goto err_out;
272 }
273 new_dent = d_splice_alias(dent_inode, real_dent);
274 if (new_dent)
275 dput(real_dent);
276 else
277 new_dent = real_dent;
278 ntfs_debug("Done. (Created new dentry.)");
279 return new_dent;
280 }
281 kfree(nls_name.name); 258 kfree(nls_name.name);
282 /* Matching dentry exists, check if it is negative. */ 259 return dent;
283 if (real_dent->d_inode) {
284 if (unlikely(real_dent->d_inode != dent_inode)) {
285 /* This can happen because bad inodes are unhashed. */
286 BUG_ON(!is_bad_inode(dent_inode));
287 BUG_ON(!is_bad_inode(real_dent->d_inode));
288 }
289 /*
290 * Already have the inode and the dentry attached, decrement
291 * the reference count to balance the ntfs_iget() we did
292 * earlier on. We found the dentry using d_lookup() so it
293 * cannot be disconnected and thus we do not need to worry
294 * about any NFS/disconnectedness issues here.
295 */
296 iput(dent_inode);
297 ntfs_debug("Done. (Already had inode and dentry.)");
298 return real_dent;
299 }
300 /*
301 * Negative dentry: instantiate it unless the inode is a directory and
302 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
303 * in which case d_move() that in place of the found dentry.
304 */
305 if (!S_ISDIR(dent_inode->i_mode)) {
306 /* Not a directory; everything is easy. */
307 d_instantiate(real_dent, dent_inode);
308 ntfs_debug("Done. (Already had negative file dentry.)");
309 return real_dent;
310 }
311 spin_lock(&dcache_lock);
312 if (list_empty(&dent_inode->i_dentry)) {
313 /*
314 * Directory without a 'disconnected' dentry; we need to do
315 * d_instantiate() by hand because it takes dcache_lock which
316 * we already hold.
317 */
318 list_add(&real_dent->d_alias, &dent_inode->i_dentry);
319 real_dent->d_inode = dent_inode;
320 spin_unlock(&dcache_lock);
321 security_d_instantiate(real_dent, dent_inode);
322 ntfs_debug("Done. (Already had negative directory dentry.)");
323 return real_dent;
324 }
325 /*
326 * Directory with a 'disconnected' dentry; get a reference to the
327 * 'disconnected' dentry.
328 */
329 new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
330 d_alias);
331 dget_locked(new_dent);
332 spin_unlock(&dcache_lock);
333 /* Do security voodoo. */
334 security_d_instantiate(real_dent, dent_inode);
335 /* Move new_dent in place of real_dent. */
336 d_move(new_dent, real_dent);
337 /* Balance the ntfs_iget() we did above. */
338 iput(dent_inode);
339 /* Throw away real_dent. */
340 dput(real_dent);
341 /* Use new_dent as the actual dentry. */
342 ntfs_debug("Done. (Already had negative, disconnected directory "
343 "dentry.)");
344 return new_dent;
345 260
346eio_err_out: 261eio_err_out:
347 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); 262 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
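The roughly ninety lines removed above are the open-coded version of what d_add_ci() now does generically: look up an existing dentry for the case-corrected name, allocate one if absent, splice the alias, and handle the disconnected-directory d_move() case. The caller contract, inferred from the call site rather than from the d_add_ci() definition itself:

    nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
    dent = d_add_ci(dent, dent_inode, &nls_name); /* takes over the inode reference;
                                                   * returns the dentry to use, or
                                                   * an ERR_PTR() on failure */
    kfree(nls_name.name);                         /* the name is copied into the
                                                   * dcache, so the caller's buffer
                                                   * can be freed immediately */
    return dent;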
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the 113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the
114 * file since it was last opened. I think the names speak for themselves but 114 * file since it was last opened. I think the names speak for themselves but
115 * if you disagree check out the descriptions in the Linux NTFS project NTFS 115 * if you disagree check out the descriptions in the Linux NTFS project NTFS
116 * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
145 * Source info flags (32-bit). Information about the source of the change(s) 145 * Source info flags (32-bit). Information about the source of the change(s)
146 * to the file. For detailed descriptions of what these mean, see the Linux 146 * to the file. For detailed descriptions of what these mean, see the Linux
147 * NTFS project NTFS documentation: 147 * NTFS project NTFS documentation:
148 * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fdb..589dcdfdfe3c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
34 symlink.o \ 34 symlink.o \
35 sysfile.o \ 35 sysfile.o \
36 uptodate.o \ 36 uptodate.o \
37 ver.o 37 ver.o \
38 xattr.o
38 39
39ocfs2_stackglue-objs := stackglue.o 40ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o 41ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..0cc2deb9394c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
49 49
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52
53/*
54 * Operations for a specific extent tree type.
55 *
56 * To implement an on-disk btree (extent tree) type in ocfs2, add
57 * an ocfs2_extent_tree_operations structure and the matching
58 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
59 * for the allocation portion of the extent tree.
60 */
61struct ocfs2_extent_tree_operations {
62 /*
63 * last_eb_blk is the block number of the right most leaf extent
64 * block. Most on-disk structures containing an extent tree store
65 * this value for fast access. The ->eo_set_last_eb_blk() and
66 * ->eo_get_last_eb_blk() operations access this value. They are
67 * both required.
68 */
69 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
70 u64 blkno);
71 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
72
73 /*
74 * The on-disk structure usually keeps track of how many total
75 * clusters are stored in this extent tree. This function updates
76 * that value. new_clusters is the delta, and must be
77 * added to the total. Required.
78 */
79 void (*eo_update_clusters)(struct inode *inode,
80 struct ocfs2_extent_tree *et,
81 u32 new_clusters);
82
83 /*
84 * If ->eo_insert_check() exists, it is called before rec is
85 * inserted into the extent tree. It is optional.
86 */
87 int (*eo_insert_check)(struct inode *inode,
88 struct ocfs2_extent_tree *et,
89 struct ocfs2_extent_rec *rec);
90 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
91
92 /*
93 * --------------------------------------------------------------
94 * The remaining are internal to ocfs2_extent_tree and don't have
95 * accessor functions
96 */
97
98 /*
99 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
100 * It is required.
101 */
102 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
103
104 /*
105 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
106 * it exists. If it does not, et->et_max_leaf_clusters is set
107 * to 0 (unlimited). Optional.
108 */
109 void (*eo_fill_max_leaf_clusters)(struct inode *inode,
110 struct ocfs2_extent_tree *et);
111};
112
113
114/*
115 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
116 * in the methods.
117 */
118static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
119static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
120 u64 blkno);
121static void ocfs2_dinode_update_clusters(struct inode *inode,
122 struct ocfs2_extent_tree *et,
123 u32 clusters);
124static int ocfs2_dinode_insert_check(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 struct ocfs2_extent_rec *rec);
127static int ocfs2_dinode_sanity_check(struct inode *inode,
128 struct ocfs2_extent_tree *et);
129static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
130static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
131 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
132 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
133 .eo_update_clusters = ocfs2_dinode_update_clusters,
134 .eo_insert_check = ocfs2_dinode_insert_check,
135 .eo_sanity_check = ocfs2_dinode_sanity_check,
136 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
137};
138
139static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
140 u64 blkno)
141{
142 struct ocfs2_dinode *di = et->et_object;
143
144 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
145 di->i_last_eb_blk = cpu_to_le64(blkno);
146}
147
148static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
149{
150 struct ocfs2_dinode *di = et->et_object;
151
152 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153 return le64_to_cpu(di->i_last_eb_blk);
154}
155
156static void ocfs2_dinode_update_clusters(struct inode *inode,
157 struct ocfs2_extent_tree *et,
158 u32 clusters)
159{
160 struct ocfs2_dinode *di = et->et_object;
161
162 le32_add_cpu(&di->i_clusters, clusters);
163 spin_lock(&OCFS2_I(inode)->ip_lock);
164 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
165 spin_unlock(&OCFS2_I(inode)->ip_lock);
166}
167
168static int ocfs2_dinode_insert_check(struct inode *inode,
169 struct ocfs2_extent_tree *et,
170 struct ocfs2_extent_rec *rec)
171{
172 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
173
174 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
175 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
176 (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
177 "Device %s, asking for sparse allocation: inode %llu, "
178 "cpos %u, clusters %u\n",
179 osb->dev_str,
180 (unsigned long long)OCFS2_I(inode)->ip_blkno,
181 rec->e_cpos,
182 OCFS2_I(inode)->ip_clusters);
183
184 return 0;
185}
186
187static int ocfs2_dinode_sanity_check(struct inode *inode,
188 struct ocfs2_extent_tree *et)
189{
190 int ret = 0;
191 struct ocfs2_dinode *di;
192
193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
194
195 di = et->et_object;
196 if (!OCFS2_IS_VALID_DINODE(di)) {
197 ret = -EIO;
198 ocfs2_error(inode->i_sb,
199 "Inode %llu has invalid path root",
200 (unsigned long long)OCFS2_I(inode)->ip_blkno);
201 }
202
203 return ret;
204}
205
206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
207{
208 struct ocfs2_dinode *di = et->et_object;
209
210 et->et_root_el = &di->id2.i_list;
211}
212
213
214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
215{
216 struct ocfs2_xattr_value_root *xv = et->et_object;
217
218 et->et_root_el = &xv->xr_list;
219}
220
221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
222 u64 blkno)
223{
224 struct ocfs2_xattr_value_root *xv =
225 (struct ocfs2_xattr_value_root *)et->et_object;
226
227 xv->xr_last_eb_blk = cpu_to_le64(blkno);
228}
229
230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
231{
232 struct ocfs2_xattr_value_root *xv =
233 (struct ocfs2_xattr_value_root *) et->et_object;
234
235 return le64_to_cpu(xv->xr_last_eb_blk);
236}
237
238static void ocfs2_xattr_value_update_clusters(struct inode *inode,
239 struct ocfs2_extent_tree *et,
240 u32 clusters)
241{
242 struct ocfs2_xattr_value_root *xv =
243 (struct ocfs2_xattr_value_root *)et->et_object;
244
245 le32_add_cpu(&xv->xr_clusters, clusters);
246}
247
248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
249 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
250 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
251 .eo_update_clusters = ocfs2_xattr_value_update_clusters,
252 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
253};
254
255static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
256{
257 struct ocfs2_xattr_block *xb = et->et_object;
258
259 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
260}
261
262static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
263 struct ocfs2_extent_tree *et)
264{
265 et->et_max_leaf_clusters =
266 ocfs2_clusters_for_bytes(inode->i_sb,
267 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
268}
269
270static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
271 u64 blkno)
272{
273 struct ocfs2_xattr_block *xb = et->et_object;
274 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
275
276 xt->xt_last_eb_blk = cpu_to_le64(blkno);
277}
278
279static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
280{
281 struct ocfs2_xattr_block *xb = et->et_object;
282 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
283
284 return le64_to_cpu(xt->xt_last_eb_blk);
285}
286
287static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
288 struct ocfs2_extent_tree *et,
289 u32 clusters)
290{
291 struct ocfs2_xattr_block *xb = et->et_object;
292
293 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
294}
295
296static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
297 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
298 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
299 .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
300 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
301 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
302};
303
304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
305 struct inode *inode,
306 struct buffer_head *bh,
307 void *obj,
308 struct ocfs2_extent_tree_operations *ops)
309{
310 et->et_ops = ops;
311 et->et_root_bh = bh;
312 if (!obj)
313 obj = (void *)bh->b_data;
314 et->et_object = obj;
315
316 et->et_ops->eo_fill_root_el(et);
317 if (!et->et_ops->eo_fill_max_leaf_clusters)
318 et->et_max_leaf_clusters = 0;
319 else
320 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
321}
322
323void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
324 struct inode *inode,
325 struct buffer_head *bh)
326{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
328}
329
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode,
332 struct buffer_head *bh)
333{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL,
335 &ocfs2_xattr_tree_et_ops);
336}
337
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode,
340 struct buffer_head *bh,
341 struct ocfs2_xattr_value_root *xv)
342{
343 __ocfs2_init_extent_tree(et, inode, bh, xv,
344 &ocfs2_xattr_value_et_ops);
345}
346
347static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
348 u64 new_last_eb_blk)
349{
350 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
351}
352
353static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
354{
355 return et->et_ops->eo_get_last_eb_blk(et);
356}
357
358static inline void ocfs2_et_update_clusters(struct inode *inode,
359 struct ocfs2_extent_tree *et,
360 u32 clusters)
361{
362 et->et_ops->eo_update_clusters(inode, et, clusters);
363}
364
365static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec)
368{
369 int ret = 0;
370
371 if (et->et_ops->eo_insert_check)
372 ret = et->et_ops->eo_insert_check(inode, et, rec);
373 return ret;
374}
375
376static inline int ocfs2_et_sanity_check(struct inode *inode,
377 struct ocfs2_extent_tree *et)
378{
379 int ret = 0;
380
381 if (et->et_ops->eo_sanity_check)
382 ret = et->et_ops->eo_sanity_check(inode, et);
383 return ret;
384}
385
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 386static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 387static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb); 388 struct ocfs2_extent_block *eb);
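A hypothetical caller, to make the dispatch concrete (none of this appears in the patch; it just exercises the new entry points): initialize an ocfs2_extent_tree over a dinode buffer, then go through the accessors instead of touching di->i_last_eb_blk directly:

    struct ocfs2_extent_tree et;
    u64 last_blk;

    ocfs2_init_dinode_extent_tree(&et, inode, di_bh);  /* et_object = di_bh->b_data,
                                                        * et_ops = ocfs2_dinode_et_ops,
                                                        * root_el = &di->id2.i_list */
    last_blk = ocfs2_et_get_last_eb_blk(&et);          /* -> le64_to_cpu(di->i_last_eb_blk) */
    ocfs2_et_update_clusters(inode, &et, delta);       /* adjusts di->i_clusters and the
                                                        * in-memory ip_clusters */

The same caller code works unchanged against an xattr value root or xattr tree root; only the init function differs. That is the point of the abstraction, and it is what lets the rest of alloc.c below swap struct buffer_head *fe_bh parameters for struct ocfs2_extent_tree *et.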
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
205} 539}
206 540
207/* 541/*
208 * Allocate and initialize a new path based on a disk inode tree.
209 */
210static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211{
212 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213 struct ocfs2_extent_list *el = &di->id2.i_list;
214
215 return ocfs2_new_path(di_bh, el);
216}
217
218/*
219 * Convenience function to journal all components in a path. 542 * Convenience function to journal all components in a path.
220 */ 543 */
221static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 544static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
368 */ 691 */
369int ocfs2_num_free_extents(struct ocfs2_super *osb, 692int ocfs2_num_free_extents(struct ocfs2_super *osb,
370 struct inode *inode, 693 struct inode *inode,
371 struct ocfs2_dinode *fe) 694 struct ocfs2_extent_tree *et)
372{ 695{
373 int retval; 696 int retval;
374 struct ocfs2_extent_list *el; 697 struct ocfs2_extent_list *el = NULL;
375 struct ocfs2_extent_block *eb; 698 struct ocfs2_extent_block *eb;
376 struct buffer_head *eb_bh = NULL; 699 struct buffer_head *eb_bh = NULL;
700 u64 last_eb_blk = 0;
377 701
378 mlog_entry_void(); 702 mlog_entry_void();
379 703
380 if (!OCFS2_IS_VALID_DINODE(fe)) { 704 el = et->et_root_el;
381 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 705 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
382 retval = -EIO;
383 goto bail;
384 }
385 706
386 if (fe->i_last_eb_blk) { 707 if (last_eb_blk) {
387 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 708 retval = ocfs2_read_block(inode, last_eb_blk,
388 &eb_bh, OCFS2_BH_CACHED, inode); 709 &eb_bh);
389 if (retval < 0) { 710 if (retval < 0) {
390 mlog_errno(retval); 711 mlog_errno(retval);
391 goto bail; 712 goto bail;
392 } 713 }
393 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 714 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
394 el = &eb->h_list; 715 el = &eb->h_list;
395 } else 716 }
396 el = &fe->id2.i_list;
397 717
398 BUG_ON(el->l_tree_depth != 0); 718 BUG_ON(el->l_tree_depth != 0);
399 719
400 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 720 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
401bail: 721bail:
402 if (eb_bh) 722 brelse(eb_bh);
403 brelse(eb_bh);
404 723
405 mlog_exit(retval); 724 mlog_exit(retval);
406 return retval; 725 return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
486bail: 805bail:
487 if (status < 0) { 806 if (status < 0) {
488 for(i = 0; i < wanted; i++) { 807 for(i = 0; i < wanted; i++) {
489 if (bhs[i]) 808 brelse(bhs[i]);
490 brelse(bhs[i]);
491 bhs[i] = NULL; 809 bhs[i] = NULL;
492 } 810 }
493 } 811 }
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
531static int ocfs2_add_branch(struct ocfs2_super *osb, 849static int ocfs2_add_branch(struct ocfs2_super *osb,
532 handle_t *handle, 850 handle_t *handle,
533 struct inode *inode, 851 struct inode *inode,
534 struct buffer_head *fe_bh, 852 struct ocfs2_extent_tree *et,
535 struct buffer_head *eb_bh, 853 struct buffer_head *eb_bh,
536 struct buffer_head **last_eb_bh, 854 struct buffer_head **last_eb_bh,
537 struct ocfs2_alloc_context *meta_ac) 855 struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
540 u64 next_blkno, new_last_eb_blk; 858 u64 next_blkno, new_last_eb_blk;
541 struct buffer_head *bh; 859 struct buffer_head *bh;
542 struct buffer_head **new_eb_bhs = NULL; 860 struct buffer_head **new_eb_bhs = NULL;
543 struct ocfs2_dinode *fe;
544 struct ocfs2_extent_block *eb; 861 struct ocfs2_extent_block *eb;
545 struct ocfs2_extent_list *eb_el; 862 struct ocfs2_extent_list *eb_el;
546 struct ocfs2_extent_list *el; 863 struct ocfs2_extent_list *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
550 867
551 BUG_ON(!last_eb_bh || !*last_eb_bh); 868 BUG_ON(!last_eb_bh || !*last_eb_bh);
552 869
553 fe = (struct ocfs2_dinode *) fe_bh->b_data;
554
555 if (eb_bh) { 870 if (eb_bh) {
556 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 871 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
557 el = &eb->h_list; 872 el = &eb->h_list;
558 } else 873 } else
559 el = &fe->id2.i_list; 874 el = et->et_root_el;
560 875
561 /* we never add a branch to a leaf. */ 876 /* we never add a branch to a leaf. */
562 BUG_ON(!el->l_tree_depth); 877 BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
646 mlog_errno(status); 961 mlog_errno(status);
647 goto bail; 962 goto bail;
648 } 963 }
649 status = ocfs2_journal_access(handle, inode, fe_bh, 964 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
650 OCFS2_JOURNAL_ACCESS_WRITE); 965 OCFS2_JOURNAL_ACCESS_WRITE);
651 if (status < 0) { 966 if (status < 0) {
652 mlog_errno(status); 967 mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
662 } 977 }
663 978
664 /* Link the new branch into the rest of the tree (el will 979 /* Link the new branch into the rest of the tree (el will
665 * either be on the fe, or the extent block passed in. */ 980 * either be on the root_bh, or the extent block passed in. */
666 i = le16_to_cpu(el->l_next_free_rec); 981 i = le16_to_cpu(el->l_next_free_rec);
667 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 982 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
668 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 983 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
671 986
672 /* fe needs a new last extent block pointer, as does the 987 /* fe needs a new last extent block pointer, as does the
673 * next_leaf on the previously last-extent-block. */ 988 * next_leaf on the previously last-extent-block. */
674 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 989 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
675 990
676 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 991 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
677 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 992 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
679 status = ocfs2_journal_dirty(handle, *last_eb_bh); 994 status = ocfs2_journal_dirty(handle, *last_eb_bh);
680 if (status < 0) 995 if (status < 0)
681 mlog_errno(status); 996 mlog_errno(status);
682 status = ocfs2_journal_dirty(handle, fe_bh); 997 status = ocfs2_journal_dirty(handle, et->et_root_bh);
683 if (status < 0) 998 if (status < 0)
684 mlog_errno(status); 999 mlog_errno(status);
685 if (eb_bh) { 1000 if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
700bail: 1015bail:
701 if (new_eb_bhs) { 1016 if (new_eb_bhs) {
702 for (i = 0; i < new_blocks; i++) 1017 for (i = 0; i < new_blocks; i++)
703 if (new_eb_bhs[i]) 1018 brelse(new_eb_bhs[i]);
704 brelse(new_eb_bhs[i]);
705 kfree(new_eb_bhs); 1019 kfree(new_eb_bhs);
706 } 1020 }
707 1021
@@ -717,16 +1031,15 @@ bail:
717static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1031static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
718 handle_t *handle, 1032 handle_t *handle,
719 struct inode *inode, 1033 struct inode *inode,
720 struct buffer_head *fe_bh, 1034 struct ocfs2_extent_tree *et,
721 struct ocfs2_alloc_context *meta_ac, 1035 struct ocfs2_alloc_context *meta_ac,
722 struct buffer_head **ret_new_eb_bh) 1036 struct buffer_head **ret_new_eb_bh)
723{ 1037{
724 int status, i; 1038 int status, i;
725 u32 new_clusters; 1039 u32 new_clusters;
726 struct buffer_head *new_eb_bh = NULL; 1040 struct buffer_head *new_eb_bh = NULL;
727 struct ocfs2_dinode *fe;
728 struct ocfs2_extent_block *eb; 1041 struct ocfs2_extent_block *eb;
729 struct ocfs2_extent_list *fe_el; 1042 struct ocfs2_extent_list *root_el;
730 struct ocfs2_extent_list *eb_el; 1043 struct ocfs2_extent_list *eb_el;
731 1044
732 mlog_entry_void(); 1045 mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
746 } 1059 }
747 1060
748 eb_el = &eb->h_list; 1061 eb_el = &eb->h_list;
749 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1062 root_el = et->et_root_el;
750 fe_el = &fe->id2.i_list;
751 1063
752 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1064 status = ocfs2_journal_access(handle, inode, new_eb_bh,
753 OCFS2_JOURNAL_ACCESS_CREATE); 1065 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
756 goto bail; 1068 goto bail;
757 } 1069 }
758 1070
759 /* copy the fe data into the new extent block */ 1071 /* copy the root extent list data into the new extent block */
760 eb_el->l_tree_depth = fe_el->l_tree_depth; 1072 eb_el->l_tree_depth = root_el->l_tree_depth;
761 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 1073 eb_el->l_next_free_rec = root_el->l_next_free_rec;
762 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1074 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
763 eb_el->l_recs[i] = fe_el->l_recs[i]; 1075 eb_el->l_recs[i] = root_el->l_recs[i];
764 1076
765 status = ocfs2_journal_dirty(handle, new_eb_bh); 1077 status = ocfs2_journal_dirty(handle, new_eb_bh);
766 if (status < 0) { 1078 if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
768 goto bail; 1080 goto bail;
769 } 1081 }
770 1082
771 status = ocfs2_journal_access(handle, inode, fe_bh, 1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
772 OCFS2_JOURNAL_ACCESS_WRITE); 1084 OCFS2_JOURNAL_ACCESS_WRITE);
773 if (status < 0) { 1085 if (status < 0) {
774 mlog_errno(status); 1086 mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
777 1089
778 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 1090 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
779 1091
780 /* update fe now */ 1092 /* update root_bh now */
781 le16_add_cpu(&fe_el->l_tree_depth, 1); 1093 le16_add_cpu(&root_el->l_tree_depth, 1);
782 fe_el->l_recs[0].e_cpos = 0; 1094 root_el->l_recs[0].e_cpos = 0;
783 fe_el->l_recs[0].e_blkno = eb->h_blkno; 1095 root_el->l_recs[0].e_blkno = eb->h_blkno;
784 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 1096 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
785 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1097 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
786 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 1098 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
787 fe_el->l_next_free_rec = cpu_to_le16(1); 1099 root_el->l_next_free_rec = cpu_to_le16(1);
788 1100
789 /* If this is our 1st tree depth shift, then last_eb_blk 1101 /* If this is our 1st tree depth shift, then last_eb_blk
790 * becomes the allocated extent block */ 1102 * becomes the allocated extent block */
791 if (fe_el->l_tree_depth == cpu_to_le16(1)) 1103 if (root_el->l_tree_depth == cpu_to_le16(1))
792 fe->i_last_eb_blk = eb->h_blkno; 1104 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
793 1105
794 status = ocfs2_journal_dirty(handle, fe_bh); 1106 status = ocfs2_journal_dirty(handle, et->et_root_bh);
795 if (status < 0) { 1107 if (status < 0) {
796 mlog_errno(status); 1108 mlog_errno(status);
797 goto bail; 1109 goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
801 new_eb_bh = NULL; 1113 new_eb_bh = NULL;
802 status = 0; 1114 status = 0;
803bail: 1115bail:
804 if (new_eb_bh) 1116 brelse(new_eb_bh);
805 brelse(new_eb_bh);
806 1117
807 mlog_exit(status); 1118 mlog_exit(status);
808 return status; 1119 return status;
@@ -817,22 +1128,21 @@ bail:
817 * 1) a lowest extent block is found, then we pass it back in 1128 * 1) a lowest extent block is found, then we pass it back in
818 * *lowest_eb_bh and return '0' 1129 * *lowest_eb_bh and return '0'
819 * 1130 *
820 * 2) the search fails to find anything, but the dinode has room. We 1131 * 2) the search fails to find anything, but the root_el has room. We
821 * pass NULL back in *lowest_eb_bh, but still return '0' 1132 * pass NULL back in *lowest_eb_bh, but still return '0'
822 * 1133 *
823 * 3) the search fails to find anything AND the dinode is full, in 1134 * 3) the search fails to find anything AND the root_el is full, in
824 * which case we return > 0 1135 * which case we return > 0
825 * 1136 *
826 * return status < 0 indicates an error. 1137 * return status < 0 indicates an error.
827 */ 1138 */
828static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1139static int ocfs2_find_branch_target(struct ocfs2_super *osb,
829 struct inode *inode, 1140 struct inode *inode,
830 struct buffer_head *fe_bh, 1141 struct ocfs2_extent_tree *et,
831 struct buffer_head **target_bh) 1142 struct buffer_head **target_bh)
832{ 1143{
833 int status = 0, i; 1144 int status = 0, i;
834 u64 blkno; 1145 u64 blkno;
835 struct ocfs2_dinode *fe;
836 struct ocfs2_extent_block *eb; 1146 struct ocfs2_extent_block *eb;
837 struct ocfs2_extent_list *el; 1147 struct ocfs2_extent_list *el;
838 struct buffer_head *bh = NULL; 1148 struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
842 1152
843 *target_bh = NULL; 1153 *target_bh = NULL;
844 1154
845 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1155 el = et->et_root_el;
846 el = &fe->id2.i_list;
847 1156
848 while(le16_to_cpu(el->l_tree_depth) > 1) { 1157 while(le16_to_cpu(el->l_tree_depth) > 1) {
849 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1158 if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
864 goto bail; 1173 goto bail;
865 } 1174 }
866 1175
867 if (bh) { 1176 brelse(bh);
868 brelse(bh); 1177 bh = NULL;
869 bh = NULL;
870 }
871 1178
872 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 1179 status = ocfs2_read_block(inode, blkno, &bh);
873 inode);
874 if (status < 0) { 1180 if (status < 0) {
875 mlog_errno(status); 1181 mlog_errno(status);
876 goto bail; 1182 goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
886 1192
887 if (le16_to_cpu(el->l_next_free_rec) < 1193 if (le16_to_cpu(el->l_next_free_rec) <
888 le16_to_cpu(el->l_count)) { 1194 le16_to_cpu(el->l_count)) {
889 if (lowest_bh) 1195 brelse(lowest_bh);
890 brelse(lowest_bh);
891 lowest_bh = bh; 1196 lowest_bh = bh;
892 get_bh(lowest_bh); 1197 get_bh(lowest_bh);
893 } 1198 }
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
895 1200
896 /* If we didn't find one and the fe doesn't have any room, 1201 /* If we didn't find one and the fe doesn't have any room,
897 * then return '1' */ 1202 * then return '1' */
898 if (!lowest_bh 1203 el = et->et_root_el;
899 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 1204 if (!lowest_bh && (el->l_next_free_rec == el->l_count))
900 status = 1; 1205 status = 1;
901 1206
902 *target_bh = lowest_bh; 1207 *target_bh = lowest_bh;
903bail: 1208bail:
904 if (bh) 1209 brelse(bh);
905 brelse(bh);
906 1210
907 mlog_exit(status); 1211 mlog_exit(status);
908 return status; 1212 return status;
@@ -919,19 +1223,19 @@ bail:
919 * *last_eb_bh will be updated by ocfs2_add_branch(). 1223 * *last_eb_bh will be updated by ocfs2_add_branch().
920 */ 1224 */
921static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1225static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
922 struct buffer_head *di_bh, int *final_depth, 1226 struct ocfs2_extent_tree *et, int *final_depth,
923 struct buffer_head **last_eb_bh, 1227 struct buffer_head **last_eb_bh,
924 struct ocfs2_alloc_context *meta_ac) 1228 struct ocfs2_alloc_context *meta_ac)
925{ 1229{
926 int ret, shift; 1230 int ret, shift;
927 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1231 struct ocfs2_extent_list *el = et->et_root_el;
928 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); 1232 int depth = le16_to_cpu(el->l_tree_depth);
929 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
930 struct buffer_head *bh = NULL; 1234 struct buffer_head *bh = NULL;
931 1235
932 BUG_ON(meta_ac == NULL); 1236 BUG_ON(meta_ac == NULL);
933 1237
934 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); 1238 shift = ocfs2_find_branch_target(osb, inode, et, &bh);
935 if (shift < 0) { 1239 if (shift < 0) {
936 ret = shift; 1240 ret = shift;
937 mlog_errno(ret); 1241 mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
948 /* ocfs2_shift_tree_depth will return us a buffer with 1252 /* ocfs2_shift_tree_depth will return us a buffer with
949 * the new extent block (so we can pass that to 1253 * the new extent block (so we can pass that to
950 * ocfs2_add_branch). */ 1254 * ocfs2_add_branch). */
951 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, 1255 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
952 meta_ac, &bh); 1256 meta_ac, &bh);
953 if (ret < 0) { 1257 if (ret < 0) {
954 mlog_errno(ret); 1258 mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
975 /* call ocfs2_add_branch to add the final part of the tree with 1279 /* call ocfs2_add_branch to add the final part of the tree with
976 * the new data. */ 1280 * the new data. */
977 mlog(0, "add branch. bh = %p\n", bh); 1281 mlog(0, "add branch. bh = %p\n", bh);
978 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, 1282 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
979 meta_ac); 1283 meta_ac);
980 if (ret < 0) { 1284 if (ret < 0) {
981 mlog_errno(ret); 1285 mlog_errno(ret);
@@ -990,15 +1294,6 @@ out:
990} 1294}
991 1295
992/* 1296/*
993 * This is only valid for leaf nodes, which are the only ones that can
994 * have empty extents anyway.
995 */
996static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
997{
998 return !rec->e_leaf_clusters;
999}
1000
1001/*
1002 * This function will discard the rightmost extent record. 1297 * This function will discard the rightmost extent record.
1003 */ 1298 */
1004static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1299static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
@@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
1245 1540
1246 brelse(bh); 1541 brelse(bh);
1247 bh = NULL; 1542 bh = NULL;
1248 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1543 ret = ocfs2_read_block(inode, blkno, &bh);
1249 &bh, OCFS2_BH_CACHED, inode);
1250 if (ret) { 1544 if (ret) {
1251 mlog_errno(ret); 1545 mlog_errno(ret);
1252 goto out; 1546 goto out;
@@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *right_path, 2361 struct ocfs2_path *right_path,
2068 int subtree_index, 2362 int subtree_index,
2069 struct ocfs2_cached_dealloc_ctxt *dealloc, 2363 struct ocfs2_cached_dealloc_ctxt *dealloc,
2070 int *deleted) 2364 int *deleted,
2365 struct ocfs2_extent_tree *et)
2071{ 2366{
2072 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2367 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2073 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); 2368 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2074 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2075 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2369 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2076 struct ocfs2_extent_block *eb; 2370 struct ocfs2_extent_block *eb;
2077 2371
@@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2123 * We have to update i_last_eb_blk during the meta 2417 * We have to update i_last_eb_blk during the meta
2124 * data delete. 2418 * data delete.
2125 */ 2419 */
2126 ret = ocfs2_journal_access(handle, inode, di_bh, 2420 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2127 OCFS2_JOURNAL_ACCESS_WRITE); 2421 OCFS2_JOURNAL_ACCESS_WRITE);
2128 if (ret) { 2422 if (ret) {
2129 mlog_errno(ret); 2423 mlog_errno(ret);
@@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2198 ocfs2_update_edge_lengths(inode, handle, left_path); 2492 ocfs2_update_edge_lengths(inode, handle, left_path);
2199 2493
2200 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2494 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2201 di->i_last_eb_blk = eb->h_blkno; 2495 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2202 2496
2203 /* 2497 /*
2204 * Removal of the extent in the left leaf was skipped 2498 * Removal of the extent in the left leaf was skipped
@@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2208 if (right_has_empty) 2502 if (right_has_empty)
2209 ocfs2_remove_empty_extent(left_leaf_el); 2503 ocfs2_remove_empty_extent(left_leaf_el);
2210 2504
2211 ret = ocfs2_journal_dirty(handle, di_bh); 2505 ret = ocfs2_journal_dirty(handle, et_root_bh);
2212 if (ret) 2506 if (ret)
2213 mlog_errno(ret); 2507 mlog_errno(ret);
2214 2508
@@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2331 handle_t *handle, int orig_credits, 2625 handle_t *handle, int orig_credits,
2332 struct ocfs2_path *path, 2626 struct ocfs2_path *path,
2333 struct ocfs2_cached_dealloc_ctxt *dealloc, 2627 struct ocfs2_cached_dealloc_ctxt *dealloc,
2334 struct ocfs2_path **empty_extent_path) 2628 struct ocfs2_path **empty_extent_path,
2629 struct ocfs2_extent_tree *et)
2335{ 2630{
2336 int ret, subtree_root, deleted; 2631 int ret, subtree_root, deleted;
2337 u32 right_cpos; 2632 u32 right_cpos;
@@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2404 2699
2405 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2700 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2406 right_path, subtree_root, 2701 right_path, subtree_root,
2407 dealloc, &deleted); 2702 dealloc, &deleted, et);
2408 if (ret == -EAGAIN) { 2703 if (ret == -EAGAIN) {
2409 /* 2704 /*
2410 * The rotation has to temporarily stop due to 2705 * The rotation has to temporarily stop due to
@@ -2447,29 +2742,20 @@ out:
2447} 2742}
2448 2743
2449static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2744static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2450 struct ocfs2_path *path, 2745 struct ocfs2_path *path,
2451 struct ocfs2_cached_dealloc_ctxt *dealloc) 2746 struct ocfs2_cached_dealloc_ctxt *dealloc,
2747 struct ocfs2_extent_tree *et)
2452{ 2748{
2453 int ret, subtree_index; 2749 int ret, subtree_index;
2454 u32 cpos; 2750 u32 cpos;
2455 struct ocfs2_path *left_path = NULL; 2751 struct ocfs2_path *left_path = NULL;
2456 struct ocfs2_dinode *di;
2457 struct ocfs2_extent_block *eb; 2752 struct ocfs2_extent_block *eb;
2458 struct ocfs2_extent_list *el; 2753 struct ocfs2_extent_list *el;
2459 2754
2460 /*
2461 * XXX: This code assumes that the root is an inode, which is
2462 * true for now but may change as tree code gets generic.
2463 */
2464 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2465 if (!OCFS2_IS_VALID_DINODE(di)) {
2466 ret = -EIO;
2467 ocfs2_error(inode->i_sb,
2468 "Inode %llu has invalid path root",
2469 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2470 goto out;
2471 }
2472 2755
2756 ret = ocfs2_et_sanity_check(inode, et);
2757 if (ret)
2758 goto out;
2473 /* 2759 /*
2474 * There's two ways we handle this depending on 2760 * There's two ways we handle this depending on
2475 * whether path is the only existing one. 2761 * whether path is the only existing one.
@@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2526 ocfs2_update_edge_lengths(inode, handle, left_path); 2812 ocfs2_update_edge_lengths(inode, handle, left_path);
2527 2813
2528 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2814 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2529 di->i_last_eb_blk = eb->h_blkno; 2815 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2530 } else { 2816 } else {
2531 /* 2817 /*
2532 * 'path' is also the leftmost path which 2818 * 'path' is also the leftmost path which
@@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2537 */ 2823 */
2538 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2824 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2539 2825
2540 el = &di->id2.i_list; 2826 el = et->et_root_el;
2541 el->l_tree_depth = 0; 2827 el->l_tree_depth = 0;
2542 el->l_next_free_rec = 0; 2828 el->l_next_free_rec = 0;
2543 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2829 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2544 2830
2545 di->i_last_eb_blk = 0; 2831 ocfs2_et_set_last_eb_blk(et, 0);
2546 } 2832 }
2547 2833
2548 ocfs2_journal_dirty(handle, path_root_bh(path)); 2834 ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2570,7 +2856,8 @@ out:
2570 */ 2856 */
2571static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2857static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2572 struct ocfs2_path *path, 2858 struct ocfs2_path *path,
2573 struct ocfs2_cached_dealloc_ctxt *dealloc) 2859 struct ocfs2_cached_dealloc_ctxt *dealloc,
2860 struct ocfs2_extent_tree *et)
2574{ 2861{
2575 int ret, orig_credits = handle->h_buffer_credits; 2862 int ret, orig_credits = handle->h_buffer_credits;
2576 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2863 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2584 if (path->p_tree_depth == 0) { 2871 if (path->p_tree_depth == 0) {
2585rightmost_no_delete: 2872rightmost_no_delete:
2586 /* 2873 /*
2587 * In-inode extents. This is trivially handled, so do 2874 * Inline extents. This is trivially handled, so do
2588 * it up front. 2875 * it up front.
2589 */ 2876 */
2590 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2638,7 +2925,7 @@ rightmost_no_delete:
2638 */ 2925 */
2639 2926
2640 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2927 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2641 dealloc); 2928 dealloc, et);
2642 if (ret) 2929 if (ret)
2643 mlog_errno(ret); 2930 mlog_errno(ret);
2644 goto out; 2931 goto out;
@@ -2650,7 +2937,7 @@ rightmost_no_delete:
2650 */ 2937 */
2651try_rotate: 2938try_rotate:
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2939 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2653 dealloc, &restart_path); 2940 dealloc, &restart_path, et);
2654 if (ret && ret != -EAGAIN) { 2941 if (ret && ret != -EAGAIN) {
2655 mlog_errno(ret); 2942 mlog_errno(ret);
2656 goto out; 2943 goto out;
@@ -2662,7 +2949,7 @@ try_rotate:
2662 2949
2663 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 2950 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2664 tmp_path, dealloc, 2951 tmp_path, dealloc,
2665 &restart_path); 2952 &restart_path, et);
2666 if (ret && ret != -EAGAIN) { 2953 if (ret && ret != -EAGAIN) {
2667 mlog_errno(ret); 2954 mlog_errno(ret);
2668 goto out; 2955 goto out;
@@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
2948 handle_t *handle, 3235 handle_t *handle,
2949 struct ocfs2_extent_rec *split_rec, 3236 struct ocfs2_extent_rec *split_rec,
2950 struct ocfs2_cached_dealloc_ctxt *dealloc, 3237 struct ocfs2_cached_dealloc_ctxt *dealloc,
3238 struct ocfs2_extent_tree *et,
2951 int index) 3239 int index)
2952{ 3240{
2953 int ret, i, subtree_index = 0, has_empty_extent = 0; 3241 int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3068 le16_to_cpu(el->l_next_free_rec) == 1) { 3356 le16_to_cpu(el->l_next_free_rec) == 1) {
3069 3357
3070 ret = ocfs2_remove_rightmost_path(inode, handle, 3358 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc); 3359 right_path,
3360 dealloc, et);
3072 if (ret) { 3361 if (ret) {
3073 mlog_errno(ret); 3362 mlog_errno(ret);
3074 goto out; 3363 goto out;
@@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3095 int split_index, 3384 int split_index,
3096 struct ocfs2_extent_rec *split_rec, 3385 struct ocfs2_extent_rec *split_rec,
3097 struct ocfs2_cached_dealloc_ctxt *dealloc, 3386 struct ocfs2_cached_dealloc_ctxt *dealloc,
3098 struct ocfs2_merge_ctxt *ctxt) 3387 struct ocfs2_merge_ctxt *ctxt,
3388 struct ocfs2_extent_tree *et)
3099 3389
3100{ 3390{
3101 int ret = 0; 3391 int ret = 0;
@@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3113 * illegal. 3403 * illegal.
3114 */ 3404 */
3115 ret = ocfs2_rotate_tree_left(inode, handle, path, 3405 ret = ocfs2_rotate_tree_left(inode, handle, path,
3116 dealloc); 3406 dealloc, et);
3117 if (ret) { 3407 if (ret) {
3118 mlog_errno(ret); 3408 mlog_errno(ret);
3119 goto out; 3409 goto out;
@@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3446 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3157 3447
3158 /* The merge left us with an empty extent, remove it. */ 3448 /* The merge left us with an empty extent, remove it. */
3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 3449 ret = ocfs2_rotate_tree_left(inode, handle, path,
3450 dealloc, et);
3160 if (ret) { 3451 if (ret) {
3161 mlog_errno(ret); 3452 mlog_errno(ret);
3162 goto out; 3453 goto out;
@@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3170 */ 3461 */
3171 ret = ocfs2_merge_rec_left(inode, path, 3462 ret = ocfs2_merge_rec_left(inode, path,
3172 handle, rec, 3463 handle, rec,
3173 dealloc, 3464 dealloc, et,
3174 split_index); 3465 split_index);
3175 3466
3176 if (ret) { 3467 if (ret) {
@@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3179 } 3470 }
3180 3471
3181 ret = ocfs2_rotate_tree_left(inode, handle, path, 3472 ret = ocfs2_rotate_tree_left(inode, handle, path,
3182 dealloc); 3473 dealloc, et);
3183 /* 3474 /*
3184 * Error from this last rotate is not critical, so 3475 * Error from this last rotate is not critical, so
3185 * print but don't bubble it up. 3476 * print but don't bubble it up.
@@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3199 ret = ocfs2_merge_rec_left(inode, 3490 ret = ocfs2_merge_rec_left(inode,
3200 path, 3491 path,
3201 handle, split_rec, 3492 handle, split_rec,
3202 dealloc, 3493 dealloc, et,
3203 split_index); 3494 split_index);
3204 if (ret) { 3495 if (ret) {
3205 mlog_errno(ret); 3496 mlog_errno(ret);
@@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3222 * our leaf. Try to rotate it away. 3513 * our leaf. Try to rotate it away.
3223 */ 3514 */
3224 ret = ocfs2_rotate_tree_left(inode, handle, path, 3515 ret = ocfs2_rotate_tree_left(inode, handle, path,
3225 dealloc); 3516 dealloc, et);
3226 if (ret) 3517 if (ret)
3227 mlog_errno(ret); 3518 mlog_errno(ret);
3228 ret = 0; 3519 ret = 0;
@@ -3356,16 +3647,6 @@ rotate:
3356 ocfs2_rotate_leaf(el, insert_rec); 3647 ocfs2_rotate_leaf(el, insert_rec);
3357} 3648}
3358 3649
3359static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3360 struct ocfs2_dinode *di,
3361 u32 clusters)
3362{
3363 le32_add_cpu(&di->i_clusters, clusters);
3364 spin_lock(&OCFS2_I(inode)->ip_lock);
3365 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3366 spin_unlock(&OCFS2_I(inode)->ip_lock);
3367}
3368
3369static void ocfs2_adjust_rightmost_records(struct inode *inode, 3650static void ocfs2_adjust_rightmost_records(struct inode *inode,
3370 handle_t *handle, 3651 handle_t *handle,
3371 struct ocfs2_path *path, 3652 struct ocfs2_path *path,
@@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
3567} 3848}
3568 3849
3569/* 3850/*
3570 * This function only does inserts on an allocation b-tree. For dinode 3851 * This function only does inserts on an allocation b-tree. For tree
3571 * lists, ocfs2_insert_at_leaf() is called directly. 3852 * depth = 0, ocfs2_insert_at_leaf() is called directly.
3572 * 3853 *
3573 * right_path is the path we want to do the actual insert 3854 * right_path is the path we want to do the actual insert
3574 * in. left_path should only be passed in if we need to update that 3855 * in. left_path should only be passed in if we need to update that
@@ -3665,7 +3946,7 @@ out:
3665 3946
3666static int ocfs2_do_insert_extent(struct inode *inode, 3947static int ocfs2_do_insert_extent(struct inode *inode,
3667 handle_t *handle, 3948 handle_t *handle,
3668 struct buffer_head *di_bh, 3949 struct ocfs2_extent_tree *et,
3669 struct ocfs2_extent_rec *insert_rec, 3950 struct ocfs2_extent_rec *insert_rec,
3670 struct ocfs2_insert_type *type) 3951 struct ocfs2_insert_type *type)
3671{ 3952{
@@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3673 u32 cpos; 3954 u32 cpos;
3674 struct ocfs2_path *right_path = NULL; 3955 struct ocfs2_path *right_path = NULL;
3675 struct ocfs2_path *left_path = NULL; 3956 struct ocfs2_path *left_path = NULL;
3676 struct ocfs2_dinode *di;
3677 struct ocfs2_extent_list *el; 3957 struct ocfs2_extent_list *el;
3678 3958
3679 di = (struct ocfs2_dinode *) di_bh->b_data; 3959 el = et->et_root_el;
3680 el = &di->id2.i_list;
3681 3960
3682 ret = ocfs2_journal_access(handle, inode, di_bh, 3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3683 OCFS2_JOURNAL_ACCESS_WRITE); 3962 OCFS2_JOURNAL_ACCESS_WRITE);
3684 if (ret) { 3963 if (ret) {
3685 mlog_errno(ret); 3964 mlog_errno(ret);
@@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3691 goto out_update_clusters; 3970 goto out_update_clusters;
3692 } 3971 }
3693 3972
3694 right_path = ocfs2_new_inode_path(di_bh); 3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3695 if (!right_path) { 3974 if (!right_path) {
3696 ret = -ENOMEM; 3975 ret = -ENOMEM;
3697 mlog_errno(ret); 3976 mlog_errno(ret);
@@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3741 * ocfs2_rotate_tree_right() might have extended the 4020 * ocfs2_rotate_tree_right() might have extended the
3742 * transaction without re-journaling our tree root. 4021 * transaction without re-journaling our tree root.
3743 */ 4022 */
3744 ret = ocfs2_journal_access(handle, inode, di_bh, 4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3745 OCFS2_JOURNAL_ACCESS_WRITE); 4024 OCFS2_JOURNAL_ACCESS_WRITE);
3746 if (ret) { 4025 if (ret) {
3747 mlog_errno(ret); 4026 mlog_errno(ret);
@@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3766 4045
3767out_update_clusters: 4046out_update_clusters:
3768 if (type->ins_split == SPLIT_NONE) 4047 if (type->ins_split == SPLIT_NONE)
3769 ocfs2_update_dinode_clusters(inode, di, 4048 ocfs2_et_update_clusters(inode, et,
3770 le16_to_cpu(insert_rec->e_leaf_clusters)); 4049 le16_to_cpu(insert_rec->e_leaf_clusters));
3771 4050
3772 ret = ocfs2_journal_dirty(handle, di_bh); 4051 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
3773 if (ret) 4052 if (ret)
3774 mlog_errno(ret); 4053 mlog_errno(ret);
3775 4054
@@ -3899,7 +4178,8 @@ out:
3899static void ocfs2_figure_contig_type(struct inode *inode, 4178static void ocfs2_figure_contig_type(struct inode *inode,
3900 struct ocfs2_insert_type *insert, 4179 struct ocfs2_insert_type *insert,
3901 struct ocfs2_extent_list *el, 4180 struct ocfs2_extent_list *el,
3902 struct ocfs2_extent_rec *insert_rec) 4181 struct ocfs2_extent_rec *insert_rec,
4182 struct ocfs2_extent_tree *et)
3903{ 4183{
3904 int i; 4184 int i;
3905 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4185 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3915 } 4195 }
3916 } 4196 }
3917 insert->ins_contig = contig_type; 4197 insert->ins_contig = contig_type;
4198
4199 if (insert->ins_contig != CONTIG_NONE) {
4200 struct ocfs2_extent_rec *rec =
4201 &el->l_recs[insert->ins_contig_index];
4202 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4203 le16_to_cpu(insert_rec->e_leaf_clusters);
4204
4205 /*
4206 * The caller might want us to limit the size of extents; don't
4207 * calculate contiguousness if we might exceed that limit.
4208 */
4209 if (et->et_max_leaf_clusters &&
4210 (len > et->et_max_leaf_clusters))
4211 insert->ins_contig = CONTIG_NONE;
4212 }
3918} 4213}
3919 4214
3920/* 4215/*
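
The new et_max_leaf_clusters guard only matters for trees that set a cap (the xattr tree caps its leaves at OCFS2_MAX_XATTR_TREE_LEAF_SIZE; dinode trees leave it zero). Below is a standalone sketch of the check; the cluster size and record lengths are hypothetical, not taken from the patch:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical: 64K xattr leaf limit, 4K clusters -> 16 clusters. */
            unsigned int max_leaf_clusters = 65536 / 4096;
            unsigned int rec_clusters = 12;    /* existing contiguous record */
            unsigned int insert_clusters = 8;  /* record being inserted */
            unsigned int len = rec_clusters + insert_clusters;

            /* Mirrors: if (et->et_max_leaf_clusters && len > et->et_max_leaf_clusters) */
            if (max_leaf_clusters && len > max_leaf_clusters)
                    printf("%u > %u: merge rejected, ins_contig = CONTIG_NONE\n",
                           len, max_leaf_clusters);
            else
                    printf("records remain merge candidates\n");
            return 0;
    }
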
@@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3923 * ocfs2_figure_appending_type() will figure out whether we'll have to 4218 * ocfs2_figure_appending_type() will figure out whether we'll have to
3924 * insert at the tail of the rightmost leaf. 4219 * insert at the tail of the rightmost leaf.
3925 * 4220 *
3926 * This should also work against the dinode list for trees with 0 4221 * This should also work against the root extent list for trees with 0
3927 * depth. If we consider the dinode list to be the rightmost leaf node 4222 * depth. If we consider the root extent list to be the rightmost leaf node
3928 * then the logic here makes sense. 4223 * then the logic here makes sense.
3929 */ 4224 */
3930static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4225static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3975,14 +4270,13 @@ set_tail_append:
3975 * structure. 4270 * structure.
3976 */ 4271 */
3977static int ocfs2_figure_insert_type(struct inode *inode, 4272static int ocfs2_figure_insert_type(struct inode *inode,
3978 struct buffer_head *di_bh, 4273 struct ocfs2_extent_tree *et,
3979 struct buffer_head **last_eb_bh, 4274 struct buffer_head **last_eb_bh,
3980 struct ocfs2_extent_rec *insert_rec, 4275 struct ocfs2_extent_rec *insert_rec,
3981 int *free_records, 4276 int *free_records,
3982 struct ocfs2_insert_type *insert) 4277 struct ocfs2_insert_type *insert)
3983{ 4278{
3984 int ret; 4279 int ret;
3985 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3986 struct ocfs2_extent_block *eb; 4280 struct ocfs2_extent_block *eb;
3987 struct ocfs2_extent_list *el; 4281 struct ocfs2_extent_list *el;
3988 struct ocfs2_path *path = NULL; 4282 struct ocfs2_path *path = NULL;
@@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3990 4284
3991 insert->ins_split = SPLIT_NONE; 4285 insert->ins_split = SPLIT_NONE;
3992 4286
3993 el = &di->id2.i_list; 4287 el = et->et_root_el;
3994 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4288 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3995 4289
3996 if (el->l_tree_depth) { 4290 if (el->l_tree_depth) {
@@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4000 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4294 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4001 * may want it later. 4295 * may want it later.
4002 */ 4296 */
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
4004 le64_to_cpu(di->i_last_eb_blk), &bh,
4005 OCFS2_BH_CACHED, inode);
4006 if (ret) { 4298 if (ret) {
4007 mlog_exit(ret); 4299 mlog_exit(ret);
4008 goto out; 4300 goto out;
@@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4023 le16_to_cpu(el->l_next_free_rec); 4315 le16_to_cpu(el->l_next_free_rec);
4024 4316
4025 if (!insert->ins_tree_depth) { 4317 if (!insert->ins_tree_depth) {
4026 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4318 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4027 ocfs2_figure_appending_type(insert, el, insert_rec); 4319 ocfs2_figure_appending_type(insert, el, insert_rec);
4028 return 0; 4320 return 0;
4029 } 4321 }
4030 4322
4031 path = ocfs2_new_inode_path(di_bh); 4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4032 if (!path) { 4324 if (!path) {
4033 ret = -ENOMEM; 4325 ret = -ENOMEM;
4034 mlog_errno(ret); 4326 mlog_errno(ret);
@@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4057 * into two types of appends: simple record append, or a 4349 * into two types of appends: simple record append, or a
4058 * rotate inside the tail leaf. 4350 * rotate inside the tail leaf.
4059 */ 4351 */
4060 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4352 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4061 4353
4062 /* 4354 /*
4063 * The insert code isn't quite ready to deal with all cases of 4355 * The insert code isn't quite ready to deal with all cases of
@@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4078 * the case that we're doing a tail append, so maybe we can 4370 * the case that we're doing a tail append, so maybe we can
4079 * take advantage of that information somehow. 4371 * take advantage of that information somehow.
4080 */ 4372 */
4081 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 4373 if (ocfs2_et_get_last_eb_blk(et) ==
4374 path_leaf_bh(path)->b_blocknr) {
4082 /* 4375 /*
4083 * Ok, ocfs2_find_path() returned us the rightmost 4376 * Ok, ocfs2_find_path() returned us the rightmost
4084 * tree path. This might be an appending insert. There are 4377 * tree path. This might be an appending insert. There are
@@ -4108,7 +4401,7 @@ out:
4108int ocfs2_insert_extent(struct ocfs2_super *osb, 4401int ocfs2_insert_extent(struct ocfs2_super *osb,
4109 handle_t *handle, 4402 handle_t *handle,
4110 struct inode *inode, 4403 struct inode *inode,
4111 struct buffer_head *fe_bh, 4404 struct ocfs2_extent_tree *et,
4112 u32 cpos, 4405 u32 cpos,
4113 u64 start_blk, 4406 u64 start_blk,
4114 u32 new_clusters, 4407 u32 new_clusters,
@@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4121 struct ocfs2_insert_type insert = {0, }; 4414 struct ocfs2_insert_type insert = {0, };
4122 struct ocfs2_extent_rec rec; 4415 struct ocfs2_extent_rec rec;
4123 4416
4124 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4125
4126 mlog(0, "add %u clusters at position %u to inode %llu\n", 4417 mlog(0, "add %u clusters at position %u to inode %llu\n",
4127 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4418 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4128 4419
4129 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4130 (OCFS2_I(inode)->ip_clusters != cpos),
4131 "Device %s, asking for sparse allocation: inode %llu, "
4132 "cpos %u, clusters %u\n",
4133 osb->dev_str,
4134 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4135 OCFS2_I(inode)->ip_clusters);
4136
4137 memset(&rec, 0, sizeof(rec)); 4420 memset(&rec, 0, sizeof(rec));
4138 rec.e_cpos = cpu_to_le32(cpos); 4421 rec.e_cpos = cpu_to_le32(cpos);
4139 rec.e_blkno = cpu_to_le64(start_blk); 4422 rec.e_blkno = cpu_to_le64(start_blk);
4140 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4423 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4141 rec.e_flags = flags; 4424 rec.e_flags = flags;
4425 status = ocfs2_et_insert_check(inode, et, &rec);
4426 if (status) {
4427 mlog_errno(status);
4428 goto bail;
4429 }
4142 4430
4143 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 4431 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4144 &free_records, &insert); 4432 &free_records, &insert);
4145 if (status < 0) { 4433 if (status < 0) {
4146 mlog_errno(status); 4434 mlog_errno(status);
@@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4154 free_records, insert.ins_tree_depth); 4442 free_records, insert.ins_tree_depth);
4155 4443
4156 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4444 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4157 status = ocfs2_grow_tree(inode, handle, fe_bh, 4445 status = ocfs2_grow_tree(inode, handle, et,
4158 &insert.ins_tree_depth, &last_eb_bh, 4446 &insert.ins_tree_depth, &last_eb_bh,
4159 meta_ac); 4447 meta_ac);
4160 if (status) { 4448 if (status) {
@@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4164 } 4452 }
4165 4453
4166 /* Finally, we can add clusters. This might rotate the tree for us. */ 4454 /* Finally, we can add clusters. This might rotate the tree for us. */
4167 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 4455 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4168 if (status < 0) 4456 if (status < 0)
4169 mlog_errno(status); 4457 mlog_errno(status);
4170 else 4458 else if (et->et_ops == &ocfs2_dinode_et_ops)
4171 ocfs2_extent_map_insert_rec(inode, &rec); 4459 ocfs2_extent_map_insert_rec(inode, &rec);
4172 4460
4173bail: 4461bail:
4174 if (last_eb_bh) 4462 brelse(last_eb_bh);
4175 brelse(last_eb_bh); 4463
4464 mlog_exit(status);
4465 return status;
4466}
4467
4468/*
4469 * Allocate and add clusters into the extent b-tree.
4470 * The new clusters (clusters_to_add) will be inserted at logical_offset.
4471 * The extent b-tree's root is specified by et, and
4472 * it is not limited to file storage. Any extent tree can use this
4473 * function if it implements the proper ocfs2_extent_tree.
4474 */
4475int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4476 struct inode *inode,
4477 u32 *logical_offset,
4478 u32 clusters_to_add,
4479 int mark_unwritten,
4480 struct ocfs2_extent_tree *et,
4481 handle_t *handle,
4482 struct ocfs2_alloc_context *data_ac,
4483 struct ocfs2_alloc_context *meta_ac,
4484 enum ocfs2_alloc_restarted *reason_ret)
4485{
4486 int status = 0;
4487 int free_extents;
4488 enum ocfs2_alloc_restarted reason = RESTART_NONE;
4489 u32 bit_off, num_bits;
4490 u64 block;
4491 u8 flags = 0;
4492
4493 BUG_ON(!clusters_to_add);
4494
4495 if (mark_unwritten)
4496 flags = OCFS2_EXT_UNWRITTEN;
4497
4498 free_extents = ocfs2_num_free_extents(osb, inode, et);
4499 if (free_extents < 0) {
4500 status = free_extents;
4501 mlog_errno(status);
4502 goto leave;
4503 }
4504
4505 /* There are two cases which can cause us to return -EAGAIN in the
4506 * we-need-more-metadata case:
4507 * 1) we haven't reserved *any*
4508 * 2) we are so fragmented, we've needed to add metadata too
4509 * many times. */
4510 if (!free_extents && !meta_ac) {
4511 mlog(0, "we haven't reserved any metadata!\n");
4512 status = -EAGAIN;
4513 reason = RESTART_META;
4514 goto leave;
4515 } else if ((!free_extents)
4516 && (ocfs2_alloc_context_bits_left(meta_ac)
4517 < ocfs2_extend_meta_needed(et->et_root_el))) {
4518 mlog(0, "filesystem is really fragmented...\n");
4519 status = -EAGAIN;
4520 reason = RESTART_META;
4521 goto leave;
4522 }
4523
4524 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4525 clusters_to_add, &bit_off, &num_bits);
4526 if (status < 0) {
4527 if (status != -ENOSPC)
4528 mlog_errno(status);
4529 goto leave;
4530 }
4176 4531
4532 BUG_ON(num_bits > clusters_to_add);
4533
4534 /* reserve our write early -- insert_extent may update the inode */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4536 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) {
4538 mlog_errno(status);
4539 goto leave;
4540 }
4541
4542 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4543 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4544 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4545 status = ocfs2_insert_extent(osb, handle, inode, et,
4546 *logical_offset, block,
4547 num_bits, flags, meta_ac);
4548 if (status < 0) {
4549 mlog_errno(status);
4550 goto leave;
4551 }
4552
4553 status = ocfs2_journal_dirty(handle, et->et_root_bh);
4554 if (status < 0) {
4555 mlog_errno(status);
4556 goto leave;
4557 }
4558
4559 clusters_to_add -= num_bits;
4560 *logical_offset += num_bits;
4561
4562 if (clusters_to_add) {
4563 mlog(0, "need to alloc once more, wanted = %u\n",
4564 clusters_to_add);
4565 status = -EAGAIN;
4566 reason = RESTART_TRANS;
4567 }
4568
4569leave:
4177 mlog_exit(status); 4570 mlog_exit(status);
4571 if (reason_ret)
4572 *reason_ret = reason;
4178 return status; 4573 return status;
4179} 4574}
4180 4575
@@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4201static int ocfs2_split_and_insert(struct inode *inode, 4596static int ocfs2_split_and_insert(struct inode *inode,
4202 handle_t *handle, 4597 handle_t *handle,
4203 struct ocfs2_path *path, 4598 struct ocfs2_path *path,
4204 struct buffer_head *di_bh, 4599 struct ocfs2_extent_tree *et,
4205 struct buffer_head **last_eb_bh, 4600 struct buffer_head **last_eb_bh,
4206 int split_index, 4601 int split_index,
4207 struct ocfs2_extent_rec *orig_split_rec, 4602 struct ocfs2_extent_rec *orig_split_rec,
@@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
4215 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4610 struct ocfs2_extent_rec split_rec = *orig_split_rec;
4216 struct ocfs2_insert_type insert; 4611 struct ocfs2_insert_type insert;
4217 struct ocfs2_extent_block *eb; 4612 struct ocfs2_extent_block *eb;
4218 struct ocfs2_dinode *di;
4219 4613
4220leftright: 4614leftright:
4221 /* 4615 /*
@@ -4224,8 +4618,7 @@ leftright:
4224 */ 4618 */
4225 rec = path_leaf_el(path)->l_recs[split_index]; 4619 rec = path_leaf_el(path)->l_recs[split_index];
4226 4620
4227 di = (struct ocfs2_dinode *)di_bh->b_data; 4621 rightmost_el = et->et_root_el;
4228 rightmost_el = &di->id2.i_list;
4229 4622
4230 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4623 depth = le16_to_cpu(rightmost_el->l_tree_depth);
4231 if (depth) { 4624 if (depth) {
@@ -4236,8 +4629,8 @@ leftright:
4236 4629
4237 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4630 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4238 le16_to_cpu(rightmost_el->l_count)) { 4631 le16_to_cpu(rightmost_el->l_count)) {
4239 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, 4632 ret = ocfs2_grow_tree(inode, handle, et,
4240 meta_ac); 4633 &depth, last_eb_bh, meta_ac);
4241 if (ret) { 4634 if (ret) {
4242 mlog_errno(ret); 4635 mlog_errno(ret);
4243 goto out; 4636 goto out;
@@ -4274,8 +4667,7 @@ leftright:
4274 do_leftright = 1; 4667 do_leftright = 1;
4275 } 4668 }
4276 4669
4277 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, 4670 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4278 &insert);
4279 if (ret) { 4671 if (ret) {
4280 mlog_errno(ret); 4672 mlog_errno(ret);
4281 goto out; 4673 goto out;
@@ -4317,8 +4709,9 @@ out:
4317 * of the tree is required. All other cases will degrade into a less 4709 * of the tree is required. All other cases will degrade into a less
4318 * optimal tree layout. 4710 * optimal tree layout.
4319 * 4711 *
4320 * last_eb_bh should be the rightmost leaf block for any inode with a 4712 * last_eb_bh should be the rightmost leaf block for any extent
4321 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. 4713 * btree. Since a split may grow the tree or a merge might shrink it,
4714 * the caller cannot trust the contents of that buffer after this call.
4322 * 4715 *
4323 * This code is optimized for readability - several passes might be 4716 * This code is optimized for readability - several passes might be
4324 * made over certain portions of the tree. All of those blocks will 4717 * made over certain portions of the tree. All of those blocks will
@@ -4326,7 +4719,7 @@ out:
4326 * extra overhead is not expressed in terms of disk reads. 4719 * extra overhead is not expressed in terms of disk reads.
4327 */ 4720 */
4328static int __ocfs2_mark_extent_written(struct inode *inode, 4721static int __ocfs2_mark_extent_written(struct inode *inode,
4329 struct buffer_head *di_bh, 4722 struct ocfs2_extent_tree *et,
4330 handle_t *handle, 4723 handle_t *handle,
4331 struct ocfs2_path *path, 4724 struct ocfs2_path *path,
4332 int split_index, 4725 int split_index,
@@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4366 */ 4759 */
4367 if (path->p_tree_depth) { 4760 if (path->p_tree_depth) {
4368 struct ocfs2_extent_block *eb; 4761 struct ocfs2_extent_block *eb;
4369 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4370 4762
4371 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4372 le64_to_cpu(di->i_last_eb_blk), 4764 &last_eb_bh);
4373 &last_eb_bh, OCFS2_BH_CACHED, inode);
4374 if (ret) { 4765 if (ret) {
4375 mlog_exit(ret); 4766 mlog_exit(ret);
4376 goto out; 4767 goto out;
@@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4403 if (ctxt.c_split_covers_rec) 4794 if (ctxt.c_split_covers_rec)
4404 el->l_recs[split_index] = *split_rec; 4795 el->l_recs[split_index] = *split_rec;
4405 else 4796 else
4406 ret = ocfs2_split_and_insert(inode, handle, path, di_bh, 4797 ret = ocfs2_split_and_insert(inode, handle, path, et,
4407 &last_eb_bh, split_index, 4798 &last_eb_bh, split_index,
4408 split_rec, meta_ac); 4799 split_rec, meta_ac);
4409 if (ret) 4800 if (ret)
@@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4411 } else { 4802 } else {
4412 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4803 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4413 split_index, split_rec, 4804 split_index, split_rec,
4414 dealloc, &ctxt); 4805 dealloc, &ctxt, et);
4415 if (ret) 4806 if (ret)
4416 mlog_errno(ret); 4807 mlog_errno(ret);
4417 } 4808 }
@@ -4429,7 +4820,8 @@ out:
4429 * 4820 *
4430 * The caller is responsible for passing down meta_ac if we'll need it. 4821 * The caller is responsible for passing down meta_ac if we'll need it.
4431 */ 4822 */
4432int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 4823int ocfs2_mark_extent_written(struct inode *inode,
4824 struct ocfs2_extent_tree *et,
4433 handle_t *handle, u32 cpos, u32 len, u32 phys, 4825 handle_t *handle, u32 cpos, u32 len, u32 phys,
4434 struct ocfs2_alloc_context *meta_ac, 4826 struct ocfs2_alloc_context *meta_ac,
4435 struct ocfs2_cached_dealloc_ctxt *dealloc) 4827 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4455 /* 4847 /*
4456 * XXX: This should be fixed up so that we just re-insert the 4848 * XXX: This should be fixed up so that we just re-insert the
4457 * next extent records. 4849 * next extent records.
4850 *
4851 * XXX: This is a hack on the extent tree, maybe it should be
4852 * an op?
4458 */ 4853 */
4459 ocfs2_extent_map_trunc(inode, 0); 4854 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0);
4460 4856
4461 left_path = ocfs2_new_inode_path(di_bh); 4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4462 if (!left_path) { 4858 if (!left_path) {
4463 ret = -ENOMEM; 4859 ret = -ENOMEM;
4464 mlog_errno(ret); 4860 mlog_errno(ret);
@@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4489 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4885 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4490 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4886 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4491 4887
4492 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, 4888 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4493 index, &split_rec, meta_ac, dealloc); 4889 index, &split_rec, meta_ac,
4890 dealloc);
4494 if (ret) 4891 if (ret)
4495 mlog_errno(ret); 4892 mlog_errno(ret);
4496 4893
@@ -4499,13 +4896,12 @@ out:
4499 return ret; 4896 return ret;
4500} 4897}
4501 4898
4502static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, 4899static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4503 handle_t *handle, struct ocfs2_path *path, 4900 handle_t *handle, struct ocfs2_path *path,
4504 int index, u32 new_range, 4901 int index, u32 new_range,
4505 struct ocfs2_alloc_context *meta_ac) 4902 struct ocfs2_alloc_context *meta_ac)
4506{ 4903{
4507 int ret, depth, credits = handle->h_buffer_credits; 4904 int ret, depth, credits = handle->h_buffer_credits;
4508 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4509 struct buffer_head *last_eb_bh = NULL; 4905 struct buffer_head *last_eb_bh = NULL;
4510 struct ocfs2_extent_block *eb; 4906 struct ocfs2_extent_block *eb;
4511 struct ocfs2_extent_list *rightmost_el, *el; 4907 struct ocfs2_extent_list *rightmost_el, *el;
@@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4522 4918
4523 depth = path->p_tree_depth; 4919 depth = path->p_tree_depth;
4524 if (depth > 0) { 4920 if (depth > 0) {
4525 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4526 le64_to_cpu(di->i_last_eb_blk), 4922 &last_eb_bh);
4527 &last_eb_bh, OCFS2_BH_CACHED, inode);
4528 if (ret < 0) { 4923 if (ret < 0) {
4529 mlog_errno(ret); 4924 mlog_errno(ret);
4530 goto out; 4925 goto out;
@@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4535 } else 4930 } else
4536 rightmost_el = path_leaf_el(path); 4931 rightmost_el = path_leaf_el(path);
4537 4932
4538 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); 4933 credits += path->p_tree_depth +
4934 ocfs2_extend_meta_needed(et->et_root_el);
4539 ret = ocfs2_extend_trans(handle, credits); 4935 ret = ocfs2_extend_trans(handle, credits);
4540 if (ret) { 4936 if (ret) {
4541 mlog_errno(ret); 4937 mlog_errno(ret);
@@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4544 4940
4545 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4941 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4546 le16_to_cpu(rightmost_el->l_count)) { 4942 le16_to_cpu(rightmost_el->l_count)) {
4547 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, 4943 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4548 meta_ac); 4944 meta_ac);
4549 if (ret) { 4945 if (ret) {
4550 mlog_errno(ret); 4946 mlog_errno(ret);
@@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4558 insert.ins_split = SPLIT_RIGHT; 4954 insert.ins_split = SPLIT_RIGHT;
4559 insert.ins_tree_depth = depth; 4955 insert.ins_tree_depth = depth;
4560 4956
4561 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); 4957 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4562 if (ret) 4958 if (ret)
4563 mlog_errno(ret); 4959 mlog_errno(ret);
4564 4960
@@ -4570,7 +4966,8 @@ out:
4570static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 4966static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4571 struct ocfs2_path *path, int index, 4967 struct ocfs2_path *path, int index,
4572 struct ocfs2_cached_dealloc_ctxt *dealloc, 4968 struct ocfs2_cached_dealloc_ctxt *dealloc,
4573 u32 cpos, u32 len) 4969 u32 cpos, u32 len,
4970 struct ocfs2_extent_tree *et)
4574{ 4971{
4575 int ret; 4972 int ret;
4576 u32 left_cpos, rec_range, trunc_range; 4973 u32 left_cpos, rec_range, trunc_range;
@@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4582 struct ocfs2_extent_block *eb; 4979 struct ocfs2_extent_block *eb;
4583 4980
4584 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 4981 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4585 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 4982 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4586 if (ret) { 4983 if (ret) {
4587 mlog_errno(ret); 4984 mlog_errno(ret);
4588 goto out; 4985 goto out;
@@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4713 5110
4714 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5111 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4715 5112
4716 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 5113 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4717 if (ret) { 5114 if (ret) {
4718 mlog_errno(ret); 5115 mlog_errno(ret);
4719 goto out; 5116 goto out;
@@ -4724,7 +5121,8 @@ out:
4724 return ret; 5121 return ret;
4725} 5122}
4726 5123
4727int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 5124int ocfs2_remove_extent(struct inode *inode,
5125 struct ocfs2_extent_tree *et,
4728 u32 cpos, u32 len, handle_t *handle, 5126 u32 cpos, u32 len, handle_t *handle,
4729 struct ocfs2_alloc_context *meta_ac, 5127 struct ocfs2_alloc_context *meta_ac,
4730 struct ocfs2_cached_dealloc_ctxt *dealloc) 5128 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4733 u32 rec_range, trunc_range; 5131 u32 rec_range, trunc_range;
4734 struct ocfs2_extent_rec *rec; 5132 struct ocfs2_extent_rec *rec;
4735 struct ocfs2_extent_list *el; 5133 struct ocfs2_extent_list *el;
4736 struct ocfs2_path *path; 5134 struct ocfs2_path *path = NULL;
4737 5135
4738 ocfs2_extent_map_trunc(inode, 0); 5136 ocfs2_extent_map_trunc(inode, 0);
4739 5137
4740 path = ocfs2_new_inode_path(di_bh); 5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4741 if (!path) { 5139 if (!path) {
4742 ret = -ENOMEM; 5140 ret = -ENOMEM;
4743 mlog_errno(ret); 5141 mlog_errno(ret);
@@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4790 5188
4791 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5189 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4792 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5190 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4793 cpos, len); 5191 cpos, len, et);
4794 if (ret) { 5192 if (ret) {
4795 mlog_errno(ret); 5193 mlog_errno(ret);
4796 goto out; 5194 goto out;
4797 } 5195 }
4798 } else { 5196 } else {
4799 ret = ocfs2_split_tree(inode, di_bh, handle, path, index, 5197 ret = ocfs2_split_tree(inode, et, handle, path, index,
4800 trunc_range, meta_ac); 5198 trunc_range, meta_ac);
4801 if (ret) { 5199 if (ret) {
4802 mlog_errno(ret); 5200 mlog_errno(ret);
@@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4845 } 5243 }
4846 5244
4847 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5245 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4848 cpos, len); 5246 cpos, len, et);
4849 if (ret) { 5247 if (ret) {
4850 mlog_errno(ret); 5248 mlog_errno(ret);
4851 goto out; 5249 goto out;
@@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5188 goto bail; 5586 goto bail;
5189 } 5587 }
5190 5588
5191 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
5192 OCFS2_BH_CACHED, inode);
5193 if (status < 0) { 5590 if (status < 0) {
5194 iput(inode); 5591 iput(inode);
5195 mlog_errno(status); 5592 mlog_errno(status);
@@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5264bail: 5661bail:
5265 if (tl_inode) 5662 if (tl_inode)
5266 iput(tl_inode); 5663 iput(tl_inode);
5267 if (tl_bh) 5664 brelse(tl_bh);
5268 brelse(tl_bh);
5269 5665
5270 if (status < 0 && (*tl_copy)) { 5666 if (status < 0 && (*tl_copy)) {
5271 kfree(*tl_copy); 5667 kfree(*tl_copy);
@@ -6008,20 +6404,13 @@ bail:
6008 return status; 6404 return status;
6009} 6405}
6010 6406
6011static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 6407static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6012{ 6408{
6013 set_buffer_uptodate(bh); 6409 set_buffer_uptodate(bh);
6014 mark_buffer_dirty(bh); 6410 mark_buffer_dirty(bh);
6015 return 0; 6411 return 0;
6016} 6412}
6017 6413
6018static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6019{
6020 set_buffer_uptodate(bh);
6021 mark_buffer_dirty(bh);
6022 return ocfs2_journal_dirty_data(handle, bh);
6023}
6024
6025static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6414static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6026 unsigned int from, unsigned int to, 6415 unsigned int from, unsigned int to,
6027 struct page *page, int zero, u64 *phys) 6416 struct page *page, int zero, u64 *phys)
@@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6040 * here if they aren't - ocfs2_map_page_blocks() 6429 * here if they aren't - ocfs2_map_page_blocks()
6041 * might've skipped some 6430 * might've skipped some
6042 */ 6431 */
6043 if (ocfs2_should_order_data(inode)) { 6432 ret = walk_page_buffers(handle, page_buffers(page),
6044 ret = walk_page_buffers(handle, 6433 from, to, &partial,
6045 page_buffers(page), 6434 ocfs2_zero_func);
6046 from, to, &partial, 6435 if (ret < 0)
6047 ocfs2_ordered_zero_func); 6436 mlog_errno(ret);
6048 if (ret < 0) 6437 else if (ocfs2_should_order_data(inode)) {
6049 mlog_errno(ret); 6438 ret = ocfs2_jbd2_file_inode(handle, inode);
6050 } else { 6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6051 ret = walk_page_buffers(handle, page_buffers(page), 6440 ret = walk_page_buffers(handle, page_buffers(page),
6052 from, to, &partial, 6441 from, to, &partial,
6053 ocfs2_writeback_zero_func); 6442 ocfs2_journal_dirty_data);
6443#endif
6054 if (ret < 0) 6444 if (ret < 0)
6055 mlog_errno(ret); 6445 mlog_errno(ret);
6056 } 6446 }
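
With jbd2, ordered data is handled by filing the inode with the journal once per transaction instead of walking and dirtying each buffer; the per-buffer walk survives only under CONFIG_OCFS2_COMPAT_JBD. The helper is not defined in this hunk; assuming it wraps jbd2's inode-filing API, it presumably has roughly this shape:

    /* Hypothetical shape of the helper used above -- its real definition
     * is outside this hunk.  jbd2 tracks ordered-mode inodes via a
     * struct jbd2_inode embedded in the filesystem's inode. */
    static inline int ocfs2_jbd2_file_inode(handle_t *handle,
                                            struct inode *inode)
    {
            return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
    }
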
@@ -6215,20 +6605,29 @@ out:
6215 return ret; 6605 return ret;
6216} 6606}
6217 6607
6218static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) 6608static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6609 struct ocfs2_dinode *di)
6219{ 6610{
6220 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6611 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6612 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6221 6613
6222 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); 6614 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6615 memset(&di->id2, 0, blocksize -
6616 offsetof(struct ocfs2_dinode, id2) -
6617 xattrsize);
6618 else
6619 memset(&di->id2, 0, blocksize -
6620 offsetof(struct ocfs2_dinode, id2));
6223} 6621}
6224 6622
6225void ocfs2_dinode_new_extent_list(struct inode *inode, 6623void ocfs2_dinode_new_extent_list(struct inode *inode,
6226 struct ocfs2_dinode *di) 6624 struct ocfs2_dinode *di)
6227{ 6625{
6228 ocfs2_zero_dinode_id2(inode, di); 6626 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6229 di->id2.i_list.l_tree_depth = 0; 6627 di->id2.i_list.l_tree_depth = 0;
6230 di->id2.i_list.l_next_free_rec = 0; 6628 di->id2.i_list.l_next_free_rec = 0;
6231 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); 6629 di->id2.i_list.l_count = cpu_to_le16(
6630 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6232} 6631}
6233 6632
6234void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6633void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6245 * We clear the entire i_data structure here so that all 6644 * We clear the entire i_data structure here so that all
6246 * fields can be properly initialized. 6645 * fields can be properly initialized.
6247 */ 6646 */
6248 ocfs2_zero_dinode_id2(inode, di); 6647 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6249 6648
6250 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); 6649 idata->id_count = cpu_to_le16(
6650 ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6251} 6651}
6252 6652
6253int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6653int ocfs2_convert_inline_data_to_extents(struct inode *inode,
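
The xattr-aware zeroing simply shortens the memset() by the inline xattr area at the end of the block. A quick worked example with hypothetical numbers (the offsetof value and sizes below are illustrative only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int blocksize = 4096; /* hypothetical block size */
            unsigned int id2_off   = 176;  /* hypothetical offsetof(struct ocfs2_dinode, id2) */
            unsigned int xattrsize = 256;  /* hypothetical i_xattr_inline_size */

            /* Without inline xattrs: zero through the end of the block. */
            printf("plain:        memset %u bytes\n", blocksize - id2_off);
            /* With OCFS2_INLINE_XATTR_FL set: stop short of the xattr area. */
            printf("inline xattr: memset %u bytes\n",
                   blocksize - id2_off - xattrsize);
            return 0;
    }
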
@@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6262 struct ocfs2_alloc_context *data_ac = NULL; 6662 struct ocfs2_alloc_context *data_ac = NULL;
6263 struct page **pages = NULL; 6663 struct page **pages = NULL;
6264 loff_t end = osb->s_clustersize; 6664 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et;
6265 6666
6266 has_data = i_size_read(inode) ? 1 : 0; 6667 has_data = i_size_read(inode) ? 1 : 0;
6267 6668
@@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6361 * this proves to be false, we could always re-build 6762 * this proves to be false, we could always re-build
6362 * the in-inode data from our pages. 6763 * the in-inode data from our pages.
6363 */ 6764 */
6364 ret = ocfs2_insert_extent(osb, handle, inode, di_bh, 6765 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6766 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6365 0, block, 1, 0, NULL); 6767 0, block, 1, 0, NULL);
6366 if (ret) { 6768 if (ret) {
6367 mlog_errno(ret); 6769 mlog_errno(ret);
@@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6404 handle_t *handle = NULL; 6806 handle_t *handle = NULL;
6405 struct inode *tl_inode = osb->osb_tl_inode; 6807 struct inode *tl_inode = osb->osb_tl_inode;
6406 struct ocfs2_path *path = NULL; 6808 struct ocfs2_path *path = NULL;
6809 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6407 6810
6408 mlog_entry_void(); 6811 mlog_entry_void();
6409 6812
6410 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6411 i_size_read(inode)); 6814 i_size_read(inode));
6412 6815
6413 path = ocfs2_new_inode_path(fe_bh); 6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6414 if (!path) { 6817 if (!path) {
6415 status = -ENOMEM; 6818 status = -ENOMEM;
6416 mlog_errno(status); 6819 mlog_errno(status);
@@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6581 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6582 6985
6583 if (fe->id2.i_list.l_tree_depth) { 6986 if (fe->id2.i_list.l_tree_depth) {
6584 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
6585 &last_eb_bh, OCFS2_BH_CACHED, inode); 6988 &last_eb_bh);
6586 if (status < 0) { 6989 if (status < 0) {
6587 mlog_errno(status); 6990 mlog_errno(status);
6588 goto bail; 6991 goto bail;
@@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6695 mlog(ML_NOTICE, 7098 mlog(ML_NOTICE,
6696 "Truncate completion has non-empty dealloc context\n"); 7099 "Truncate completion has non-empty dealloc context\n");
6697 7100
6698 if (tc->tc_last_eb_bh) 7101 brelse(tc->tc_last_eb_bh);
6699 brelse(tc->tc_last_eb_bh);
6700 7102
6701 kfree(tc); 7103 kfree(tc);
6702} 7104}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..70257c84cfbe 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
26#ifndef OCFS2_ALLOC_H 26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H 27#define OCFS2_ALLOC_H
28 28
29
30/*
31 * For an xattr tree leaf, we limit the leaf byte size to 64K.
32 */
33#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
34
35/*
36 * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
37 * the b-tree operations in ocfs2. The b-tree operations are no longer
38 * limited to ocfs2_dinode: any data that needs to allocate clusters for
39 * storage can use a b-tree, provided it implements its own
40 * ocfs2_extent_tree and operations.
41 *
42 * ocfs2_extent_tree becomes the first-class object for extent tree
43 * manipulation. Callers of the alloc.c code need to fill it via one of
44 * the ocfs2_init_*_extent_tree() operations below.
45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree; it must have
47 * a root ocfs2_extent_list and a root_bh so that they can be used in the
48 * b-tree functions.
49 * ocfs2_extent_tree_operations abstracts the normal operations we do on
50 * the root of the extent b-tree.
51 */
52struct ocfs2_extent_tree_operations;
53struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el;
57 void *et_object;
58 unsigned int et_max_leaf_clusters;
59};
60
61/*
62 * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
63 * specified object buffer.
64 */
65void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
66 struct inode *inode,
67 struct buffer_head *bh);
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode,
70 struct buffer_head *bh);
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode,
73 struct buffer_head *bh,
74 struct ocfs2_xattr_value_root *xv);
75
29struct ocfs2_alloc_context; 76struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb, 77int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 78 handle_t *handle,
32 struct inode *inode, 79 struct inode *inode,
33 struct buffer_head *fe_bh, 80 struct ocfs2_extent_tree *et,
34 u32 cpos, 81 u32 cpos,
35 u64 start_blk, 82 u64 start_blk,
36 u32 new_clusters, 83 u32 new_clusters,
37 u8 flags, 84 u8 flags,
38 struct ocfs2_alloc_context *meta_ac); 85 struct ocfs2_alloc_context *meta_ac);
86
87enum ocfs2_alloc_restarted {
88 RESTART_NONE = 0,
89 RESTART_TRANS,
90 RESTART_META
91};
92int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
93 struct inode *inode,
94 u32 *logical_offset,
95 u32 clusters_to_add,
96 int mark_unwritten,
97 struct ocfs2_extent_tree *et,
98 handle_t *handle,
99 struct ocfs2_alloc_context *data_ac,
100 struct ocfs2_alloc_context *meta_ac,
101 enum ocfs2_alloc_restarted *reason_ret);
39struct ocfs2_cached_dealloc_ctxt; 102struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 103int ocfs2_mark_extent_written(struct inode *inode,
104 struct ocfs2_extent_tree *et,
41 handle_t *handle, u32 cpos, u32 len, u32 phys, 105 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac, 106 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc); 107 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 108int ocfs2_remove_extent(struct inode *inode,
109 struct ocfs2_extent_tree *et,
45 u32 cpos, u32 len, handle_t *handle, 110 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac, 111 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc); 112 struct ocfs2_cached_dealloc_ctxt *dealloc);
48int ocfs2_num_free_extents(struct ocfs2_super *osb, 113int ocfs2_num_free_extents(struct ocfs2_super *osb,
49 struct inode *inode, 114 struct inode *inode,
50 struct ocfs2_dinode *fe); 115 struct ocfs2_extent_tree *et);
51/* how many new metadata chunks would an allocation need at maximum? */ 116
52static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) 117/*
118 * how many new metadata chunks would an allocation need at maximum?
119 *
120 * Please note that the caller must make sure that root_el is the root
121 * of the extent tree. So for an inode, it should be &fe->id2.i_list;
122 * otherwise the result may be wrong.
123 */
124static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
53{ 125{
54 /* 126 /*
55 * Rather than do all the work of determining how much we need 127 * Rather than do all the work of determining how much we need
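
Taken together, the new calling convention is: fill a stack-allocated ocfs2_extent_tree from the object's buffer, then hand it to the alloc.c entry points. A minimal sketch mirroring the converted call sites elsewhere in this patch (example_insert itself is illustrative):

    static int example_insert(struct ocfs2_super *osb, handle_t *handle,
                              struct inode *inode, struct buffer_head *di_bh,
                              u32 cpos, u64 start_blk, u32 clusters,
                              struct ocfs2_alloc_context *meta_ac)
    {
            struct ocfs2_extent_tree et;

            /* Fill the tree from the dinode's root extent list... */
            ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
            /* ...then every alloc.c operation takes the et, not di_bh. */
            return ocfs2_insert_extent(osb, handle, inode, &et, cpos,
                                       start_blk, clusters, 0, meta_ac);
    }
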
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
59 * new tree_depth==0 extent_block, and one block at the new 131 * new tree_depth==0 extent_block, and one block at the new
60 * top-of-the tree. 132 * top-of-the tree.
61 */ 133 */
62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; 134 return le16_to_cpu(root_el->l_tree_depth) + 2;
63} 135}
64 136
65void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); 137void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
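
For example, a root_el with l_tree_depth == 2 reserves at most 2 + 2 = 4 blocks: one new extent block per existing tree level, plus the new tree_depth == 0 leaf and the one block at the new top of the tree.
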
@@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
146 return le16_to_cpu(rec->e_leaf_clusters); 218 return le16_to_cpu(rec->e_leaf_clusters);
147} 219}
148 220
221/*
222 * This is only valid for leaf nodes, which are the only ones that can
223 * have empty extents anyway.
224 */
225static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
226{
227 return !rec->e_leaf_clusters;
228}
229
149#endif /* OCFS2_ALLOC_H */ 230#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb5078..c22543b33420 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 71 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
72 OCFS2_I(inode)->ip_blkno,
73 &bh, OCFS2_BH_CACHED, inode);
74 if (status < 0) { 72 if (status < 0) {
75 mlog_errno(status); 73 mlog_errno(status);
76 goto bail; 74 goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
128 err = 0; 126 err = 0;
129 127
130bail: 128bail:
131 if (bh) 129 brelse(bh);
132 brelse(bh);
133 130
134 mlog_exit(err); 131 mlog_exit(err);
135 return err; 132 return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
261{ 258{
262 int ret; 259 int ret;
263 struct buffer_head *di_bh = NULL; 260 struct buffer_head *di_bh = NULL;
264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
265 261
266 BUG_ON(!PageLocked(page)); 262 BUG_ON(!PageLocked(page));
267 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 263 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
268 264
269 ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, 265 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
270 OCFS2_BH_CACHED, inode);
271 if (ret) { 266 if (ret) {
272 mlog_errno(ret); 267 mlog_errno(ret);
273 goto out; 268 goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
485 } 480 }
486 481
487 if (ocfs2_should_order_data(inode)) { 482 if (ocfs2_should_order_data(inode)) {
483 ret = ocfs2_jbd2_file_inode(handle, inode);
484#ifdef CONFIG_OCFS2_COMPAT_JBD
488 ret = walk_page_buffers(handle, 485 ret = walk_page_buffers(handle,
489 page_buffers(page), 486 page_buffers(page),
490 from, to, NULL, 487 from, to, NULL,
491 ocfs2_journal_dirty_data); 488 ocfs2_journal_dirty_data);
492 if (ret < 0) 489#endif
490 if (ret < 0)
493 mlog_errno(ret); 491 mlog_errno(ret);
494 } 492 }
495out: 493out:
@@ -594,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
594 goto bail; 592 goto bail;
595 } 593 }
596 594
597 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 595 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
598 ocfs2_error(inode->i_sb, 596 ocfs2_error(inode->i_sb,
599 "Inode %llu has a hole at block %llu\n", 597 "Inode %llu has a hole at block %llu\n",
600 (unsigned long long)OCFS2_I(inode)->ip_blkno, 598 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
669{ 667{
670 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 668 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
671 669
672 journal_invalidatepage(journal, page, offset); 670 jbd2_journal_invalidatepage(journal, page, offset);
673} 671}
674 672
675static int ocfs2_releasepage(struct page *page, gfp_t wait) 673static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
678 676
679 if (!page_has_buffers(page)) 677 if (!page_has_buffers(page))
680 return 0; 678 return 0;
681 return journal_try_to_free_buffers(journal, page, wait); 679 return jbd2_journal_try_to_free_buffers(journal, page, wait);
682} 680}
683 681
684static ssize_t ocfs2_direct_IO(int rw, 682static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
1074 tmppage = wc->w_pages[i]; 1072 tmppage = wc->w_pages[i];
1075 1073
1076 if (page_has_buffers(tmppage)) { 1074 if (page_has_buffers(tmppage)) {
1077 if (ocfs2_should_order_data(inode)) 1075 if (ocfs2_should_order_data(inode)) {
1076 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1077#ifdef CONFIG_OCFS2_COMPAT_JBD
1078 walk_page_buffers(wc->w_handle, 1078 walk_page_buffers(wc->w_handle,
1079 page_buffers(tmppage), 1079 page_buffers(tmppage),
1080 from, to, NULL, 1080 from, to, NULL,
1081 ocfs2_journal_dirty_data); 1081 ocfs2_journal_dirty_data);
1082#endif
1083 }
1082 1084
1083 block_commit_write(tmppage, from, to); 1085 block_commit_write(tmppage, from, to);
1084 } 1086 }
@@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1242 int ret, i, new, should_zero = 0; 1244 int ret, i, new, should_zero = 0;
1243 u64 v_blkno, p_blkno; 1245 u64 v_blkno, p_blkno;
1244 struct inode *inode = mapping->host; 1246 struct inode *inode = mapping->host;
1247 struct ocfs2_extent_tree et;
1245 1248
1246 new = phys == 0 ? 1 : 0; 1249 new = phys == 0 ? 1 : 0;
1247 if (new || unwritten) 1250 if (new || unwritten)
@@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1255 * any additional semaphores or cluster locks. 1258 * any additional semaphores or cluster locks.
1256 */ 1259 */
1257 tmp_pos = cpos; 1260 tmp_pos = cpos;
1258 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1261 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1259 &tmp_pos, 1, 0, wc->w_di_bh, 1262 &tmp_pos, 1, 0, wc->w_di_bh,
1260 wc->w_handle, data_ac, 1263 wc->w_handle, data_ac,
1261 meta_ac, NULL); 1264 meta_ac, NULL);
1262 /* 1265 /*
1263 * This shouldn't happen because we must have already 1266 * This shouldn't happen because we must have already
1264 * calculated the correct meta data allocation required. The 1267 * calculated the correct meta data allocation required. The
@@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1276 goto out; 1279 goto out;
1277 } 1280 }
1278 } else if (unwritten) { 1281 } else if (unwritten) {
1279 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, 1282 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1283 ret = ocfs2_mark_extent_written(inode, &et,
1280 wc->w_handle, cpos, 1, phys, 1284 wc->w_handle, cpos, 1, phys,
1281 meta_ac, &wc->w_dealloc); 1285 meta_ac, &wc->w_dealloc);
1282 if (ret < 0) { 1286 if (ret < 0) {
@@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1665 struct ocfs2_alloc_context *data_ac = NULL; 1669 struct ocfs2_alloc_context *data_ac = NULL;
1666 struct ocfs2_alloc_context *meta_ac = NULL; 1670 struct ocfs2_alloc_context *meta_ac = NULL;
1667 handle_t *handle; 1671 handle_t *handle;
1672 struct ocfs2_extent_tree et;
1668 1673
1669 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1674 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1670 if (ret) { 1675 if (ret) {
@@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1712 * ocfs2_lock_allocators(). It greatly over-estimates 1717 * ocfs2_lock_allocators(). It greatly over-estimates
1713 * the work to be done. 1718 * the work to be done.
1714 */ 1719 */
1715 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, 1720 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1716 extents_to_split, &data_ac, &meta_ac); 1721 " clusters_to_add = %u, extents_to_split = %u\n",
1722 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1723 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1724 clusters_to_alloc, extents_to_split);
1725
1726 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1727 ret = ocfs2_lock_allocators(inode, &et,
1728 clusters_to_alloc, extents_to_split,
1729 &data_ac, &meta_ac);
1717 if (ret) { 1730 if (ret) {
1718 mlog_errno(ret); 1731 mlog_errno(ret);
1719 goto out; 1732 goto out;
1720 } 1733 }
1721 1734
1722 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1735 credits = ocfs2_calc_extend_credits(inode->i_sb,
1736 &di->id2.i_list,
1723 clusters_to_alloc); 1737 clusters_to_alloc);
1724 1738
1725 } 1739 }
@@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1905 } 1919 }
1906 1920
1907 if (page_has_buffers(tmppage)) { 1921 if (page_has_buffers(tmppage)) {
1908 if (ocfs2_should_order_data(inode)) 1922 if (ocfs2_should_order_data(inode)) {
1923 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1924#ifdef CONFIG_OCFS2_COMPAT_JBD
1909 walk_page_buffers(wc->w_handle, 1925 walk_page_buffers(wc->w_handle,
1910 page_buffers(tmppage), 1926 page_buffers(tmppage),
1911 from, to, NULL, 1927 from, to, NULL,
1912 ocfs2_journal_dirty_data); 1928 ocfs2_journal_dirty_data);
1929#endif
1930 }
1913 block_commit_write(tmppage, from, to); 1931 block_commit_write(tmppage, from, to);
1914 } 1932 }
1915 } 1933 }
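
The aops.c hunks above move the ordered-data write path from JBD's per-buffer journal_dirty_data() to JBD2's per-inode ordering: the inode is attached to the running transaction once, and the old buffer walk survives only under CONFIG_OCFS2_COMPAT_JBD. A minimal sketch of the resulting shape, assuming ocfs2_jbd2_file_inode() is the thin compatibility wrapper this series introduces (the helper name ocfs2_order_page_data is illustrative; error handling elided):

static void ocfs2_order_page_data(handle_t *handle, struct inode *inode,
                                  struct page *page, unsigned from,
                                  unsigned to)
{
        if (!page_has_buffers(page))
                return;

        if (ocfs2_should_order_data(inode)) {
                /* JBD2: file the whole inode on the transaction; its
                 * dirty data is written out before the commit record. */
                ocfs2_jbd2_file_inode(handle, inode);
#ifdef CONFIG_OCFS2_COMPAT_JBD
                /* Legacy JBD: order each buffer individually. */
                walk_page_buffers(handle, page_buffers(page),
                                  from, to, NULL,
                                  ocfs2_journal_dirty_data);
#endif
        }
        block_commit_write(page, from, to);
}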
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b41..7e947c672469 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
66 /* remove from dirty list before I/O. */ 66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh); 67 clear_buffer_dirty(bh);
68 68
69 get_bh(bh); /* for end_buffer_write_sync() */ 69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync; 70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh); 71 submit_bh(WRITE, bh);
72 72
@@ -88,22 +88,103 @@ out:
88 return ret; 88 return ret;
89} 89}
90 90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, 91int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
92 struct buffer_head *bhs[], int flags, 92 unsigned int nr, struct buffer_head *bhs[])
93 struct inode *inode) 93{
94 int status = 0;
95 unsigned int i;
96 struct buffer_head *bh;
97
98 if (!nr) {
99 mlog(ML_BH_IO, "No buffers will be read!\n");
100 goto bail;
101 }
102
103 for (i = 0 ; i < nr ; i++) {
104 if (bhs[i] == NULL) {
105 bhs[i] = sb_getblk(osb->sb, block++);
106 if (bhs[i] == NULL) {
107 status = -EIO;
108 mlog_errno(status);
109 goto bail;
110 }
111 }
112 bh = bhs[i];
113
114 if (buffer_jbd(bh)) {
115 mlog(ML_ERROR,
116 "trying to sync read a jbd "
117 "managed bh (blocknr = %llu), skipping\n",
118 (unsigned long long)bh->b_blocknr);
119 continue;
120 }
121
122 if (buffer_dirty(bh)) {
123 /* This should probably be a BUG, or
124 * at least return an error. */
125 mlog(ML_ERROR,
126 "trying to sync read a dirty "
127 "buffer! (blocknr = %llu), skipping\n",
128 (unsigned long long)bh->b_blocknr);
129 continue;
130 }
131
132 lock_buffer(bh);
133 if (buffer_jbd(bh)) {
134 mlog(ML_ERROR,
135 "block %llu had the JBD bit set "
136 "while I was in lock_buffer!",
137 (unsigned long long)bh->b_blocknr);
138 BUG();
139 }
140
141 clear_buffer_uptodate(bh);
142 get_bh(bh); /* for end_buffer_read_sync() */
143 bh->b_end_io = end_buffer_read_sync;
144 submit_bh(READ, bh);
145 }
146
147 for (i = nr; i > 0; i--) {
148 bh = bhs[i - 1];
149
150 if (buffer_jbd(bh)) {
151 mlog(ML_ERROR,
152 "the journal got the buffer while it was "
153 "locked for io! (blocknr = %llu)\n",
154 (unsigned long long)bh->b_blocknr);
155 BUG();
156 }
157
158 wait_on_buffer(bh);
159 if (!buffer_uptodate(bh)) {
160 /* Status won't be cleared from here on out,
161 * so we can safely record this and loop back
162 * to cleanup the other buffers. */
163 status = -EIO;
164 put_bh(bh);
165 bhs[i - 1] = NULL;
166 }
167 }
168
169bail:
170 return status;
171}
172
173int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
174 struct buffer_head *bhs[], int flags)
94{ 175{
95 int status = 0; 176 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0; 177 int i, ignore_cache = 0;
98 struct buffer_head *bh; 178 struct buffer_head *bh;
99 179
100 mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", 180 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
101 (unsigned long long)block, nr, flags, inode); 181 inode, (unsigned long long)block, nr, flags);
102 182
183 BUG_ON(!inode);
103 BUG_ON((flags & OCFS2_BH_READAHEAD) && 184 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
104 (!inode || !(flags & OCFS2_BH_CACHED))); 185 (flags & OCFS2_BH_IGNORE_CACHE));
105 186
106 if (osb == NULL || osb->sb == NULL || bhs == NULL) { 187 if (bhs == NULL) {
107 status = -EINVAL; 188 status = -EINVAL;
108 mlog_errno(status); 189 mlog_errno(status);
109 goto bail; 190 goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
122 goto bail; 203 goto bail;
123 } 204 }
124 205
125 sb = osb->sb; 206 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
126
127 if (flags & OCFS2_BH_CACHED && !inode)
128 flags &= ~OCFS2_BH_CACHED;
129
130 if (inode)
131 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
132 for (i = 0 ; i < nr ; i++) { 207 for (i = 0 ; i < nr ; i++) {
133 if (bhs[i] == NULL) { 208 if (bhs[i] == NULL) {
134 bhs[i] = sb_getblk(sb, block++); 209 bhs[i] = sb_getblk(inode->i_sb, block++);
135 if (bhs[i] == NULL) { 210 if (bhs[i] == NULL) {
136 if (inode) 211 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
137 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
138 status = -EIO; 212 status = -EIO;
139 mlog_errno(status); 213 mlog_errno(status);
140 goto bail; 214 goto bail;
141 } 215 }
142 } 216 }
143 bh = bhs[i]; 217 bh = bhs[i];
144 ignore_cache = 0; 218 ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
145 219
146 /* There are three read-ahead cases here which we need to 220 /* There are three read-ahead cases here which we need to
147 * be concerned with. All three assume a buffer has 221 * be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
167 * before our is-it-in-flight check. 241 * before our is-it-in-flight check.
168 */ 242 */
169 243
170 if (flags & OCFS2_BH_CACHED && 244 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
171 !ocfs2_buffer_uptodate(inode, bh)) {
172 mlog(ML_UPTODATE, 245 mlog(ML_UPTODATE,
173 "bh (%llu), inode %llu not uptodate\n", 246 "bh (%llu), inode %llu not uptodate\n",
174 (unsigned long long)bh->b_blocknr, 247 (unsigned long long)bh->b_blocknr,
175 (unsigned long long)OCFS2_I(inode)->ip_blkno); 248 (unsigned long long)OCFS2_I(inode)->ip_blkno);
249 /* We're using ignore_cache here to say
250 * "go to disk" */
176 ignore_cache = 1; 251 ignore_cache = 1;
177 } 252 }
178 253
179 /* XXX: Can we ever get this and *not* have the cached 254 /* XXX: Can we ever get this and *not* have the cached
180 * flag set? */ 255 * flag set? */
181 if (buffer_jbd(bh)) { 256 if (buffer_jbd(bh)) {
182 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) 257 if (ignore_cache)
183 mlog(ML_BH_IO, "trying to sync read a jbd " 258 mlog(ML_BH_IO, "trying to sync read a jbd "
184 "managed bh (blocknr = %llu)\n", 259 "managed bh (blocknr = %llu)\n",
185 (unsigned long long)bh->b_blocknr); 260 (unsigned long long)bh->b_blocknr);
186 continue; 261 continue;
187 } 262 }
188 263
189 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { 264 if (ignore_cache) {
190 if (buffer_dirty(bh)) { 265 if (buffer_dirty(bh)) {
191 /* This should probably be a BUG, or 266 /* This should probably be a BUG, or
192 * at least return an error. */ 267 * at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
221 * previously read-ahead buffer may have 296 * previously read-ahead buffer may have
222 * completed I/O while we were waiting for the 297 * completed I/O while we were waiting for the
223 * buffer lock. */ 298 * buffer lock. */
224 if ((flags & OCFS2_BH_CACHED) 299 if (!(flags & OCFS2_BH_IGNORE_CACHE)
225 && !(flags & OCFS2_BH_READAHEAD) 300 && !(flags & OCFS2_BH_READAHEAD)
226 && ocfs2_buffer_uptodate(inode, bh)) { 301 && ocfs2_buffer_uptodate(inode, bh)) {
227 unlock_buffer(bh); 302 unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
265 /* Always set the buffer in the cache, even if it was 340 /* Always set the buffer in the cache, even if it was
266 * a forced read, or read-ahead which hasn't yet 341 * a forced read, or read-ahead which hasn't yet
267 * completed. */ 342 * completed. */
268 if (inode) 343 ocfs2_set_buffer_uptodate(inode, bh);
269 ocfs2_set_buffer_uptodate(inode, bh);
270 } 344 }
271 if (inode) 345 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
272 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
273 346
274 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 347 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
275 (unsigned long long)block, nr, 348 (unsigned long long)block, nr,
276 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); 349 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
350 flags);
277 351
278bail: 352bail:
279 353
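
The new ocfs2_read_blocks_sync() above batches its I/O: every buffer is locked and submitted before the function sleeps on any of them, and the waits then run in reverse submission order. A reduced sketch of that submit-all-then-wait pattern, as a hypothetical helper (the dirty-buffer and JBD sanity checks, plus error unwinding, live in the real function):

static int read_blocks_batched(struct super_block *sb, u64 block,
                               unsigned int nr, struct buffer_head *bhs[])
{
        unsigned int i;
        int status = 0;

        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i] ?: sb_getblk(sb, block + i);

                if (!bh)
                        return -EIO;
                bhs[i] = bh;
                lock_buffer(bh);
                clear_buffer_uptodate(bh);
                get_bh(bh);                     /* for end_buffer_read_sync() */
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh);            /* queue it, don't wait yet */
        }

        for (i = nr; i > 0; i--) {              /* wait only after all submits */
                wait_on_buffer(bhs[i - 1]);
                if (!buffer_uptodate(bhs[i - 1]))
                        status = -EIO;
        }
        return status;
}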
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e5..75e1dcb1ade7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh, 31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate); 32 int uptodate);
33 33
34static inline int ocfs2_read_block(struct ocfs2_super *osb, 34static inline int ocfs2_read_block(struct inode *inode,
35 u64 off, 35 u64 off,
36 struct buffer_head **bh, 36 struct buffer_head **bh);
37 int flags,
38 struct inode *inode);
39 37
40int ocfs2_write_block(struct ocfs2_super *osb, 38int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh, 39 struct buffer_head *bh,
42 struct inode *inode); 40 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb, 41int ocfs2_read_blocks(struct inode *inode,
44 u64 block, 42 u64 block,
45 int nr, 43 int nr,
46 struct buffer_head *bhs[], 44 struct buffer_head *bhs[],
47 int flags, 45 int flags);
48 struct inode *inode); 46int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
47 unsigned int nr, struct buffer_head *bhs[]);
49 48
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 49int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh); 50 struct buffer_head *bh);
52 51
53#define OCFS2_BH_CACHED 1 52#define OCFS2_BH_IGNORE_CACHE 1
54#define OCFS2_BH_READAHEAD 8 53#define OCFS2_BH_READAHEAD 8
55 54
56static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, 55static inline int ocfs2_read_block(struct inode *inode, u64 off,
57 struct buffer_head **bh, int flags, 56 struct buffer_head **bh)
58 struct inode *inode)
59{ 57{
60 int status = 0; 58 int status = 0;
61 59
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
65 goto bail; 63 goto bail;
66 } 64 }
67 65
68 status = ocfs2_read_blocks(osb, off, 1, bh, 66 status = ocfs2_read_blocks(inode, off, 1, bh, 0);
69 flags, inode);
70 67
71bail: 68bail:
72 return status; 69 return status;
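
With this header change the cache flag flips polarity: caching is now the default and OCFS2_BH_IGNORE_CACHE opts out, where OCFS2_BH_CACHED used to opt in. Call sites reduce to three shapes (sketch; error handling elided, and blkno, nr, bhs assumed in scope):

/* 1. Cached single-block read -- the common case: */
ret = ocfs2_read_block(inode, blkno, &bh);

/* 2. Multi-block read that must bypass the uptodate cache: */
ret = ocfs2_read_blocks(inode, blkno, nr, bhs, OCFS2_BH_IGNORE_CACHE);

/* 3. Uncached synchronous read with no inode context (e.g. early in
 *    mount, before inodes exist): */
ret = ocfs2_read_blocks_sync(osb, blkno, nr, bhs);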
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f27529..d8a0cb92cef6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
109 define_mask(CONN), 109 define_mask(CONN),
110 define_mask(QUORUM), 110 define_mask(QUORUM),
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR),
112 define_mask(ERROR), 113 define_mask(ERROR),
113 define_mask(NOTICE), 114 define_mask(NOTICE),
114 define_mask(KTHREAD), 115 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94f..57670c680471 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
112#define ML_CONN 0x0000000004000000ULL /* net connection management */ 112#define ML_CONN 0x0000000004000000ULL /* net connection management */
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ 113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115/* bits that are infrequently given and frequently matched in the high word */ 116/* bits that are infrequently given and frequently matched in the high word */
116#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
113#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 117#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
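
ML_XATTR claims the next free low-word bit ahead of the extended-attribute code. A mask bit only does something once a source file selects it as its logging prefix; a hedged sketch of how a future xattr.c might opt in (the mlog text is illustrative):

#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>

/* Every mlog(0, ...) in this file is now gated by ML_XATTR and can be
 * toggled at runtime through the masklog sysfs knobs registered via
 * mlog_attrs[] above. */
mlog(0, "looking up xattr on inode %llu\n",
     (unsigned long long)OCFS2_I(inode)->ip_blkno);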
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index d8bfa0eb41b2..52276c02f710 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
138 " message id: %d\n" 138 " message id: %d\n"
139 " message type: %u\n" 139 " message type: %u\n"
140 " message key: 0x%08x\n" 140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n" 141 " sock acquiry: %lu.%ld\n"
142 " send start: %lu.%lu\n" 142 " send start: %lu.%ld\n"
143 " wait start: %lu.%lu\n", 143 " wait start: %lu.%ld\n",
144 nst, (unsigned long)nst->st_task->pid, 144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid, 145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node, 146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type, 147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key, 148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, 149 nst->st_sock_time.tv_sec,
150 (unsigned long)nst->st_sock_time.tv_usec, 150 (long)nst->st_sock_time.tv_usec,
151 nst->st_send_time.tv_sec, 151 nst->st_send_time.tv_sec,
152 (unsigned long)nst->st_send_time.tv_usec, 152 (long)nst->st_send_time.tv_usec,
153 nst->st_status_time.tv_sec, 153 nst->st_status_time.tv_sec,
154 nst->st_status_time.tv_usec); 154 (long)nst->st_status_time.tv_usec);
155 } 155 }
156 156
157 spin_unlock(&o2net_debug_lock); 157 spin_unlock(&o2net_debug_lock);
@@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 276 return sc; /* unused, just needs to be null when done */
277} 277}
278 278
279#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec 279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
280 280
281static int sc_seq_show(struct seq_file *seq, void *v) 281static int sc_seq_show(struct seq_file *seq, void *v)
282{ 282{
@@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
309 " remote node: %s\n" 309 " remote node: %s\n"
310 " page off: %zu\n" 310 " page off: %zu\n"
311 " handshake ok: %u\n" 311 " handshake ok: %u\n"
312 " timer: %lu.%lu\n" 312 " timer: %lu.%ld\n"
313 " data ready: %lu.%lu\n" 313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%lu\n" 314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%lu\n" 315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%lu\n" 316 " func start: %lu.%ld\n"
317 " func stop: %lu.%lu\n" 317 " func stop: %lu.%ld\n"
318 " func key: %u\n" 318 " func key: %u\n"
319 " func type: %u\n", 319 " func type: %u\n",
320 sc, 320 sc,
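
The format-string churn above is a signedness fix: tv_usec is a suseconds_t, which is a signed long, so printing it as %lu behind an unsigned cast drew printk format warnings; the fields now print with %ld after a (long) cast. A runnable userspace analogue:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        /* tv_sec prints with %lu, but tv_usec is signed and takes %ld,
         * mirroring the TV_SEC_USEC() change above. */
        printf("now: %lu.%06ld\n",
               (unsigned long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}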
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd6..2bcf706d9dd3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
146void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, 146static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
147 u32 msgkey, struct task_struct *task, u8 node) 147 u32 msgkey, struct task_struct *task, u8 node)
148{ 148{
149 INIT_LIST_HEAD(&nst->st_net_debug_item); 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 nst->st_task = task; 150 nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 do_gettimeofday(&nst->st_sock_time);
159} 159}
160 160
161void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 do_gettimeofday(&nst->st_send_time);
164} 164}
165 165
166void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 do_gettimeofday(&nst->st_status_time);
169} 169}
170 170
171void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
178{ 178{
179 nst->st_id = msg_id; 179 nst->st_id = msg_id;
180} 180}
181
182#else /* CONFIG_DEBUG_FS */
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{
187}
188
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
190{
191}
192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
194{
195}
196
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
198{
199}
200
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
202 struct o2net_sock_container *sc)
203{
204}
205
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
207 u32 msg_id)
208{
209}
210
181#endif /* CONFIG_DEBUG_FS */ 211#endif /* CONFIG_DEBUG_FS */
182 212
183static inline int o2net_reconnect_delay(void) 213static inline int o2net_reconnect_delay(void)
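
Making the nst helpers static means their !CONFIG_DEBUG_FS stubs must live next to the definitions in tcp.c rather than in tcp_internal.h (see the header hunk below), since a static function cannot be stubbed from a shared header. The idiom, reduced to one function with an illustrative name:

/* Sketch of the stub-next-to-definition idiom: keep instrumentation
 * helpers private to one .c file and compile them away when the
 * config option is off. */
#ifdef CONFIG_DEBUG_FS
static void trace_send_start(struct o2net_send_tracking *nst)
{
        do_gettimeofday(&nst->st_send_time);
}
#else
static inline void trace_send_start(struct o2net_send_tracking *nst)
{
        /* no-op; the compiler discards the calls entirely */
}
#endif /* CONFIG_DEBUG_FS */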
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b77..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
224 struct timeval st_send_time; 224 struct timeval st_send_time;
225 struct timeval st_status_time; 225 struct timeval st_status_time;
226}; 226};
227
228void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
229 u32 msgkey, struct task_struct *task, u8 node);
230void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
231void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
232void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
233void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
234 struct o2net_sock_container *sc);
235void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
236
237#else 227#else
238struct o2net_send_tracking { 228struct o2net_send_tracking {
239 u32 dummy; 229 u32 dummy;
240}; 230};
241
242static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
243 u32 msgkey, struct task_struct *task, u8 node)
244{
245}
246static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
247{
248}
249static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
250{
251}
252static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
253{
254}
255static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
256 struct o2net_sock_container *sc)
257{
258}
259static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
260 u32 msg_id)
261{
262}
263#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
264 232
265#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a1875848080..026e6eb85187 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
82 struct ocfs2_alloc_context *meta_ac, 82 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh); 83 struct buffer_head **new_bh);
84 84
85static struct buffer_head *ocfs2_bread(struct inode *inode,
86 int block, int *err, int reada)
87{
88 struct buffer_head *bh = NULL;
89 int tmperr;
90 u64 p_blkno;
91 int readflags = 0;
92
93 if (reada)
94 readflags |= OCFS2_BH_READAHEAD;
95
96 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
97 i_size_read(inode)) {
98 BUG_ON(!reada);
99 return NULL;
100 }
101
102 down_read(&OCFS2_I(inode)->ip_alloc_sem);
103 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
104 NULL);
105 up_read(&OCFS2_I(inode)->ip_alloc_sem);
106 if (tmperr < 0) {
107 mlog_errno(tmperr);
108 goto fail;
109 }
110
111 tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
112 if (tmperr < 0)
113 goto fail;
114
115 tmperr = 0;
116
117 *err = 0;
118 return bh;
119
120fail:
121 brelse(bh);
122 bh = NULL;
123
124 *err = -EIO;
125 return NULL;
126}
127
85/* 128/*
86 * bh passed here can be an inode block or a dir data block, depending 129 * bh passed here can be an inode block or a dir data block, depending
87 * on the inode inline data flag. 130 * on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
188 struct ocfs2_dinode *di; 231 struct ocfs2_dinode *di;
189 struct ocfs2_inline_data *data; 232 struct ocfs2_inline_data *data;
190 233
191 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 234 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
192 &di_bh, OCFS2_BH_CACHED, dir);
193 if (ret) { 235 if (ret) {
194 mlog_errno(ret); 236 mlog_errno(ret);
195 goto out; 237 goto out;
@@ -260,14 +302,13 @@ restart:
260 } 302 }
261 if ((bh = bh_use[ra_ptr++]) == NULL) 303 if ((bh = bh_use[ra_ptr++]) == NULL)
262 goto next; 304 goto next;
263 wait_on_buffer(bh); 305 if (ocfs2_read_block(dir, block, &bh)) {
264 if (!buffer_uptodate(bh)) { 306 /* read error, skip block & hope for the best.
265 /* read error, skip block & hope for the best */ 307 * ocfs2_read_block() has released the bh. */
266 ocfs2_error(dir->i_sb, "reading directory %llu, " 308 ocfs2_error(dir->i_sb, "reading directory %llu, "
267 "offset %lu\n", 309 "offset %lu\n",
268 (unsigned long long)OCFS2_I(dir)->ip_blkno, 310 (unsigned long long)OCFS2_I(dir)->ip_blkno,
269 block); 311 block);
270 brelse(bh);
271 goto next; 312 goto next;
272 } 313 }
273 i = ocfs2_search_dirblock(bh, dir, name, namelen, 314 i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
417 struct ocfs2_dinode *di; 458 struct ocfs2_dinode *di;
418 struct ocfs2_inline_data *data; 459 struct ocfs2_inline_data *data;
419 460
420 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 461 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
421 &di_bh, OCFS2_BH_CACHED, dir);
422 if (ret) { 462 if (ret) {
423 mlog_errno(ret); 463 mlog_errno(ret);
424 goto out; 464 goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
596 struct ocfs2_inline_data *data; 636 struct ocfs2_inline_data *data;
597 struct ocfs2_dir_entry *de; 637 struct ocfs2_dir_entry *de;
598 638
599 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 639 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
600 &di_bh, OCFS2_BH_CACHED, inode);
601 if (ret) { 640 if (ret) {
602 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 641 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
603 (unsigned long long)OCFS2_I(inode)->ip_blkno); 642 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
716 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 755 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
717 i > 0; i--) { 756 i > 0; i--) {
718 tmp = ocfs2_bread(inode, ++blk, &err, 1); 757 tmp = ocfs2_bread(inode, ++blk, &err, 1);
719 if (tmp) 758 brelse(tmp);
720 brelse(tmp);
721 } 759 }
722 last_ra_blk = blk; 760 last_ra_blk = blk;
723 ra_sectors = 8; 761 ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
899leave: 937leave:
900 if (status < 0) { 938 if (status < 0) {
901 *dirent = NULL; 939 *dirent = NULL;
902 if (*dirent_bh) { 940 brelse(*dirent_bh);
903 brelse(*dirent_bh); 941 *dirent_bh = NULL;
904 *dirent_bh = NULL;
905 }
906 } 942 }
907 943
908 mlog_exit(status); 944 mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
951 987
952 ret = 0; 988 ret = 0;
953bail: 989bail:
954 if (dirent_bh) 990 brelse(dirent_bh);
955 brelse(dirent_bh);
956 991
957 mlog_exit(ret); 992 mlog_exit(ret);
958 return ret; 993 return ret;
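
Several hunks in this file are pure cleanup: brelse() is NULL-safe, so the guarded "if (bh) brelse(bh);" form collapses to a bare call. A runnable userspace demonstration of the same release contract:

#include <stdlib.h>

struct buffer { char *data; };

/* NULL-safe release, mirroring brelse(): callers never need to test
 * the pointer before calling. */
static void buffer_release(struct buffer *b)
{
        if (!b)
                return;
        free(b->data);
        free(b);
}

int main(void)
{
        struct buffer *bh = NULL;

        buffer_release(bh);     /* fine: no caller-side NULL check */
        return 0;
}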
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1127 1162
1128 status = 0; 1163 status = 0;
1129bail: 1164bail:
1130 if (new_bh) 1165 brelse(new_bh);
1131 brelse(new_bh);
1132 1166
1133 mlog_exit(status); 1167 mlog_exit(status);
1134 return status; 1168 return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1192 struct buffer_head *dirdata_bh = NULL; 1226 struct buffer_head *dirdata_bh = NULL;
1193 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1194 handle_t *handle; 1228 handle_t *handle;
1229 struct ocfs2_extent_tree et;
1230
1231 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1195 1232
1196 alloc = ocfs2_clusters_for_bytes(sb, bytes); 1233 alloc = ocfs2_clusters_for_bytes(sb, bytes);
1197 1234
@@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1300 di->i_size = cpu_to_le64(sb->s_blocksize); 1337 di->i_size = cpu_to_le64(sb->s_blocksize);
1301 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); 1338 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
1302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); 1339 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
1303 dir->i_blocks = ocfs2_inode_sector_count(dir);
1304 1340
1305 /* 1341 /*
1306 * This should never fail as our extent list is empty and all 1342 * This should never fail as our extent list is empty and all
1307 * related blocks have been journaled already. 1343 * related blocks have been journaled already.
1308 */ 1344 */
1309 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, 1345 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
1310 NULL); 1346 0, NULL);
1311 if (ret) { 1347 if (ret) {
1312 mlog_errno(ret); 1348 mlog_errno(ret);
1313 goto out; 1349 goto out_commit;
1314 } 1350 }
1315 1351
1352 /*
1353 * Set i_blocks after the extent insert for the most up to
1354 * date ip_clusters value.
1355 */
1356 dir->i_blocks = ocfs2_inode_sector_count(dir);
1357
1316 ret = ocfs2_journal_dirty(handle, di_bh); 1358 ret = ocfs2_journal_dirty(handle, di_bh);
1317 if (ret) { 1359 if (ret) {
1318 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1332 } 1374 }
1333 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 1375 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
1334 1376
1335 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, 1377 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
1336 len, 0, NULL); 1378 blkno, len, 0, NULL);
1337 if (ret) { 1379 if (ret) {
1338 mlog_errno(ret); 1380 mlog_errno(ret);
1339 goto out; 1381 goto out_commit;
1340 } 1382 }
1341 } 1383 }
1342 1384
@@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1378 if (extend) { 1420 if (extend) {
1379 u32 offset = OCFS2_I(dir)->ip_clusters; 1421 u32 offset = OCFS2_I(dir)->ip_clusters;
1380 1422
1381 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 1423 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1382 1, 0, parent_fe_bh, handle, 1424 1, 0, parent_fe_bh, handle,
1383 data_ac, meta_ac, NULL); 1425 data_ac, meta_ac, NULL);
1384 BUG_ON(status == -EAGAIN); 1426 BUG_ON(status == -EAGAIN);
1385 if (status < 0) { 1427 if (status < 0) {
1386 mlog_errno(status); 1428 mlog_errno(status);
@@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1425 int credits, num_free_extents, drop_alloc_sem = 0; 1467 int credits, num_free_extents, drop_alloc_sem = 0;
1426 loff_t dir_i_size; 1468 loff_t dir_i_size;
1427 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1469 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1470 struct ocfs2_extent_list *el = &fe->id2.i_list;
1428 struct ocfs2_alloc_context *data_ac = NULL; 1471 struct ocfs2_alloc_context *data_ac = NULL;
1429 struct ocfs2_alloc_context *meta_ac = NULL; 1472 struct ocfs2_alloc_context *meta_ac = NULL;
1430 handle_t *handle = NULL; 1473 handle_t *handle = NULL;
1431 struct buffer_head *new_bh = NULL; 1474 struct buffer_head *new_bh = NULL;
1432 struct ocfs2_dir_entry * de; 1475 struct ocfs2_dir_entry * de;
1433 struct super_block *sb = osb->sb; 1476 struct super_block *sb = osb->sb;
1477 struct ocfs2_extent_tree et;
1434 1478
1435 mlog_entry_void(); 1479 mlog_entry_void();
1436 1480
@@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1474 spin_lock(&OCFS2_I(dir)->ip_lock); 1518 spin_lock(&OCFS2_I(dir)->ip_lock);
1475 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 1519 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
1476 spin_unlock(&OCFS2_I(dir)->ip_lock); 1520 spin_unlock(&OCFS2_I(dir)->ip_lock);
1477 num_free_extents = ocfs2_num_free_extents(osb, dir, fe); 1521 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
1522 num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
1478 if (num_free_extents < 0) { 1523 if (num_free_extents < 0) {
1479 status = num_free_extents; 1524 status = num_free_extents;
1480 mlog_errno(status); 1525 mlog_errno(status);
@@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1482 } 1527 }
1483 1528
1484 if (!num_free_extents) { 1529 if (!num_free_extents) {
1485 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); 1530 status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
1486 if (status < 0) { 1531 if (status < 0) {
1487 if (status != -ENOSPC) 1532 if (status != -ENOSPC)
1488 mlog_errno(status); 1533 mlog_errno(status);
@@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1497 goto bail; 1542 goto bail;
1498 } 1543 }
1499 1544
1500 credits = ocfs2_calc_extend_credits(sb, fe, 1); 1545 credits = ocfs2_calc_extend_credits(sb, el, 1);
1501 } else { 1546 } else {
1502 spin_unlock(&OCFS2_I(dir)->ip_lock); 1547 spin_unlock(&OCFS2_I(dir)->ip_lock);
1503 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 1548 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1563,8 +1608,7 @@ bail:
1563 if (meta_ac) 1608 if (meta_ac)
1564 ocfs2_free_alloc_context(meta_ac); 1609 ocfs2_free_alloc_context(meta_ac);
1565 1610
1566 if (new_bh) 1611 brelse(new_bh);
1567 brelse(new_bh);
1568 1612
1569 mlog_exit(status); 1613 mlog_exit(status);
1570 return status; 1614 return status;
@@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1691 1735
1692 status = 0; 1736 status = 0;
1693bail: 1737bail:
1694 if (bh) 1738 brelse(bh);
1695 brelse(bh);
1696 1739
1697 mlog_exit(status); 1740 mlog_exit(status);
1698 return status; 1741 return status;
@@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1751 *ret_de_bh = bh; 1794 *ret_de_bh = bh;
1752 bh = NULL; 1795 bh = NULL;
1753out: 1796out:
1754 if (bh) 1797 brelse(bh);
1755 brelse(bh);
1756 return ret; 1798 return ret;
1757} 1799}
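
ocfs2_bread() is now a private dir.c helper: it maps a logical directory block to its physical block and either issues readahead or goes through the cached read path. A sketch of how the directory walker above drives it, with variable names taken from the surrounding hunks:

/* Readahead pass (reada = 1): prime the cache, tolerate NULL returns,
 * and rely on NULL-safe brelse(). */
for (i = ra_blocks; i > 0; i--) {
        tmp = ocfs2_bread(dir, ++blk, &err, 1);
        brelse(tmp);
}

/* Blocking read: ocfs2_read_block() returns nonzero on error and has
 * already released the bh, so the caller just skips the block. */
if (ocfs2_read_block(dir, block, &bh))
        goto next;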
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e4..ec684426034b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2024 } else {
2025 /* Boo, we have to go to disk. */ 2025 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2026 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, 2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh);
2028 bh, OCFS2_BH_CACHED, inode);
2029 if (status < 0) { 2028 if (status < 0) {
2030 mlog_errno(status); 2029 mlog_errno(status);
2031 goto bail_refresh; 2030 goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2086 return 0; 2085 return 0;
2087 } 2086 }
2088 2087
2089 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
2090 OCFS2_I(inode)->ip_blkno,
2091 ret_bh,
2092 OCFS2_BH_CACHED,
2093 inode);
2094 if (status < 0) 2089 if (status < 0)
2095 mlog_errno(status); 2090 mlog_errno(status);
2096 2091
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..2baedac58234 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fiemap.h>
28 29
29#define MLOG_MASK_PREFIX ML_EXTENT_MAP 30#define MLOG_MASK_PREFIX ML_EXTENT_MAP
30#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32#include "ocfs2.h" 33#include "ocfs2.h"
33 34
34#include "alloc.h" 35#include "alloc.h"
36#include "dlmglue.h"
35#include "extent_map.h" 37#include "extent_map.h"
36#include "inode.h" 38#include "inode.h"
37#include "super.h" 39#include "super.h"
@@ -282,6 +284,50 @@ out:
282 kfree(new_emi); 284 kfree(new_emi);
283} 285}
284 286
287static int ocfs2_last_eb_is_empty(struct inode *inode,
288 struct ocfs2_dinode *di)
289{
290 int ret, next_free;
291 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
292 struct buffer_head *eb_bh = NULL;
293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el;
295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
297 if (ret) {
298 mlog_errno(ret);
299 goto out;
300 }
301
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list;
304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in "
314 "leaf block %llu\n", inode->i_ino,
315 (unsigned long long)eb_bh->b_blocknr);
316 ret = -EROFS;
317 goto out;
318 }
319
320 next_free = le16_to_cpu(el->l_next_free_rec);
321
322 if (next_free == 0 ||
323 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
324 ret = 1;
325
326out:
327 brelse(eb_bh);
328 return ret;
329}
330
285/* 331/*
286 * Return the 1st index within el which contains an extent start 332 * Return the 1st index within el which contains an extent start
287 * larger than v_cluster. 333 * larger than v_cluster.
@@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
335 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
336 goto no_more_extents; 382 goto no_more_extents;
337 383
338 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 384 ret = ocfs2_read_block(inode,
339 le64_to_cpu(eb->h_next_leaf_blk), 385 le64_to_cpu(eb->h_next_leaf_blk),
340 &next_eb_bh, OCFS2_BH_CACHED, inode); 386 &next_eb_bh);
341 if (ret) { 387 if (ret) {
342 mlog_errno(ret); 388 mlog_errno(ret);
343 goto out; 389 goto out;
@@ -373,42 +419,28 @@ out:
373 return ret; 419 return ret;
374} 420}
375 421
376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 422static int ocfs2_get_clusters_nocache(struct inode *inode,
377 u32 *p_cluster, u32 *num_clusters, 423 struct buffer_head *di_bh,
378 unsigned int *extent_flags) 424 u32 v_cluster, unsigned int *hole_len,
425 struct ocfs2_extent_rec *ret_rec,
426 unsigned int *is_last)
379{ 427{
380 int ret, i; 428 int i, ret, tree_height, len;
381 unsigned int flags = 0;
382 struct buffer_head *di_bh = NULL;
383 struct buffer_head *eb_bh = NULL;
384 struct ocfs2_dinode *di; 429 struct ocfs2_dinode *di;
385 struct ocfs2_extent_block *eb; 430 struct ocfs2_extent_block *uninitialized_var(eb);
386 struct ocfs2_extent_list *el; 431 struct ocfs2_extent_list *el;
387 struct ocfs2_extent_rec *rec; 432 struct ocfs2_extent_rec *rec;
388 u32 coff; 433 struct buffer_head *eb_bh = NULL;
389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
397 num_clusters, extent_flags);
398 if (ret == 0)
399 goto out;
400 434
401 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 435 memset(ret_rec, 0, sizeof(*ret_rec));
402 &di_bh, OCFS2_BH_CACHED, inode); 436 if (is_last)
403 if (ret) { 437 *is_last = 0;
404 mlog_errno(ret);
405 goto out;
406 }
407 438
408 di = (struct ocfs2_dinode *) di_bh->b_data; 439 di = (struct ocfs2_dinode *) di_bh->b_data;
409 el = &di->id2.i_list; 440 el = &di->id2.i_list;
441 tree_height = le16_to_cpu(el->l_tree_depth);
410 442
411 if (el->l_tree_depth) { 443 if (tree_height > 0) {
412 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 444 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
413 if (ret) { 445 if (ret) {
414 mlog_errno(ret); 446 mlog_errno(ret);
@@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
431 i = ocfs2_search_extent_list(el, v_cluster); 463 i = ocfs2_search_extent_list(el, v_cluster);
432 if (i == -1) { 464 if (i == -1) {
433 /* 465 /*
434 * A hole was found. Return some canned values that 466 * Holes can be larger than the maximum size of an
435 * callers can key on. If asked for, num_clusters will 467 * extent, so we return their lengths in a separate
436 * be populated with the size of the hole. 468 * field.
437 */ 469 */
438 *p_cluster = 0; 470 if (hole_len) {
439 if (num_clusters) {
440 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 471 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
441 v_cluster, 472 v_cluster, &len);
442 num_clusters);
443 if (ret) { 473 if (ret) {
444 mlog_errno(ret); 474 mlog_errno(ret);
445 goto out; 475 goto out;
446 } 476 }
477
478 *hole_len = len;
479 }
480 goto out_hole;
481 }
482
483 rec = &el->l_recs[i];
484
485 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
486
487 if (!rec->e_blkno) {
488 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
489 "record (%u, %u, 0)", inode->i_ino,
490 le32_to_cpu(rec->e_cpos),
491 ocfs2_rec_clusters(el, rec));
492 ret = -EROFS;
493 goto out;
494 }
495
496 *ret_rec = *rec;
497
498 /*
499 * Checking for last extent is potentially expensive - we
500 * might have to look at the next leaf over to see if it's
501 * empty.
502 *
503 * The first two checks are to see whether the caller even
504 * cares for this information, and if the extent is at least
505 * the last in its list.
506 *
507 * If those hold true, then the extent is last if any of the
508 * additional conditions hold true:
509 * - Extent list is in-inode
510 * - Extent list is right-most
511 * - Extent list is 2nd to rightmost, with empty right-most
512 */
513 if (is_last) {
514 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
515 if (tree_height == 0)
516 *is_last = 1;
517 else if (eb->h_blkno == di->i_last_eb_blk)
518 *is_last = 1;
519 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
520 ret = ocfs2_last_eb_is_empty(inode, di);
521 if (ret < 0) {
522 mlog_errno(ret);
523 goto out;
524 }
525 if (ret == 1)
526 *is_last = 1;
527 }
528 }
529 }
530
531out_hole:
532 ret = 0;
533out:
534 brelse(eb_bh);
535 return ret;
536}
537
538static void ocfs2_relative_extent_offsets(struct super_block *sb,
539 u32 v_cluster,
540 struct ocfs2_extent_rec *rec,
541 u32 *p_cluster, u32 *num_clusters)
542
543{
544 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
545
546 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
547 *p_cluster = *p_cluster + coff;
548
549 if (num_clusters)
550 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
551}
552
553int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
554 u32 *p_cluster, u32 *num_clusters,
555 struct ocfs2_extent_list *el)
556{
557 int ret = 0, i;
558 struct buffer_head *eb_bh = NULL;
559 struct ocfs2_extent_block *eb;
560 struct ocfs2_extent_rec *rec;
561 u32 coff;
562
563 if (el->l_tree_depth) {
564 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
565 if (ret) {
566 mlog_errno(ret);
567 goto out;
447 } 568 }
569
570 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
571 el = &eb->h_list;
572
573 if (el->l_tree_depth) {
574 ocfs2_error(inode->i_sb,
575 "Inode %lu has non zero tree depth in "
576 "xattr leaf block %llu\n", inode->i_ino,
577 (unsigned long long)eb_bh->b_blocknr);
578 ret = -EROFS;
579 goto out;
580 }
581 }
582
583 i = ocfs2_search_extent_list(el, v_cluster);
584 if (i == -1) {
585 ret = -EROFS;
586 mlog_errno(ret);
587 goto out;
448 } else { 588 } else {
449 rec = &el->l_recs[i]; 589 rec = &el->l_recs[i];
450
451 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 590 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
452 591
453 if (!rec->e_blkno) { 592 if (!rec->e_blkno) {
454 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 593 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
455 "record (%u, %u, 0)", inode->i_ino, 594 "record (%u, %u, 0) in xattr", inode->i_ino,
456 le32_to_cpu(rec->e_cpos), 595 le32_to_cpu(rec->e_cpos),
457 ocfs2_rec_clusters(el, rec)); 596 ocfs2_rec_clusters(el, rec));
458 ret = -EROFS; 597 ret = -EROFS;
459 goto out; 598 goto out;
460 } 599 }
461
462 coff = v_cluster - le32_to_cpu(rec->e_cpos); 600 coff = v_cluster - le32_to_cpu(rec->e_cpos);
463
464 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 601 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
465 le64_to_cpu(rec->e_blkno)); 602 le64_to_cpu(rec->e_blkno));
466 *p_cluster = *p_cluster + coff; 603 *p_cluster = *p_cluster + coff;
467
468 if (num_clusters) 604 if (num_clusters)
469 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 605 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
606 }
607out:
608 if (eb_bh)
609 brelse(eb_bh);
610 return ret;
611}
612
613int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
614 u32 *p_cluster, u32 *num_clusters,
615 unsigned int *extent_flags)
616{
617 int ret;
618 unsigned int uninitialized_var(hole_len), flags = 0;
619 struct buffer_head *di_bh = NULL;
620 struct ocfs2_extent_rec rec;
621
622 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
623 ret = -ERANGE;
624 mlog_errno(ret);
625 goto out;
626 }
627
628 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
629 num_clusters, extent_flags);
630 if (ret == 0)
631 goto out;
632
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
634 if (ret) {
635 mlog_errno(ret);
636 goto out;
637 }
470 638
471 flags = rec->e_flags; 639 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
640 &rec, NULL);
641 if (ret) {
642 mlog_errno(ret);
643 goto out;
644 }
645
646 if (rec.e_blkno == 0ULL) {
647 /*
648 * A hole was found. Return some canned values that
649 * callers can key on. If asked for, num_clusters will
650 * be populated with the size of the hole.
651 */
652 *p_cluster = 0;
653 if (num_clusters) {
654 *num_clusters = hole_len;
655 }
656 } else {
657 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
658 p_cluster, num_clusters);
659 flags = rec.e_flags;
472 660
473 ocfs2_extent_map_insert_rec(inode, rec); 661 ocfs2_extent_map_insert_rec(inode, &rec);
474 } 662 }
475 663
476 if (extent_flags) 664 if (extent_flags)
@@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
478 666
479out: 667out:
480 brelse(di_bh); 668 brelse(di_bh);
481 brelse(eb_bh);
482 return ret; 669 return ret;
483} 670}
484 671
@@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
521out: 708out:
522 return ret; 709 return ret;
523} 710}
711
712static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
713 struct fiemap_extent_info *fieinfo,
714 u64 map_start)
715{
716 int ret;
717 unsigned int id_count;
718 struct ocfs2_dinode *di;
719 u64 phys;
720 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
721 struct ocfs2_inode_info *oi = OCFS2_I(inode);
722
723 di = (struct ocfs2_dinode *)di_bh->b_data;
724 id_count = le16_to_cpu(di->id2.i_data.id_count);
725
726 if (map_start < id_count) {
727 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
728 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729
730 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
731 flags);
732 if (ret < 0)
733 return ret;
734 }
735
736 return 0;
737}
738
739#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
740
741int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
742 u64 map_start, u64 map_len)
743{
744 int ret, is_last;
745 u32 mapping_end, cpos;
746 unsigned int hole_size;
747 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
748 u64 len_bytes, phys_bytes, virt_bytes;
749 struct buffer_head *di_bh = NULL;
750 struct ocfs2_extent_rec rec;
751
752 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
753 if (ret)
754 return ret;
755
756 ret = ocfs2_inode_lock(inode, &di_bh, 0);
757 if (ret) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 down_read(&OCFS2_I(inode)->ip_alloc_sem);
763
764 /*
765 * Handle inline-data separately.
766 */
767 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
768 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
769 goto out_unlock;
770 }
771
772 cpos = map_start >> osb->s_clustersize_bits;
773 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
774 map_start + map_len);
775 mapping_end -= cpos;
776 is_last = 0;
777 while (cpos < mapping_end && !is_last) {
778 u32 fe_flags;
779
780 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
781 &hole_size, &rec, &is_last);
782 if (ret) {
783 mlog_errno(ret);
784 goto out;
785 }
786
787 if (rec.e_blkno == 0ULL) {
788 cpos += hole_size;
789 continue;
790 }
791
792 fe_flags = 0;
793 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
794 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
795 if (is_last)
796 fe_flags |= FIEMAP_EXTENT_LAST;
797 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
798 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
799 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
800
801 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
802 len_bytes, fe_flags);
803 if (ret)
804 break;
805
806 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters);
807 }
808
809 if (ret > 0)
810 ret = 0;
811
812out_unlock:
813 brelse(di_bh);
814
815 up_read(&OCFS2_I(inode)->ip_alloc_sem);
816
817 ocfs2_inode_unlock(inode, 0);
818out:
819
820 return ret;
821}
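
The while loop in ocfs2_fiemap() walks the tree record by record via ocfs2_get_clusters_nocache(), skipping holes by their reported length and stopping at the extent flagged last. Assuming the handler is wired into the inode operations elsewhere in this series, a minimal userspace exerciser can drive it through the standard FS_IOC_FIEMAP ioctl:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
        struct fiemap *fm;
        unsigned int i;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Room for the header plus 8 extent records. */
        fm = calloc(1, sizeof(*fm) + 8 * sizeof(struct fiemap_extent));
        if (!fm)
                return 1;
        fm->fm_start = 0;
        fm->fm_length = ~0ULL;            /* map the whole file */
        fm->fm_flags = FIEMAP_FLAG_SYNC;  /* the only flag in OCFS2_FIEMAP_FLAGS */
        fm->fm_extent_count = 8;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                perror("FS_IOC_FIEMAP");
                return 1;
        }

        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("logical=%llu physical=%llu len=%llu flags=0x%x\n",
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_physical,
                       (unsigned long long)fm->fm_extents[i].fe_length,
                       fm->fm_extents[i].fe_flags);

        free(fm);
        close(fd);
        return 0;
}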
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1c4aa8b06f34 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags); 51 u64 *ret_count, unsigned int *extent_flags);
52 52
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len);
55
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el);
59
53#endif /* _EXTENT_MAP_H */ 60#endif /* _EXTENT_MAP_H */
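
ocfs2_xattr_get_clusters() is exported here for the upcoming xattr code: xattr value trees hang off structures other than the dinode, so the caller passes the root extent list in explicitly, and nothing is cached (the extent map cache is keyed to the inode's data tree). A hypothetical call, assuming el points at an xattr value tree's root extent list:

u32 p_cluster, num_clusters;
int ret;

ret = ocfs2_xattr_get_clusters(inode, v_cluster, &p_cluster,
                               &num_clusters, el);
if (!ret)
        mlog(0, "xattr cluster %u maps to %u (+%u)\n",
             v_cluster, p_cluster, num_clusters);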
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3daa..8d3225a78073 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
55#include "mmap.h" 55#include "mmap.h"
56#include "suballoc.h" 56#include "suballoc.h"
57#include "super.h" 57#include "super.h"
58#include "xattr.h"
58 59
59#include "buffer_head_io.h" 60#include "buffer_head_io.h"
60 61
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
184 goto bail; 185 goto bail;
185 186
186 journal = osb->journal->j_journal; 187 journal = osb->journal->j_journal;
187 err = journal_force_commit(journal); 188 err = jbd2_journal_force_commit(journal);
188 189
189bail: 190bail:
190 mlog_exit(err); 191 mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
488} 489}
489 490
490/* 491/*
491 * extend allocation only here. 492 * extend file allocation only here.
492 * we'll update all the disk stuff, and oip->alloc_size 493 * we'll update all the disk stuff, and oip->alloc_size
493 * 494 *
494 * expect stuff to be locked, a transaction started and enough data / 495 * expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
497 * Will return -EAGAIN, and a reason if a restart is needed. 498 * Will return -EAGAIN, and a reason if a restart is needed.
498 * If passed in, *reason will always be set, even in error. 499 * If passed in, *reason will always be set, even in error.
499 */ 500 */
500int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 501int ocfs2_add_inode_data(struct ocfs2_super *osb,
501 struct inode *inode, 502 struct inode *inode,
502 u32 *logical_offset, 503 u32 *logical_offset,
503 u32 clusters_to_add, 504 u32 clusters_to_add,
504 int mark_unwritten, 505 int mark_unwritten,
505 struct buffer_head *fe_bh, 506 struct buffer_head *fe_bh,
506 handle_t *handle, 507 handle_t *handle,
507 struct ocfs2_alloc_context *data_ac, 508 struct ocfs2_alloc_context *data_ac,
508 struct ocfs2_alloc_context *meta_ac, 509 struct ocfs2_alloc_context *meta_ac,
509 enum ocfs2_alloc_restarted *reason_ret) 510 enum ocfs2_alloc_restarted *reason_ret)
510{ 511{
511 int status = 0; 512 int ret;
512 int free_extents; 513 struct ocfs2_extent_tree et;
513 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
514 enum ocfs2_alloc_restarted reason = RESTART_NONE;
515 u32 bit_off, num_bits;
516 u64 block;
517 u8 flags = 0;
518
519 BUG_ON(!clusters_to_add);
520
521 if (mark_unwritten)
522 flags = OCFS2_EXT_UNWRITTEN;
523
524 free_extents = ocfs2_num_free_extents(osb, inode, fe);
525 if (free_extents < 0) {
526 status = free_extents;
527 mlog_errno(status);
528 goto leave;
529 }
530
531 /* there are two cases which could cause us to EAGAIN in the
532 * we-need-more-metadata case:
533 * 1) we haven't reserved *any*
534 * 2) we are so fragmented, we've needed to add metadata too
535 * many times. */
536 if (!free_extents && !meta_ac) {
537 mlog(0, "we haven't reserved any metadata!\n");
538 status = -EAGAIN;
539 reason = RESTART_META;
540 goto leave;
541 } else if ((!free_extents)
542 && (ocfs2_alloc_context_bits_left(meta_ac)
543 < ocfs2_extend_meta_needed(fe))) {
544 mlog(0, "filesystem is really fragmented...\n");
545 status = -EAGAIN;
546 reason = RESTART_META;
547 goto leave;
548 }
549
550 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
551 clusters_to_add, &bit_off, &num_bits);
552 if (status < 0) {
553 if (status != -ENOSPC)
554 mlog_errno(status);
555 goto leave;
556 }
557
558 BUG_ON(num_bits > clusters_to_add);
559
560 /* reserve our write early -- insert_extent may update the inode */
561 status = ocfs2_journal_access(handle, inode, fe_bh,
562 OCFS2_JOURNAL_ACCESS_WRITE);
563 if (status < 0) {
564 mlog_errno(status);
565 goto leave;
566 }
567
568 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
569 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
570 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
572 *logical_offset, block, num_bits,
573 flags, meta_ac);
574 if (status < 0) {
575 mlog_errno(status);
576 goto leave;
577 }
578
579 status = ocfs2_journal_dirty(handle, fe_bh);
580 if (status < 0) {
581 mlog_errno(status);
582 goto leave;
583 }
584
585 clusters_to_add -= num_bits;
586 *logical_offset += num_bits;
587
588 if (clusters_to_add) {
589 mlog(0, "need to alloc once more, clusters = %u, wanted = "
590 "%u\n", fe->i_clusters, clusters_to_add);
591 status = -EAGAIN;
592 reason = RESTART_TRANS;
593 }
594
595leave:
596 mlog_exit(status);
597 if (reason_ret)
598 *reason_ret = reason;
599 return status;
600}
601
602/*
603 * For a given allocation, determine which allocators will need to be
604 * accessed, and lock them, reserving the appropriate number of bits.
605 *
606 * Sparse file systems call this from ocfs2_write_begin_nolock()
607 * and ocfs2_allocate_unwritten_extents().
608 *
609 * File systems which don't support holes call this from
610 * ocfs2_extend_allocation().
611 */
612int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
613 u32 clusters_to_add, u32 extents_to_split,
614 struct ocfs2_alloc_context **data_ac,
615 struct ocfs2_alloc_context **meta_ac)
616{
617 int ret = 0, num_free_extents;
618 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
619 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
620
621 *meta_ac = NULL;
622 if (data_ac)
623 *data_ac = NULL;
624
625 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
626
627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
628 "clusters_to_add = %u, extents_to_split = %u\n",
629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
631
632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
633 if (num_free_extents < 0) {
634 ret = num_free_extents;
635 mlog_errno(ret);
636 goto out;
637 }
638
639 /*
640 * Sparse allocation file systems need to be more conservative
641 * with reserving room for expansion - the actual allocation
642 * happens while we've got a journal handle open so re-taking
643 * a cluster lock (because we ran out of room for another
644 * extent) will violate ordering rules.
645 *
646 * Most of the time we'll only be seeing this 1 cluster at a time
647 * anyway.
648 *
649 * Always lock for any unwritten extents - we might want to
650 * add blocks during a split.
651 */
652 if (!num_free_extents ||
653 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
654 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
655 if (ret < 0) {
656 if (ret != -ENOSPC)
657 mlog_errno(ret);
658 goto out;
659 }
660 }
661
662 if (clusters_to_add == 0)
663 goto out;
664
665 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
666 if (ret < 0) {
667 if (ret != -ENOSPC)
668 mlog_errno(ret);
669 goto out;
670 }
671
672out:
673 if (ret) {
674 if (*meta_ac) {
675 ocfs2_free_alloc_context(*meta_ac);
676 *meta_ac = NULL;
677 }
678 	514
679 	/*
680 	 * We cannot have an error and a non null *data_ac.
681 	 */
682 }
 515 	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
 516 	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
 517 					  clusters_to_add, mark_unwritten,
 518 					  &et, handle,
 519 					  data_ac, meta_ac, reason_ret);
683 	520
684 	return ret; 521 	return ret;
685} 522}
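The removed ocfs2_lock_allocators() above sizes its metadata reservation as clusters_to_add + 2 * extents_to_split. A minimal userspace sketch of that worst-case record budget (helper name and sample values are illustrative, not from the patch; the factor of two presumably covers one record splitting into left, middle and right pieces):

    #include <assert.h>

    static unsigned int max_recs_needed(unsigned int clusters_to_add,
                                        unsigned int extents_to_split)
    {
            /* worst case: every added cluster lands in its own extent
             * record, and every split turns one record into three */
            return clusters_to_add + 2 * extents_to_split;
    }

    int main(void)
    {
            /* one unwritten extent split in half: up to 2 extra records */
            assert(max_recs_needed(0, 1) == 2);
            /* extend by 8 clusters, no splits: up to 8 new records */
            assert(max_recs_needed(8, 0) == 8);
            return 0;
    }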
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
698 struct ocfs2_alloc_context *meta_ac = NULL; 535 struct ocfs2_alloc_context *meta_ac = NULL;
699 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
700 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et;
701 539
702 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
703 541
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
707 */ 545 */
708 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
709 547
710 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
711 OCFS2_BH_CACHED, inode);
712 if (status < 0) { 549 if (status < 0) {
713 mlog_errno(status); 550 mlog_errno(status);
714 goto leave; 551 goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
724restart_all: 561restart_all:
725 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
726 563
727 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, 564 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
728 &meta_ac); 565 "clusters_to_add = %u\n",
566 (unsigned long long)OCFS2_I(inode)->ip_blkno,
567 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
568 clusters_to_add);
569 ocfs2_init_dinode_extent_tree(&et, inode, bh);
570 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
571 &data_ac, &meta_ac);
729 if (status) { 572 if (status) {
730 mlog_errno(status); 573 mlog_errno(status);
731 goto leave; 574 goto leave;
732 } 575 }
733 576
734 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 577 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
578 clusters_to_add);
735 handle = ocfs2_start_trans(osb, credits); 579 handle = ocfs2_start_trans(osb, credits);
736 if (IS_ERR(handle)) { 580 if (IS_ERR(handle)) {
737 status = PTR_ERR(handle); 581 status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
753 597
754 prev_clusters = OCFS2_I(inode)->ip_clusters; 598 prev_clusters = OCFS2_I(inode)->ip_clusters;
755 599
756 status = ocfs2_do_extend_allocation(osb, 600 status = ocfs2_add_inode_data(osb,
757 inode, 601 inode,
758 &logical_start, 602 &logical_start,
759 clusters_to_add, 603 clusters_to_add,
760 mark_unwritten, 604 mark_unwritten,
761 bh, 605 bh,
762 handle, 606 handle,
763 data_ac, 607 data_ac,
764 meta_ac, 608 meta_ac,
765 &why); 609 &why);
766 if ((status < 0) && (status != -EAGAIN)) { 610 if ((status < 0) && (status != -EAGAIN)) {
767 if (status != -ENOSPC) 611 if (status != -ENOSPC)
768 mlog_errno(status); 612 mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
789 mlog(0, "restarting transaction.\n"); 633 mlog(0, "restarting transaction.\n");
790 /* TODO: This can be more intelligent. */ 634 /* TODO: This can be more intelligent. */
791 credits = ocfs2_calc_extend_credits(osb->sb, 635 credits = ocfs2_calc_extend_credits(osb->sb,
792 fe, 636 &fe->id2.i_list,
793 clusters_to_add); 637 clusters_to_add);
794 status = ocfs2_extend_trans(handle, credits); 638 status = ocfs2_extend_trans(handle, credits);
795 if (status < 0) { 639 if (status < 0) {
@@ -826,10 +670,8 @@ leave:
826 restart_func = 0; 670 restart_func = 0;
827 goto restart_all; 671 goto restart_all;
828 } 672 }
829 if (bh) { 673 brelse(bh);
830 brelse(bh); 674 bh = NULL;
831 bh = NULL;
832 }
833 675
834 mlog_exit(status); 676 mlog_exit(status);
835 return status; 677 return status;
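The hunks above keep __ocfs2_extend_allocation()'s restart protocol intact: the allocation step returns -EAGAIN plus a reason, and the caller either extends the running transaction (RESTART_TRANS, handled at restarted_transaction) or drops everything and relocks (RESTART_META, via restart_all). A self-contained model of the retry loop, assuming -EAGAIN is -11 as on Linux; the stub allocator is invented for illustration:

    #include <stdio.h>

    enum alloc_restarted { RESTART_NONE = 0, RESTART_TRANS, RESTART_META };

    /* stub allocator: pretend each transaction has room for 2 clusters */
    static int add_clusters(unsigned int *added, unsigned int wanted,
                            enum alloc_restarted *why)
    {
            unsigned int room = wanted - *added;
            unsigned int n = room > 2 ? 2 : room;

            *added += n;
            if (*added < wanted) {
                    *why = RESTART_TRANS;   /* caller should extend + retry */
                    return -11;             /* -EAGAIN on Linux */
            }
            *why = RESTART_NONE;
            return 0;
    }

    int main(void)
    {
            unsigned int added = 0, wanted = 7;
            enum alloc_restarted why = RESTART_NONE;
            int ret;

            do {
                    ret = add_clusters(&added, wanted, &why);
                    if (ret == -11 && why == RESTART_TRANS)
                            printf("extending transaction, %u/%u done\n",
                                   added, wanted);
            } while (ret == -11 && why == RESTART_TRANS);

            printf("done: ret=%d, allocated %u clusters\n", ret, added);
            return 0;
    }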
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1096 goto bail_unlock; 938 goto bail_unlock;
1097 } 939 }
1098 940
1099 if (i_size_read(inode) > attr->ia_size) 941 if (i_size_read(inode) > attr->ia_size) {
942 if (ocfs2_should_order_data(inode)) {
943 status = ocfs2_begin_ordered_truncate(inode,
944 attr->ia_size);
945 if (status)
946 goto bail_unlock;
947 }
1100 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 948 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1101 else 949 } else
1102 status = ocfs2_extend_file(inode, bh, attr->ia_size); 950 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1103 if (status < 0) { 951 if (status < 0) {
1104 if (status != -ENOSPC) 952 if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
1140 if (size_change) 988 if (size_change)
1141 ocfs2_rw_unlock(inode, 1); 989 ocfs2_rw_unlock(inode, 1);
1142bail: 990bail:
1143 if (bh) 991 brelse(bh);
1144 brelse(bh);
1145 992
1146 mlog_exit(status); 993 mlog_exit(status);
1147 return status; 994 return status;
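Several hunks in this patch, including the one above, collapse the old two-line pattern "if (bh) brelse(bh);" into a bare brelse(bh): like kfree(), brelse() is a no-op on NULL, so unconditional puts are safe on error paths where the buffer was never read. A toy model of that NULL-tolerant release (refcount field and helper name invented):

    #include <stdlib.h>

    struct buffer_head { int refcount; };

    /* NULL-tolerant put, mirroring brelse()'s behaviour */
    static void brelse_model(struct buffer_head *bh)
    {
            if (bh == NULL)
                    return;
            if (--bh->refcount == 0)
                    free(bh);
    }

    int main(void)
    {
            struct buffer_head *bh = NULL;

            /* error path taken before the buffer was ever read: still safe */
            brelse_model(bh);

            bh = malloc(sizeof(*bh));
            if (bh) {
                    bh->refcount = 1;
                    brelse_model(bh);       /* last ref: freed */
            }
            return 0;
    }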
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1284 struct buffer_head *bh = NULL; 1131 struct buffer_head *bh = NULL;
1285 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1132 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1286 1133
1287 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1134 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
1288 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1289 if (ret < 0) { 1135 if (ret < 0) {
1290 mlog_errno(ret); 1136 mlog_errno(ret);
1291 goto out; 1137 goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1311 struct buffer_head *di_bh = NULL; 1157 struct buffer_head *di_bh = NULL;
1312 1158
1313 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1159 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1314 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1160 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
1315 OCFS2_I(inode)->ip_blkno, &di_bh, 1161 &di_bh);
1316 OCFS2_BH_CACHED, inode);
1317 if (ret) { 1162 if (ret) {
1318 mlog_errno(ret); 1163 mlog_errno(ret);
1319 goto out; 1164 goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1394 handle_t *handle; 1239 handle_t *handle;
1395 struct ocfs2_alloc_context *meta_ac = NULL; 1240 struct ocfs2_alloc_context *meta_ac = NULL;
1396 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1241 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1242 struct ocfs2_extent_tree et;
1397 1243
1398 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); 1244 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1245
1246 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1399 if (ret) { 1247 if (ret) {
1400 mlog_errno(ret); 1248 mlog_errno(ret);
1401 return ret; 1249 return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1425 goto out; 1273 goto out;
1426 } 1274 }
1427 1275
1428 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, 1276 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1429 dealloc); 1277 dealloc);
1430 if (ret) { 1278 if (ret) {
1431 mlog_errno(ret); 1279 mlog_errno(ret);
@@ -2040,7 +1888,7 @@ out_dio:
2040 */ 1888 */
2041 if (old_size != i_size_read(inode) || 1889 if (old_size != i_size_read(inode) ||
2042 old_clusters != OCFS2_I(inode)->ip_clusters) { 1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
2043 ret = journal_force_commit(osb->journal->j_journal); 1891 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2044 if (ret < 0) 1892 if (ret < 0)
2045 written = ret; 1893 written = ret;
2046 } 1894 }
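The hunk above switches the post-write commit to jbd2_journal_force_commit() while keeping the same trigger: only force a commit when the O_DIRECT write actually changed i_size or the cluster count. A small model of that decision (struct and field names invented for the sketch):

    #include <stdio.h>
    #include <stdint.h>

    struct inode_model { uint64_t size; uint32_t clusters; };

    /* commit only when the write changed i_size or the cluster count */
    static void maybe_force_commit(const struct inode_model *before,
                                   const struct inode_model *after)
    {
            if (before->size != after->size ||
                before->clusters != after->clusters)
                    printf("forcing journal commit\n");
            else
                    printf("no commit needed\n");
    }

    int main(void)
    {
            struct inode_model old = { 4096, 1 }, cur = { 8192, 2 };

            maybe_force_commit(&old, &cur); /* allocating/extending write */
            maybe_force_commit(&cur, &cur); /* in-place overwrite */
            return 0;
    }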
@@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = {
2227 .setattr = ocfs2_setattr, 2075 .setattr = ocfs2_setattr,
2228 .getattr = ocfs2_getattr, 2076 .getattr = ocfs2_getattr,
2229 .permission = ocfs2_permission, 2077 .permission = ocfs2_permission,
2078 .setxattr = generic_setxattr,
2079 .getxattr = generic_getxattr,
2080 .listxattr = ocfs2_listxattr,
2081 .removexattr = generic_removexattr,
2230 .fallocate = ocfs2_fallocate, 2082 .fallocate = ocfs2_fallocate,
2083 .fiemap = ocfs2_fiemap,
2231}; 2084};
2232 2085
2233const struct inode_operations ocfs2_special_file_iops = { 2086const struct inode_operations ocfs2_special_file_iops = {
@@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
2236 .permission = ocfs2_permission, 2089 .permission = ocfs2_permission,
2237}; 2090};
2238 2091
2092/*
2093 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2094 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2095 */
2239const struct file_operations ocfs2_fops = { 2096const struct file_operations ocfs2_fops = {
2240 .llseek = generic_file_llseek, 2097 .llseek = generic_file_llseek,
2241 .read = do_sync_read, 2098 .read = do_sync_read,
@@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
2250#ifdef CONFIG_COMPAT 2107#ifdef CONFIG_COMPAT
2251 .compat_ioctl = ocfs2_compat_ioctl, 2108 .compat_ioctl = ocfs2_compat_ioctl,
2252#endif 2109#endif
2110 .lock = ocfs2_lock,
2253 .flock = ocfs2_flock, 2111 .flock = ocfs2_flock,
2254 .splice_read = ocfs2_file_splice_read, 2112 .splice_read = ocfs2_file_splice_read,
2255 .splice_write = ocfs2_file_splice_write, 2113 .splice_write = ocfs2_file_splice_write,
@@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
2266#ifdef CONFIG_COMPAT 2124#ifdef CONFIG_COMPAT
2267 .compat_ioctl = ocfs2_compat_ioctl, 2125 .compat_ioctl = ocfs2_compat_ioctl,
2268#endif 2126#endif
2127 .lock = ocfs2_lock,
2128 .flock = ocfs2_flock,
2129};
2130
2131/*
2132 * POSIX-lockless variants of our file_operations.
2133 *
2134 * These will be used if the underlying cluster stack does not support
2135 * posix file locking, if the user passes the "localflocks" mount
2136 * option, or if we have a local-only fs.
2137 *
2138 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2139 * so we still want it in the case of no stack support for
2140 * plocks. Internally, it will do the right thing when asked to ignore
2141 * the cluster.
2142 */
2143const struct file_operations ocfs2_fops_no_plocks = {
2144 .llseek = generic_file_llseek,
2145 .read = do_sync_read,
2146 .write = do_sync_write,
2147 .mmap = ocfs2_mmap,
2148 .fsync = ocfs2_sync_file,
2149 .release = ocfs2_file_release,
2150 .open = ocfs2_file_open,
2151 .aio_read = ocfs2_file_aio_read,
2152 .aio_write = ocfs2_file_aio_write,
2153 .unlocked_ioctl = ocfs2_ioctl,
2154#ifdef CONFIG_COMPAT
2155 .compat_ioctl = ocfs2_compat_ioctl,
2156#endif
2157 .flock = ocfs2_flock,
2158 .splice_read = ocfs2_file_splice_read,
2159 .splice_write = ocfs2_file_splice_write,
2160};
2161
2162const struct file_operations ocfs2_dops_no_plocks = {
2163 .llseek = generic_file_llseek,
2164 .read = generic_read_dir,
2165 .readdir = ocfs2_readdir,
2166 .fsync = ocfs2_sync_file,
2167 .release = ocfs2_dir_release,
2168 .open = ocfs2_dir_open,
2169 .unlocked_ioctl = ocfs2_ioctl,
2170#ifdef CONFIG_COMPAT
2171 .compat_ioctl = ocfs2_compat_ioctl,
2172#endif
2269 .flock = ocfs2_flock, 2173 .flock = ocfs2_flock,
2270}; 2174};
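With two file_operations variants per object type, some caller has to pick between them at inode setup; the ocfs2_populate_inode() hunk further down does exactly that. A compact model of the selection predicate, assuming the three inputs shown there (localflocks mount option, local mount, stack plock support):

    #include <stdio.h>

    struct file_operations { const char *label; };

    static const struct file_operations fops_plocks = { "plocks" };
    static const struct file_operations fops_no_plocks = { "no-plocks" };

    /* mirrors the use_plocks decision in ocfs2_populate_inode() */
    static const struct file_operations *
    pick_fops(int localflocks_opt, int mount_local, int stack_plocks)
    {
            if (localflocks_opt || mount_local || !stack_plocks)
                    return &fops_no_plocks;
            return &fops_plocks;
    }

    int main(void)
    {
            printf("%s\n", pick_fops(0, 0, 1)->label);      /* plocks */
            printf("%s\n", pick_fops(1, 0, 1)->label);      /* no-plocks */
            printf("%s\n", pick_fops(0, 0, 0)->label);      /* no-plocks */
            return 0;
    }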
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017ea..e92382cbca5f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
28 28
29extern const struct file_operations ocfs2_fops; 29extern const struct file_operations ocfs2_fops;
30extern const struct file_operations ocfs2_dops; 30extern const struct file_operations ocfs2_dops;
31extern const struct file_operations ocfs2_fops_no_plocks;
32extern const struct file_operations ocfs2_dops_no_plocks;
31extern const struct inode_operations ocfs2_file_iops; 33extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 34extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 35struct ocfs2_alloc_context;
36enum ocfs2_alloc_restarted;
34 37
35struct ocfs2_file_private { 38struct ocfs2_file_private {
36 struct file *fp_file; 39 struct file *fp_file;
@@ -38,27 +41,18 @@ struct ocfs2_file_private {
38 struct ocfs2_lock_res fp_flock; 41 struct ocfs2_lock_res fp_flock;
39}; 42};
40 43
41enum ocfs2_alloc_restarted { 44int ocfs2_add_inode_data(struct ocfs2_super *osb,
42 RESTART_NONE = 0, 45 struct inode *inode,
43 RESTART_TRANS, 46 u32 *logical_offset,
44 RESTART_META 47 u32 clusters_to_add,
45}; 48 int mark_unwritten,
46int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 49 struct buffer_head *fe_bh,
47 struct inode *inode, 50 handle_t *handle,
48 u32 *logical_offset, 51 struct ocfs2_alloc_context *data_ac,
49 u32 clusters_to_add, 52 struct ocfs2_alloc_context *meta_ac,
50 int mark_unwritten, 53 enum ocfs2_alloc_restarted *reason_ret);
51 struct buffer_head *fe_bh,
52 handle_t *handle,
53 struct ocfs2_alloc_context *data_ac,
54 struct ocfs2_alloc_context *meta_ac,
55 enum ocfs2_alloc_restarted *reason_ret);
56int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
57 u64 zero_to); 55 u64 zero_to);
58int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
59 u32 clusters_to_add, u32 extents_to_split,
60 struct ocfs2_alloc_context **data_ac,
61 struct ocfs2_alloc_context **meta_ac);
62int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
63int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 57int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
64 struct kstat *stat); 58 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..4903688f72a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "xattr.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
219 struct super_block *sb; 220 struct super_block *sb;
220 struct ocfs2_super *osb; 221 struct ocfs2_super *osb;
221 int status = -EINVAL; 222 int status = -EINVAL;
223 int use_plocks = 1;
222 224
223 mlog_entry("(0x%p, size:%llu)\n", inode, 225 mlog_entry("(0x%p, size:%llu)\n", inode,
224 (unsigned long long)le64_to_cpu(fe->i_size)); 226 (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
226 sb = inode->i_sb; 228 sb = inode->i_sb;
227 osb = OCFS2_SB(sb); 229 osb = OCFS2_SB(sb);
228 230
231 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0;
234
229 /* this means that read_inode cannot create a superblock inode 235 /* this means that read_inode cannot create a superblock inode
230 * today. change if needed. */ 236 * today. change if needed. */
231 if (!OCFS2_IS_VALID_DINODE(fe) || 237 if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
295 301
296 switch (inode->i_mode & S_IFMT) { 302 switch (inode->i_mode & S_IFMT) {
297 case S_IFREG: 303 case S_IFREG:
298 inode->i_fop = &ocfs2_fops; 304 if (use_plocks)
305 inode->i_fop = &ocfs2_fops;
306 else
307 inode->i_fop = &ocfs2_fops_no_plocks;
299 inode->i_op = &ocfs2_file_iops; 308 inode->i_op = &ocfs2_file_iops;
300 i_size_write(inode, le64_to_cpu(fe->i_size)); 309 i_size_write(inode, le64_to_cpu(fe->i_size));
301 break; 310 break;
302 case S_IFDIR: 311 case S_IFDIR:
303 inode->i_op = &ocfs2_dir_iops; 312 inode->i_op = &ocfs2_dir_iops;
304 inode->i_fop = &ocfs2_dops; 313 if (use_plocks)
314 inode->i_fop = &ocfs2_dops;
315 else
316 inode->i_fop = &ocfs2_dops_no_plocks;
305 i_size_write(inode, le64_to_cpu(fe->i_size)); 317 i_size_write(inode, le64_to_cpu(fe->i_size));
306 break; 318 break;
307 case S_IFLNK: 319 case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
448 } 460 }
449 } 461 }
450 462
451 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 463 if (can_lock)
452 can_lock ? inode : NULL); 464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
465 OCFS2_BH_IGNORE_CACHE);
466 else
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
453 if (status < 0) { 468 if (status < 0) {
454 mlog_errno(status); 469 mlog_errno(status);
455 goto bail; 470 goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
522 * data and fast symlinks. 537 * data and fast symlinks.
523 */ 538 */
524 if (fe->i_clusters) { 539 if (fe->i_clusters) {
540 if (ocfs2_should_order_data(inode))
541 ocfs2_begin_ordered_truncate(inode, 0);
542
525 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
526 if (IS_ERR(handle)) { 544 if (IS_ERR(handle)) {
527 status = PTR_ERR(handle); 545 status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
730 goto bail_unlock_dir; 748 goto bail_unlock_dir;
731 } 749 }
732 750
751 /*Free extended attribute resources associated with this inode.*/
752 status = ocfs2_xattr_remove(inode, di_bh);
753 if (status < 0) {
754 mlog_errno(status);
755 goto bail_unlock_dir;
756 }
757
733 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 758 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
734 orphan_dir_bh); 759 orphan_dir_bh);
735 if (status < 0) 760 if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
1081 oi->ip_last_trans = 0; 1106 oi->ip_last_trans = 0;
1082 oi->ip_dir_start_lookup = 0; 1107 oi->ip_dir_start_lookup = 0;
1083 oi->ip_blkno = 0ULL; 1108 oi->ip_blkno = 0ULL;
1109 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1110 &oi->ip_jinode);
1084 1111
1085bail: 1112bail:
1086 mlog_exit_void(); 1113 mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
1107} 1134}
1108 1135
1109/* 1136/*
1110 * TODO: this should probably be merged into ocfs2_get_block
1111 *
1112 * However, you now need to pay attention to the cont_prepare_write()
1113 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
1114 * expects never to extend).
1115 */
1116struct buffer_head *ocfs2_bread(struct inode *inode,
1117 int block, int *err, int reada)
1118{
1119 struct buffer_head *bh = NULL;
1120 int tmperr;
1121 u64 p_blkno;
1122 int readflags = OCFS2_BH_CACHED;
1123
1124 if (reada)
1125 readflags |= OCFS2_BH_READAHEAD;
1126
1127 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
1128 i_size_read(inode)) {
1129 BUG_ON(!reada);
1130 return NULL;
1131 }
1132
1133 down_read(&OCFS2_I(inode)->ip_alloc_sem);
1134 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1135 NULL);
1136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
1137 if (tmperr < 0) {
1138 mlog_errno(tmperr);
1139 goto fail;
1140 }
1141
1142 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1143 readflags, inode);
1144 if (tmperr < 0)
1145 goto fail;
1146
1147 tmperr = 0;
1148
1149 *err = 0;
1150 return bh;
1151
1152fail:
1153 if (bh) {
1154 brelse(bh);
1155 bh = NULL;
1156 }
1157 *err = -EIO;
1158 return NULL;
1159}
1160
1161/*
1162 * This is called from our getattr. 1137 * This is called from our getattr.
1163 */ 1138 */
1164int ocfs2_inode_revalidate(struct dentry *dentry) 1139int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa0..2f37af9bcc4a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
40 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
41 struct rw_semaphore ip_alloc_sem; 41 struct rw_semaphore ip_alloc_sem;
42 42
43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem;
45
43 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
44 spinlock_t ip_lock; 47 spinlock_t ip_lock;
45 u32 ip_open_count; 48 u32 ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
68 struct ocfs2_extent_map ip_extent_map; 71 struct ocfs2_extent_map ip_extent_map;
69 72
70 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode;
71}; 75};
72 76
73/* 77/*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
113 117
114extern const struct address_space_operations ocfs2_aops; 118extern const struct address_space_operations ocfs2_aops;
115 119
116struct buffer_head *ocfs2_bread(struct inode *inode, int block,
117 int *err, int reada);
118void ocfs2_clear_inode(struct inode *inode); 120void ocfs2_clear_inode(struct inode *inode);
119void ocfs2_delete_inode(struct inode *inode); 121void ocfs2_delete_inode(struct inode *inode);
120void ocfs2_drop_inode(struct inode *inode); 122void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce995..9fcd36dcc9a0 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
102bail: 102bail:
103 mutex_unlock(&inode->i_mutex); 103 mutex_unlock(&inode->i_mutex);
104 104
105 if (bh) 105 brelse(bh);
106 brelse(bh);
107 106
108 mlog_exit(status); 107 mlog_exit(status);
109 return status; 108 return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7a37240f7a31..81e40677eecb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
215 goto finally; 215 goto finally;
216 } 216 }
217 217
218 journal_lock_updates(journal->j_journal); 218 jbd2_journal_lock_updates(journal->j_journal);
219 status = journal_flush(journal->j_journal); 219 status = jbd2_journal_flush(journal->j_journal);
220 journal_unlock_updates(journal->j_journal); 220 jbd2_journal_unlock_updates(journal->j_journal);
221 if (status < 0) { 221 if (status < 0) {
222 up_write(&journal->j_trans_barrier); 222 up_write(&journal->j_trans_barrier);
223 mlog_errno(status); 223 mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
264 264
265 down_read(&osb->journal->j_trans_barrier); 265 down_read(&osb->journal->j_trans_barrier);
266 266
267 handle = journal_start(journal, max_buffs); 267 handle = jbd2_journal_start(journal, max_buffs);
268 if (IS_ERR(handle)) { 268 if (IS_ERR(handle)) {
269 up_read(&osb->journal->j_trans_barrier); 269 up_read(&osb->journal->j_trans_barrier);
270 270
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
290 290
291 BUG_ON(!handle); 291 BUG_ON(!handle);
292 292
293 ret = journal_stop(handle); 293 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 294 if (ret < 0)
295 mlog_errno(ret); 295 mlog_errno(ret);
296 296
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
304 * transaction. extend_trans will either extend the current handle by 304 * transaction. extend_trans will either extend the current handle by
305 * nblocks, or commit it and start a new one with nblocks credits. 305 * nblocks, or commit it and start a new one with nblocks credits.
306 * 306 *
307 * This might call journal_restart() which will commit dirty buffers 307 * This might call jbd2_journal_restart() which will commit dirty buffers
308 * and then restart the transaction. Before calling 308 * and then restart the transaction. Before calling
309 * ocfs2_extend_trans(), any changed blocks should have been 309 * ocfs2_extend_trans(), any changed blocks should have been
310 * dirtied. After calling it, all blocks which need to be changed must 310 * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
332#ifdef CONFIG_OCFS2_DEBUG_FS 332#ifdef CONFIG_OCFS2_DEBUG_FS
333 status = 1; 333 status = 1;
334#else 334#else
335 status = journal_extend(handle, nblocks); 335 status = jbd2_journal_extend(handle, nblocks);
336 if (status < 0) { 336 if (status < 0) {
337 mlog_errno(status); 337 mlog_errno(status);
338 goto bail; 338 goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
340#endif 340#endif
341 341
342 if (status > 0) { 342 if (status > 0) {
343 mlog(0, "journal_extend failed, trying journal_restart\n"); 343 mlog(0,
344 status = journal_restart(handle, nblocks); 344 "jbd2_journal_extend failed, trying "
345 "jbd2_journal_restart\n");
346 status = jbd2_journal_restart(handle, nblocks);
345 if (status < 0) { 347 if (status < 0) {
346 mlog_errno(status); 348 mlog_errno(status);
347 goto bail; 349 goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
393 switch (type) { 395 switch (type) {
394 case OCFS2_JOURNAL_ACCESS_CREATE: 396 case OCFS2_JOURNAL_ACCESS_CREATE:
395 case OCFS2_JOURNAL_ACCESS_WRITE: 397 case OCFS2_JOURNAL_ACCESS_WRITE:
396 status = journal_get_write_access(handle, bh); 398 status = jbd2_journal_get_write_access(handle, bh);
397 break; 399 break;
398 400
399 case OCFS2_JOURNAL_ACCESS_UNDO: 401 case OCFS2_JOURNAL_ACCESS_UNDO:
400 status = journal_get_undo_access(handle, bh); 402 status = jbd2_journal_get_undo_access(handle, bh);
401 break; 403 break;
402 404
403 default: 405 default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
422 mlog_entry("(bh->b_blocknr=%llu)\n", 424 mlog_entry("(bh->b_blocknr=%llu)\n",
423 (unsigned long long)bh->b_blocknr); 425 (unsigned long long)bh->b_blocknr);
424 426
425 status = journal_dirty_metadata(handle, bh); 427 status = jbd2_journal_dirty_metadata(handle, bh);
426 if (status < 0) 428 if (status < 0)
427 mlog(ML_ERROR, "Could not dirty metadata buffer. " 429 mlog(ML_ERROR, "Could not dirty metadata buffer. "
428 "(bh->b_blocknr=%llu)\n", 430 "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
432 return status; 434 return status;
433} 435}
434 436
437#ifdef CONFIG_OCFS2_COMPAT_JBD
435int ocfs2_journal_dirty_data(handle_t *handle, 438int ocfs2_journal_dirty_data(handle_t *handle,
436 struct buffer_head *bh) 439 struct buffer_head *bh)
437{ 440{
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
443 446
444 return err; 447 return err;
445} 448}
449#endif
446 450
447#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) 451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
448 452
449void ocfs2_set_journal_params(struct ocfs2_super *osb) 453void ocfs2_set_journal_params(struct ocfs2_super *osb)
450{ 454{
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
457 spin_lock(&journal->j_state_lock); 461 spin_lock(&journal->j_state_lock);
458 journal->j_commit_interval = commit_interval; 462 journal->j_commit_interval = commit_interval;
459 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 463 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
460 journal->j_flags |= JFS_BARRIER; 464 journal->j_flags |= JBD2_BARRIER;
461 else 465 else
462 journal->j_flags &= ~JFS_BARRIER; 466 journal->j_flags &= ~JBD2_BARRIER;
463 spin_unlock(&journal->j_state_lock); 467 spin_unlock(&journal->j_state_lock);
464} 468}
465 469
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
524 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); 528 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
525 529
526 /* call the kernels journal init function now */ 530 /* call the kernels journal init function now */
527 j_journal = journal_init_inode(inode); 531 j_journal = jbd2_journal_init_inode(inode);
528 if (j_journal == NULL) { 532 if (j_journal == NULL) {
529 mlog(ML_ERROR, "Linux journal layer error\n"); 533 mlog(ML_ERROR, "Linux journal layer error\n");
530 status = -EINVAL; 534 status = -EINVAL;
531 goto done; 535 goto done;
532 } 536 }
533 537
534 mlog(0, "Returned from journal_init_inode\n"); 538 mlog(0, "Returned from jbd2_journal_init_inode\n");
535 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); 539 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
536 540
537 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 541 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
550 if (status < 0) { 554 if (status < 0) {
551 if (inode_lock) 555 if (inode_lock)
552 ocfs2_inode_unlock(inode, 1); 556 ocfs2_inode_unlock(inode, 1);
553 if (bh != NULL) 557 brelse(bh);
554 brelse(bh);
555 if (inode) { 558 if (inode) {
556 OCFS2_I(inode)->ip_open_count--; 559 OCFS2_I(inode)->ip_open_count--;
557 iput(inode); 560 iput(inode);
@@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
639 if (journal->j_state != OCFS2_JOURNAL_LOADED) 642 if (journal->j_state != OCFS2_JOURNAL_LOADED)
640 goto done; 643 goto done;
641 644
642 /* need to inc inode use count as journal_destroy will iput. */ 645 /* need to inc inode use count - jbd2_journal_destroy will iput. */
643 if (!igrab(inode)) 646 if (!igrab(inode))
644 BUG(); 647 BUG();
645 648
@@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
668 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); 671 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
669 672
670 if (ocfs2_mount_local(osb)) { 673 if (ocfs2_mount_local(osb)) {
671 journal_lock_updates(journal->j_journal); 674 jbd2_journal_lock_updates(journal->j_journal);
672 status = journal_flush(journal->j_journal); 675 status = jbd2_journal_flush(journal->j_journal);
673 journal_unlock_updates(journal->j_journal); 676 jbd2_journal_unlock_updates(journal->j_journal);
674 if (status < 0) 677 if (status < 0)
675 mlog_errno(status); 678 mlog_errno(status);
676 } 679 }
@@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
686 } 689 }
687 690
688 /* Shutdown the kernel journal system */ 691 /* Shutdown the kernel journal system */
689 journal_destroy(journal->j_journal); 692 jbd2_journal_destroy(journal->j_journal);
690 693
691 OCFS2_I(inode)->ip_open_count--; 694 OCFS2_I(inode)->ip_open_count--;
692 695
@@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
711{ 714{
712 int olderr; 715 int olderr;
713 716
714 olderr = journal_errno(journal); 717 olderr = jbd2_journal_errno(journal);
715 if (olderr) { 718 if (olderr) {
716 mlog(ML_ERROR, "File system error %d recorded in " 719 mlog(ML_ERROR, "File system error %d recorded in "
717 "journal %u.\n", olderr, slot); 720 "journal %u.\n", olderr, slot);
718 mlog(ML_ERROR, "File system on device %s needs checking.\n", 721 mlog(ML_ERROR, "File system on device %s needs checking.\n",
719 sb->s_id); 722 sb->s_id);
720 723
721 journal_ack_err(journal); 724 jbd2_journal_ack_err(journal);
722 journal_clear_err(journal); 725 jbd2_journal_clear_err(journal);
723 } 726 }
724} 727}
725 728
@@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
734 737
735 osb = journal->j_osb; 738 osb = journal->j_osb;
736 739
737 status = journal_load(journal->j_journal); 740 status = jbd2_journal_load(journal->j_journal);
738 if (status < 0) { 741 if (status < 0) {
739 mlog(ML_ERROR, "Failed to load journal!\n"); 742 mlog(ML_ERROR, "Failed to load journal!\n");
740 goto done; 743 goto done;
@@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
778 781
779 BUG_ON(!journal); 782 BUG_ON(!journal);
780 783
781 status = journal_wipe(journal->j_journal, full); 784 status = jbd2_journal_wipe(journal->j_journal, full);
782 if (status < 0) { 785 if (status < 0) {
783 mlog_errno(status); 786 mlog_errno(status);
784 goto bail; 787 goto bail;
@@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
847 850
848 /* We are reading journal data which should not 851 /* We are reading journal data which should not
849 * be put in the uptodate cache */ 852 * be put in the uptodate cache */
850 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), 853 status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
851 p_blkno, p_blocks, bhs, 0, 854 p_blkno, p_blocks, bhs);
852 NULL);
853 if (status < 0) { 855 if (status < 0) {
854 mlog_errno(status); 856 mlog_errno(status);
855 goto bail; 857 goto bail;
@@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
865 867
866bail: 868bail:
867 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 869 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
868 if (bhs[i]) 870 brelse(bhs[i]);
869 brelse(bhs[i]);
870 mlog_exit(status); 871 mlog_exit(status);
871 return status; 872 return status;
872} 873}
@@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1133 } 1134 }
1134 SET_INODE_JOURNAL(inode); 1135 SET_INODE_JOURNAL(inode);
1135 1136
1136 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); 1137 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
1138 OCFS2_BH_IGNORE_CACHE);
1137 if (status < 0) { 1139 if (status < 0) {
1138 mlog_errno(status); 1140 mlog_errno(status);
1139 goto bail; 1141 goto bail;
@@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1229 } 1231 }
1230 1232
1231 mlog(0, "calling journal_init_inode\n"); 1233 mlog(0, "calling journal_init_inode\n");
1232 journal = journal_init_inode(inode); 1234 journal = jbd2_journal_init_inode(inode);
1233 if (journal == NULL) { 1235 if (journal == NULL) {
1234 mlog(ML_ERROR, "Linux journal layer error\n"); 1236 mlog(ML_ERROR, "Linux journal layer error\n");
1235 status = -EIO; 1237 status = -EIO;
1236 goto done; 1238 goto done;
1237 } 1239 }
1238 1240
1239 status = journal_load(journal); 1241 status = jbd2_journal_load(journal);
1240 if (status < 0) { 1242 if (status < 0) {
1241 mlog_errno(status); 1243 mlog_errno(status);
1242 if (!igrab(inode)) 1244 if (!igrab(inode))
1243 BUG(); 1245 BUG();
1244 journal_destroy(journal); 1246 jbd2_journal_destroy(journal);
1245 goto done; 1247 goto done;
1246 } 1248 }
1247 1249
@@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1249 1251
1250 /* wipe the journal */ 1252 /* wipe the journal */
1251 mlog(0, "flushing the journal.\n"); 1253 mlog(0, "flushing the journal.\n");
1252 journal_lock_updates(journal); 1254 jbd2_journal_lock_updates(journal);
1253 status = journal_flush(journal); 1255 status = jbd2_journal_flush(journal);
1254 journal_unlock_updates(journal); 1256 jbd2_journal_unlock_updates(journal);
1255 if (status < 0) 1257 if (status < 0)
1256 mlog_errno(status); 1258 mlog_errno(status);
1257 1259
@@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1272 if (!igrab(inode)) 1274 if (!igrab(inode))
1273 BUG(); 1275 BUG();
1274 1276
1275 journal_destroy(journal); 1277 jbd2_journal_destroy(journal);
1276 1278
1277done: 1279done:
1278 /* drop the lock on this nodes journal */ 1280 /* drop the lock on this nodes journal */
@@ -1282,8 +1284,7 @@ done:
1282 if (inode) 1284 if (inode)
1283 iput(inode); 1285 iput(inode);
1284 1286
1285 if (bh) 1287 brelse(bh);
1286 brelse(bh);
1287 1288
1288 mlog_exit(status); 1289 mlog_exit(status);
1289 return status; 1290 return status;
@@ -1418,13 +1419,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1418{ 1419{
1419 unsigned int node_num; 1420 unsigned int node_num;
1420 int status, i; 1421 int status, i;
1422 u32 gen;
1421 struct buffer_head *bh = NULL; 1423 struct buffer_head *bh = NULL;
1422 struct ocfs2_dinode *di; 1424 struct ocfs2_dinode *di;
1423 1425
1424 /* This is called with the super block cluster lock, so we 1426 /* This is called with the super block cluster lock, so we
1425 * know that the slot map can't change underneath us. */ 1427 * know that the slot map can't change underneath us. */
1426 1428
1427 spin_lock(&osb->osb_lock);
1428 for (i = 0; i < osb->max_slots; i++) { 1429 for (i = 0; i < osb->max_slots; i++) {
1429 /* Read journal inode to get the recovery generation */ 1430 /* Read journal inode to get the recovery generation */
1430 status = ocfs2_read_journal_inode(osb, i, &bh, NULL); 1431 status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
@@ -1433,23 +1434,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1433 goto bail; 1434 goto bail;
1434 } 1435 }
1435 di = (struct ocfs2_dinode *)bh->b_data; 1436 di = (struct ocfs2_dinode *)bh->b_data;
1436 osb->slot_recovery_generations[i] = 1437 gen = ocfs2_get_recovery_generation(di);
1437 ocfs2_get_recovery_generation(di);
1438 brelse(bh); 1438 brelse(bh);
1439 bh = NULL; 1439 bh = NULL;
1440 1440
1441 spin_lock(&osb->osb_lock);
1442 osb->slot_recovery_generations[i] = gen;
1443
1441 mlog(0, "Slot %u recovery generation is %u\n", i, 1444 mlog(0, "Slot %u recovery generation is %u\n", i,
1442 osb->slot_recovery_generations[i]); 1445 osb->slot_recovery_generations[i]);
1443 1446
1444 if (i == osb->slot_num) 1447 if (i == osb->slot_num) {
1448 spin_unlock(&osb->osb_lock);
1445 continue; 1449 continue;
1450 }
1446 1451
1447 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); 1452 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1448 if (status == -ENOENT) 1453 if (status == -ENOENT) {
1454 spin_unlock(&osb->osb_lock);
1449 continue; 1455 continue;
1456 }
1450 1457
1451 if (__ocfs2_recovery_map_test(osb, node_num)) 1458 if (__ocfs2_recovery_map_test(osb, node_num)) {
1459 spin_unlock(&osb->osb_lock);
1452 continue; 1460 continue;
1461 }
1453 spin_unlock(&osb->osb_lock); 1462 spin_unlock(&osb->osb_lock);
1454 1463
1455 /* Ok, we have a slot occupied by another node which 1464 /* Ok, we have a slot occupied by another node which
@@ -1465,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1465 mlog_errno(status); 1474 mlog_errno(status);
1466 goto bail; 1475 goto bail;
1467 } 1476 }
1468
1469 spin_lock(&osb->osb_lock);
1470 } 1477 }
1471 spin_unlock(&osb->osb_lock);
1472 1478
1473 status = 0; 1479 status = 0;
1474bail: 1480bail:
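The ocfs2_mark_dead_nodes() hunk above narrows the osb_lock critical section: instead of holding the spinlock across ocfs2_read_journal_inode(), which can block on I/O, the lock is now taken per slot just around the shared-state update. A userspace model of that narrowing (names and the fake disk read are invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_spinlock_t osb_lock;
    static unsigned int recovery_gen[4];

    /* stand-in for the journal-inode read, which can block */
    static unsigned int read_gen_from_disk(int slot)
    {
            return 100u + slot;
    }

    int main(void)
    {
            pthread_spin_init(&osb_lock, PTHREAD_PROCESS_PRIVATE);

            for (int slot = 0; slot < 4; slot++) {
                    /* do the blocking work with the lock dropped... */
                    unsigned int gen = read_gen_from_disk(slot);

                    /* ...then take it only around the shared update */
                    pthread_spin_lock(&osb_lock);
                    recovery_gen[slot] = gen;
                    pthread_spin_unlock(&osb_lock);

                    printf("slot %d: generation %u\n", slot, gen);
            }
            pthread_spin_destroy(&osb_lock);
            return 0;
    }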
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05f..d4d14e9a3cea 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/jbd.h> 30#ifndef CONFIG_OCFS2_COMPAT_JBD
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
31 36
32enum ocfs2_journal_state { 37enum ocfs2_journal_state {
33 OCFS2_JOURNAL_FREE = 0, 38 OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
215 * buffer. Will have to call ocfs2_journal_dirty once 220 * buffer. Will have to call ocfs2_journal_dirty once
216 * we've actually dirtied it. Type is one of . or . 221 * we've actually dirtied it. Type is one of . or .
217 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
218 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before 223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
219 * the current handle commits. 224 * the current handle commits.
220 */ 225 */
221 226
222/* You must always start_trans with a number of buffs > 0, but it's 227/* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
268 */ 273 */
269int ocfs2_journal_dirty(handle_t *handle, 274int ocfs2_journal_dirty(handle_t *handle,
270 struct buffer_head *bh); 275 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
271int ocfs2_journal_dirty_data(handle_t *handle, 277int ocfs2_journal_dirty_data(handle_t *handle,
272 struct buffer_head *bh); 278 struct buffer_head *bh);
279#endif
273 280
274/* 281/*
275 * Credit Macros: 282 * Credit Macros:
@@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
283/* simple file updates like chmod, etc. */ 290/* simple file updates like chmod, etc. */
284#define OCFS2_INODE_UPDATE_CREDITS 1 291#define OCFS2_INODE_UPDATE_CREDITS 1
285 292
293/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295
286/* group extend. inode update and last group update. */ 296/* group extend. inode update and last group update. */
287#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
288 298
@@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
340#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
341 + OCFS2_UNLINK_CREDITS) 351 + OCFS2_UNLINK_CREDITS)
342 352
353/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group,
355 * dinode, xattr block */
356#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
357 + OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
359
360/*
361 * Please note that the caller must make sure that root_el is the root
362 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
363 * the result may be wrong.
364 */
343static inline int ocfs2_calc_extend_credits(struct super_block *sb, 365static inline int ocfs2_calc_extend_credits(struct super_block *sb,
344 struct ocfs2_dinode *fe, 366 struct ocfs2_extent_list *root_el,
345 u32 bits_wanted) 367 u32 bits_wanted)
346{ 368{
347 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; 369 int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
348 370
349 /* bitmap dinode, group desc. + relinked group. */ 371 /* bitmap dinode, group desc. + relinked group. */
350 bitmap_blocks = OCFS2_SUBALLOC_ALLOC; 372 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
355 * however many metadata chunks needed * a remaining suballoc 377 * however many metadata chunks needed * a remaining suballoc
356 * alloc. */ 378 * alloc. */
357 sysfile_bitmap_blocks = 1 + 379 sysfile_bitmap_blocks = 1 +
358 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); 380 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
359 381
360 /* this does not include *new* metadata blocks, which are 382 /* this does not include *new* metadata blocks, which are
361 * accounted for in sysfile_bitmap_blocks. fe + 383 * accounted for in sysfile_bitmap_blocks. root_el +
362 * prev. last_eb_blk + blocks along edge of tree. 384 * prev. last_eb_blk + blocks along edge of tree.
363 * calc_symlink_credits passes because we just need 1 385 * calc_symlink_credits passes because we just need 1
364 * credit for the dinode there. */ 386 * credit for the dinode there. */
365 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); 387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
366 388
367 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; 389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
368} 390}
369 391
370static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 392static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
415 return credits; 437 return credits;
416} 438}
417 439
440static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
441{
442 return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
443}
444
445static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
446 loff_t new_size)
447{
448 return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
449 new_size);
450}
451
418#endif /* OCFS2_JOURNAL_H */ 452#endif /* OCFS2_JOURNAL_H */
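ocfs2_calc_extend_credits() now takes the extent tree's root extent list instead of the whole dinode, but the arithmetic is unchanged: suballocator credits, sysfile bitmap credits scaled by how many metadata chunks may be needed, and one credit per block along the tree edge. A standalone sketch of the formula (OCFS2_SUBALLOC_ALLOC's value and the sample inputs are illustrative; meta_chunks_needed stands in for ocfs2_extend_meta_needed(root_el)):

    #include <stdio.h>

    #define OCFS2_SUBALLOC_ALLOC 3  /* illustrative value, not from this diff */

    /* tree_depth models le16_to_cpu(root_el->l_tree_depth) */
    static int calc_extend_credits(int tree_depth, int meta_chunks_needed)
    {
            int bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
            int sysfile_bitmap_blocks =
                    1 + (OCFS2_SUBALLOC_ALLOC - 1) * meta_chunks_needed;
            int extent_blocks = 1 + 1 + tree_depth;

            return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
    }

    int main(void)
    {
            printf("depth 0, no new meta: %d credits\n",
                   calc_extend_credits(0, 0));
            printf("depth 2, one meta chunk: %d credits\n",
                   calc_extend_credits(2, 1));
            return 0;
    }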
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec88..687b28713c32 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
31 32
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC 33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
@@ -47,8 +48,6 @@
47 48
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) 49#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49 50
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); 51static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53 52
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 77#ifdef CONFIG_OCFS2_FS_STATS
78
79static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
80{
81 file->private_data = inode->i_private;
82 return 0;
83}
84
85#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
86#define LA_DEBUG_VER 1
87static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
88 size_t count, loff_t *ppos)
89{
90 static DEFINE_MUTEX(la_debug_mutex);
91 struct ocfs2_super *osb = file->private_data;
92 int written, ret;
93 char *buf = osb->local_alloc_debug_buf;
94
95 mutex_lock(&la_debug_mutex);
96 memset(buf, 0, LA_DEBUG_BUF_SZ);
97
98 written = snprintf(buf, LA_DEBUG_BUF_SZ,
99 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
100 LA_DEBUG_VER,
101 (unsigned long long)osb->la_last_gd,
102 osb->local_alloc_default_bits,
103 osb->local_alloc_bits, osb->local_alloc_state);
104
105 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
106
107 mutex_unlock(&la_debug_mutex);
108 return ret;
109}
110
111static const struct file_operations ocfs2_la_debug_fops = {
112 .open = ocfs2_la_debug_open,
113 .read = ocfs2_la_debug_read,
114};
115
116static void ocfs2_init_la_debug(struct ocfs2_super *osb)
117{
118 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
119 if (!osb->local_alloc_debug_buf)
120 return;
121
122 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
123 S_IFREG|S_IRUSR,
124 osb->osb_debug_root,
125 osb,
126 &ocfs2_la_debug_fops);
127 if (!osb->local_alloc_debug) {
128 kfree(osb->local_alloc_debug_buf);
129 osb->local_alloc_debug_buf = NULL;
130 }
131}
132
133static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
134{
135 if (osb->local_alloc_debug)
136 debugfs_remove(osb->local_alloc_debug);
137
138 if (osb->local_alloc_debug_buf)
139 kfree(osb->local_alloc_debug_buf);
140
141 osb->local_alloc_debug_buf = NULL;
142 osb->local_alloc_debug = NULL;
143}
144#else /* CONFIG_OCFS2_FS_STATS */
145static void ocfs2_init_la_debug(struct ocfs2_super *osb)
146{
147 return;
148}
149static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
150{
151 return;
152}
153#endif
154
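When CONFIG_OCFS2_FS_STATS is enabled, reading the new local_alloc_stats debugfs file returns a single record in the snprintf format above: version, la_last_gd, default window bits, current window bits, and state, tab-separated. A hypothetical read (the path under osb_debug_root and all field values are invented for illustration) might return:

    0x1	0x0	1024	512	0x2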
155static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
79{ 156{
80 BUG_ON(osb->s_clustersize_bits > 20); 157 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
158 osb->local_alloc_state == OCFS2_LA_ENABLED);
159}
81 160
82 /* Size local alloc windows by the megabyte */ 161void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits); 162 unsigned int num_clusters)
163{
164 spin_lock(&osb->osb_lock);
165 if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
166 osb->local_alloc_state == OCFS2_LA_THROTTLED)
167 if (num_clusters >= osb->local_alloc_default_bits) {
168 cancel_delayed_work(&osb->la_enable_wq);
169 osb->local_alloc_state = OCFS2_LA_ENABLED;
170 }
171 spin_unlock(&osb->osb_lock);
172}
173
174void ocfs2_la_enable_worker(struct work_struct *work)
175{
176 struct ocfs2_super *osb =
177 container_of(work, struct ocfs2_super,
178 la_enable_wq.work);
179 spin_lock(&osb->osb_lock);
180 osb->local_alloc_state = OCFS2_LA_ENABLED;
181 spin_unlock(&osb->osb_lock);
84} 182}
85 183
86/* 184/*
87 * Tell us whether a given allocation should use the local alloc 185 * Tell us whether a given allocation should use the local alloc
88 * file. Otherwise, it has to go to the main bitmap. 186 * file. Otherwise, it has to go to the main bitmap.
187 *
188 * This function does semi-dirty reads of local alloc size and state!
189 * This is ok however, as the values are re-checked once under mutex.
89 */ 190 */
90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 191int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
91{ 192{
92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0; 193 int ret = 0;
194 int la_bits;
195
196 spin_lock(&osb->osb_lock);
197 la_bits = osb->local_alloc_bits;
94 198
95 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 199 if (!ocfs2_la_state_enabled(osb))
96 goto bail; 200 goto bail;
97 201
98 /* la_bits should be at least twice the size (in clusters) of 202 /* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
106bail: 210bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", 211 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); 212 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
213 spin_unlock(&osb->osb_lock);
109 return ret; 214 return ret;
110} 215}
111 216
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
120 225
121 mlog_entry_void(); 226 mlog_entry_void();
122 227
123 if (osb->local_alloc_size == 0) 228 ocfs2_init_la_debug(osb);
229
230 if (osb->local_alloc_bits == 0)
124 goto bail; 231 goto bail;
125 232
126 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { 233 if (osb->local_alloc_bits >= osb->bitmap_cpg) {
127 mlog(ML_NOTICE, "Requested local alloc window %d is larger " 234 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
128 "than max possible %u. Using defaults.\n", 235 "than max possible %u. Using defaults.\n",
129 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); 236 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
130 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 237 osb->local_alloc_bits =
238 ocfs2_megabytes_to_clusters(osb->sb,
239 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
131 } 240 }
132 241
133 /* read the alloc off disk */ 242 /* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
139 goto bail; 248 goto bail;
140 } 249 }
141 250
142 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
143 &alloc_bh, 0, inode); 252 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
144 if (status < 0) { 253 if (status < 0) {
145 mlog_errno(status); 254 mlog_errno(status);
146 goto bail; 255 goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
185 294
186bail: 295bail:
187 if (status < 0) 296 if (status < 0)
188 if (alloc_bh) 297 brelse(alloc_bh);
189 brelse(alloc_bh);
190 if (inode) 298 if (inode)
191 iput(inode); 299 iput(inode);
192 300
193 mlog(0, "Local alloc window bits = %d\n", 301 if (status < 0)
194 ocfs2_local_alloc_window_bits(osb)); 302 ocfs2_shutdown_la_debug(osb);
303
304 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
195 305
196 mlog_exit(status); 306 mlog_exit(status);
197 return status; 307 return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
217 327
218 mlog_entry_void(); 328 mlog_entry_void();
219 329
330 cancel_delayed_work(&osb->la_enable_wq);
331 flush_workqueue(ocfs2_wq);
332
333 ocfs2_shutdown_la_debug(osb);
334
220 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 335 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
221 goto out; 336 goto out;
222 337
@@ -295,8 +410,7 @@ out_commit:
295 ocfs2_commit_trans(osb, handle); 410 ocfs2_commit_trans(osb, handle);
296 411
297out_unlock: 412out_unlock:
298 if (main_bm_bh) 413 brelse(main_bm_bh);
299 brelse(main_bm_bh);
300 414
301 ocfs2_inode_unlock(main_bm_inode, 1); 415 ocfs2_inode_unlock(main_bm_inode, 1);
302 416
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
345 459
346 mutex_lock(&inode->i_mutex); 460 mutex_lock(&inode->i_mutex);
347 461
348 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
349 &alloc_bh, 0, inode); 463 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
350 if (status < 0) { 464 if (status < 0) {
351 mlog_errno(status); 465 mlog_errno(status);
352 goto bail; 466 goto bail;
@@ -372,8 +486,7 @@ bail:
372 *alloc_copy = NULL; 486 *alloc_copy = NULL;
373 } 487 }
374 488
375 if (alloc_bh) 489 brelse(alloc_bh);
376 brelse(alloc_bh);
377 490
378 if (inode) { 491 if (inode) {
379 mutex_unlock(&inode->i_mutex); 492 mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
441out_mutex: 554out_mutex:
442 mutex_unlock(&main_bm_inode->i_mutex); 555 mutex_unlock(&main_bm_inode->i_mutex);
443 556
444 if (main_bm_bh) 557 brelse(main_bm_bh);
445 brelse(main_bm_bh);
446 558
447 iput(main_bm_inode); 559 iput(main_bm_inode);
448 560
@@ -453,8 +565,48 @@ out:
453 return status; 565 return status;
454} 566}
455 567
568/* Check to see if the local alloc window is within ac->ac_max_block */
569static int ocfs2_local_alloc_in_range(struct inode *inode,
570 struct ocfs2_alloc_context *ac,
571 u32 bits_wanted)
572{
573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 struct ocfs2_dinode *alloc;
575 struct ocfs2_local_alloc *la;
576 int start;
577 u64 block_off;
578
579 if (!ac->ac_max_block)
580 return 1;
581
582 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
583 la = OCFS2_LOCAL_ALLOC(alloc);
584
585 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
586 if (start == -1) {
587 mlog_errno(-ENOSPC);
588 return 0;
589 }
590
591 /*
592 * Converting (bm_off + start + bits_wanted) to blocks gives us
593 * the blkno just past our actual allocation. This is perfect
594 * to compare with ac_max_block.
595 */
596 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
597 le32_to_cpu(la->la_bm_off) +
598 start + bits_wanted);
599 mlog(0, "Checking %llu against %llu\n",
600 (unsigned long long)block_off,
601 (unsigned long long)ac->ac_max_block);
602 if (block_off > ac->ac_max_block)
603 return 0;
604
605 return 1;
606}
607
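ocfs2_local_alloc_in_range() above leans on a closed-form check: converting bm_off + start + bits_wanted to blocks yields the first block past the would-be allocation, which must not exceed ac_max_block. A standalone model of that comparison (the 8-blocks-per-cluster ratio and sample numbers are invented):

    #include <stdio.h>
    #include <stdint.h>

    /* model: 8 blocks per cluster (invented ratio) */
    static uint64_t clusters_to_blocks(uint32_t clusters)
    {
            return (uint64_t)clusters << 3;
    }

    /* the first block *past* the would-be allocation must not
     * exceed max_block; max_block == 0 means "no limit" */
    static int in_range(uint32_t bm_off, uint32_t start, uint32_t wanted,
                        uint64_t max_block)
    {
            uint64_t past_end = clusters_to_blocks(bm_off + start + wanted);

            if (!max_block)
                    return 1;
            return past_end <= max_block;
    }

    int main(void)
    {
            printf("%d\n", in_range(100, 4, 8, 1000));      /* 1: fits */
            printf("%d\n", in_range(100, 4, 8, 800));       /* 0: past limit */
            return 0;
    }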
456/* 608/*
457 * make sure we've got at least bitswanted contiguous bits in the 609 * make sure we've got at least bits_wanted contiguous bits in the
458 * local alloc. You lose them when you drop i_mutex. 610 * local alloc. You lose them when you drop i_mutex.
459 * 611 *
460 * We will add ourselves to the transaction passed in, but may start 612 * We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
485 637
486 mutex_lock(&local_alloc_inode->i_mutex); 638 mutex_lock(&local_alloc_inode->i_mutex);
487 639
488 if (osb->local_alloc_state != OCFS2_LA_ENABLED) { 640 /*
489 status = -ENOSPC; 641 * We must double check state and allocator bits because
490 goto bail; 642 * another process may have changed them while holding i_mutex.
491 } 643 */
492 644 spin_lock(&osb->osb_lock);
493 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { 645 if (!ocfs2_la_state_enabled(osb) ||
494 mlog(0, "Asking for more than my max window size!\n"); 646 (bits_wanted > osb->local_alloc_bits)) {
647 spin_unlock(&osb->osb_lock);
495 status = -ENOSPC; 648 status = -ENOSPC;
496 goto bail; 649 goto bail;
497 } 650 }
651 spin_unlock(&osb->osb_lock);
498 652
499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 653 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
500 654
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
522 mlog_errno(status); 676 mlog_errno(status);
523 goto bail; 677 goto bail;
524 } 678 }
679
680 /*
681 * Under certain conditions, the window slide code
682 * might have reduced the number of bits available or
 683 * disabled the local alloc entirely. Re-check
684 * here and return -ENOSPC if necessary.
685 */
686 status = -ENOSPC;
687 if (!ocfs2_la_state_enabled(osb))
688 goto bail;
689
690 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
691 le32_to_cpu(alloc->id1.bitmap1.i_used);
692 if (bits_wanted > free_bits)
693 goto bail;
694 }
695
696 if (ac->ac_max_block)
697 mlog(0, "Calling in_range for max block %llu\n",
698 (unsigned long long)ac->ac_max_block);
699
700 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
701 bits_wanted)) {
702 /*
703 * The window is outside ac->ac_max_block.
704 * This errno tells the caller to keep localalloc enabled
705 * but to get the allocation from the main bitmap.
706 */
707 status = -EFBIG;
708 goto bail;
525 } 709 }
526 710
527 ac->ac_inode = local_alloc_inode; 711 ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
789 return status; 973 return status;
790} 974}
791 975
976enum ocfs2_la_event {
977 OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
978 OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
979 * enough bits theoretically
980 * free, but a contiguous
981 * allocation could not be
982 * found. */
983 OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
984 * enough bits free to satisfy
985 * our request. */
986};
987#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
988/*
989 * Given an event, calculate the size of our next local alloc window.
990 *
991 * This should always be called under i_mutex of the local alloc inode
992 * so that local alloc disabling doesn't race with processes trying to
993 * use the allocator.
994 *
995 * Returns the state which the local alloc was left in. This value can
996 * be ignored by some paths.
997 */
998static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
999 enum ocfs2_la_event event)
1000{
1001 unsigned int bits;
1002 int state;
1003
1004 spin_lock(&osb->osb_lock);
1005 if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
1006 WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
1007 goto out_unlock;
1008 }
1009
1010 /*
1011 * ENOSPC and fragmentation are treated similarly for now.
1012 */
1013 if (event == OCFS2_LA_EVENT_ENOSPC ||
1014 event == OCFS2_LA_EVENT_FRAGMENTED) {
1015 /*
1016 * We ran out of contiguous space in the primary
1017 * bitmap. Drastically reduce the number of bits used
1018 * by local alloc until we have to disable it.
1019 */
1020 bits = osb->local_alloc_bits >> 1;
1021 if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
1022 /*
1023 * By setting state to THROTTLED, we'll keep
1024 * the number of local alloc bits used down
1025 * until an event occurs which would give us
1026 * reason to assume the bitmap situation might
1027 * have changed.
1028 */
1029 osb->local_alloc_state = OCFS2_LA_THROTTLED;
1030 osb->local_alloc_bits = bits;
1031 } else {
1032 osb->local_alloc_state = OCFS2_LA_DISABLED;
1033 }
1034 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
1035 OCFS2_LA_ENABLE_INTERVAL);
1036 goto out_unlock;
1037 }
1038
1039 /*
1040 * Don't increase the size of the local alloc window until we
1041 * know we might be able to fulfill the request. Otherwise, we
1042 * risk bouncing around the global bitmap during periods of
1043 * low space.
1044 */
1045 if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
1046 osb->local_alloc_bits = osb->local_alloc_default_bits;
1047
1048out_unlock:
1049 state = osb->local_alloc_state;
1050 spin_unlock(&osb->osb_lock);
1051
1052 return state;
1053}
1054
792static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, 1055static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
793 struct ocfs2_alloc_context **ac, 1056 struct ocfs2_alloc_context **ac,
794 struct inode **bitmap_inode, 1057 struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
803 goto bail; 1066 goto bail;
804 } 1067 }
805 1068
806 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); 1069retry_enospc:
1070 (*ac)->ac_bits_wanted = osb->local_alloc_bits;
807 1071
808 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1072 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1073 if (status == -ENOSPC) {
1074 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
1075 OCFS2_LA_DISABLED)
1076 goto bail;
1077
1078 ocfs2_free_ac_resource(*ac);
1079 memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
1080 goto retry_enospc;
1081 }
809 if (status < 0) { 1082 if (status < 0) {
810 if (status != -ENOSPC) 1083 mlog_errno(status);
811 mlog_errno(status);
812 goto bail; 1084 goto bail;
813 } 1085 }
814 1086
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
849 "one\n"); 1121 "one\n");
850 1122
851 mlog(0, "Allocating %u clusters for a new window.\n", 1123 mlog(0, "Allocating %u clusters for a new window.\n",
852 ocfs2_local_alloc_window_bits(osb)); 1124 osb->local_alloc_bits);
853 1125
854 /* Instruct the allocation code to try the most recently used 1126 /* Instruct the allocation code to try the most recently used
855 * cluster group. We'll re-record the group used this pass 1127 * cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
859 /* we used the generic suballoc reserve function, but we set 1131 /* we used the generic suballoc reserve function, but we set
860 * everything up nicely, so there's no reason why we can't use 1132 * everything up nicely, so there's no reason why we can't use
861 * the more specific cluster api to claim bits. */ 1133 * the more specific cluster api to claim bits. */
862 status = ocfs2_claim_clusters(osb, handle, ac, 1134 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
863 ocfs2_local_alloc_window_bits(osb),
864 &cluster_off, &cluster_count); 1135 &cluster_off, &cluster_count);
1136 if (status == -ENOSPC) {
1137retry_enospc:
1138 /*
1139 * Note: We could also try syncing the journal here to
1140 * allow use of any free bits which the current
1141 * transaction can't give us access to. --Mark
1142 */
1143 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
1144 OCFS2_LA_DISABLED)
1145 goto bail;
1146
1147 status = ocfs2_claim_clusters(osb, handle, ac,
1148 osb->local_alloc_bits,
1149 &cluster_off,
1150 &cluster_count);
1151 if (status == -ENOSPC)
1152 goto retry_enospc;
1153 /*
 1154 * We only shrunk the *minimum* number of bits in our
1155 * request - it's entirely possible that the allocator
1156 * might give us more than we asked for.
1157 */
1158 if (status == 0) {
1159 spin_lock(&osb->osb_lock);
1160 osb->local_alloc_bits = cluster_count;
1161 spin_unlock(&osb->osb_lock);
1162 }
1163 }
865 if (status < 0) { 1164 if (status < 0) {
866 if (status != -ENOSPC) 1165 if (status != -ENOSPC)
867 mlog_errno(status); 1166 mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
905 1204
906 mlog_entry_void(); 1205 mlog_entry_void();
907 1206
1207 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1208
908 /* This will lock the main bitmap for us. */ 1209 /* This will lock the main bitmap for us. */
909 status = ocfs2_local_alloc_reserve_for_window(osb, 1210 status = ocfs2_local_alloc_reserve_for_window(osb,
910 &ac, 1211 &ac,
@@ -976,8 +1277,7 @@ bail:
976 if (handle) 1277 if (handle)
977 ocfs2_commit_trans(osb, handle); 1278 ocfs2_commit_trans(osb, handle);
978 1279
979 if (main_bm_bh) 1280 brelse(main_bm_bh);
980 brelse(main_bm_bh);
981 1281
982 if (main_bm_inode) 1282 if (main_bm_inode)
983 iput(main_bm_inode); 1283 iput(main_bm_inode);
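
The hunks above implement local alloc throttling: on ENOSPC or fragmentation the window is halved until it falls below roughly a megabyte's worth of clusters, at which point the allocator is disabled and a delayed work item is queued to re-enable it later. A minimal sketch of that policy, using simplified stand-in types rather than the kernel structures:

/*
 * Sketch of the window-sizing policy from ocfs2_recalc_la_window()
 * above. All names here are simplified stand-ins, not kernel types.
 */
enum la_state { LA_ENABLED, LA_THROTTLED, LA_DISABLED };

struct la {
	enum la_state state;
	unsigned int bits;         /* current window, in clusters */
	unsigned int default_bits; /* configured window size */
	unsigned int min_bits;     /* e.g. one megabyte of clusters */
};

static enum la_state recalc(struct la *la, int enospc_or_fragmented)
{
	if (la->state == LA_DISABLED)
		return la->state;

	if (enospc_or_fragmented) {
		unsigned int bits = la->bits >> 1; /* halve the window */

		if (bits > la->min_bits) {
			la->state = LA_THROTTLED;
			la->bits = bits;
		} else {
			la->state = LA_DISABLED; /* give up for now */
		}
		/* the real code queues delayed work to re-enable here */
	} else if (la->state != LA_THROTTLED) {
		la->bits = la->default_bits; /* normal slide */
	}
	return la->state;
}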
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110c..ac5ea9f86653 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
52 u32 *bit_off, 52 u32 *bit_off,
53 u32 *num_bits); 53 u32 *num_bits);
54 54
55void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
56 unsigned int num_clusters);
57void ocfs2_la_enable_worker(struct work_struct *work);
58
55#endif /* OCFS2_LOCALALLOC_H */ 59#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f87143877..544ac6245175 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fcntl.h>
27 28
28#define MLOG_MASK_PREFIX ML_INODE 29#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32 33
33#include "dlmglue.h" 34#include "dlmglue.h"
34#include "file.h" 35#include "file.h"
36#include "inode.h"
35#include "locks.h" 37#include "locks.h"
36 38
37static int ocfs2_do_flock(struct file *file, struct inode *inode, 39static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
123 else 125 else
124 return ocfs2_do_flock(file, inode, cmd, fl); 126 return ocfs2_do_flock(file, inode, cmd, fl);
125} 127}
128
129int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
130{
131 struct inode *inode = file->f_mapping->host;
132 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
133
134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK;
136 if (__mandatory_lock(inode))
137 return -ENOLCK;
138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
140}
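
ocfs2_lock() above is shaped as a ->lock method, the entry point the VFS calls for fcntl(2) POSIX locks, so file operations can route those requests through the cluster stack. A sketch of what the wiring might look like; the initializer below is illustrative only and is not part of this diff:

/* Illustrative only -- the real table lives in fs/ocfs2/file.c. */
const struct file_operations ocfs2_fops_example = {
	.llseek = generic_file_llseek,
	.flock  = ocfs2_flock, /* BSD flock(2), already present */
	.lock   = ocfs2_lock,  /* POSIX fcntl(2) locks, added here */
};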
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324ec..496d488b271f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
27#define OCFS2_LOCKS_H 27#define OCFS2_LOCKS_H
28 28
29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
30int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
30 31
31#endif /* OCFS2_LOCKS_H */ 32#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe0140..485a6aa0ad39 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "xattr.h"
63 64
64#include "buffer_head_io.h" 65#include "buffer_head_io.h"
65 66
@@ -327,14 +328,9 @@ leave:
327 if (status == -ENOSPC) 328 if (status == -ENOSPC)
328 mlog(0, "Disk is full\n"); 329 mlog(0, "Disk is full\n");
329 330
330 if (new_fe_bh) 331 brelse(new_fe_bh);
331 brelse(new_fe_bh); 332 brelse(de_bh);
332 333 brelse(parent_fe_bh);
333 if (de_bh)
334 brelse(de_bh);
335
336 if (parent_fe_bh)
337 brelse(parent_fe_bh);
338 334
339 if ((status < 0) && inode) 335 if ((status < 0) && inode)
340 iput(inode); 336 iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
647out: 643out:
648 ocfs2_inode_unlock(dir, 1); 644 ocfs2_inode_unlock(dir, 1);
649 645
650 if (de_bh) 646 brelse(de_bh);
651 brelse(de_bh); 647 brelse(fe_bh);
652 if (fe_bh) 648 brelse(parent_fe_bh);
653 brelse(fe_bh);
654 if (parent_fe_bh)
655 brelse(parent_fe_bh);
656 649
657 mlog_exit(err); 650 mlog_exit(err);
658 651
@@ -851,17 +844,10 @@ leave:
851 iput(orphan_dir); 844 iput(orphan_dir);
852 } 845 }
853 846
854 if (fe_bh) 847 brelse(fe_bh);
855 brelse(fe_bh); 848 brelse(dirent_bh);
856 849 brelse(parent_node_bh);
857 if (dirent_bh) 850 brelse(orphan_entry_bh);
858 brelse(dirent_bh);
859
860 if (parent_node_bh)
861 brelse(parent_node_bh);
862
863 if (orphan_entry_bh)
864 brelse(orphan_entry_bh);
865 851
866 mlog_exit(status); 852 mlog_exit(status);
867 853
@@ -1372,24 +1358,15 @@ bail:
1372 1358
1373 if (new_inode) 1359 if (new_inode)
1374 iput(new_inode); 1360 iput(new_inode);
1375 if (newfe_bh) 1361 brelse(newfe_bh);
1376 brelse(newfe_bh); 1362 brelse(old_inode_bh);
1377 if (old_inode_bh) 1363 brelse(old_dir_bh);
1378 brelse(old_inode_bh); 1364 brelse(new_dir_bh);
1379 if (old_dir_bh) 1365 brelse(new_de_bh);
1380 brelse(old_dir_bh); 1366 brelse(old_de_bh);
1381 if (new_dir_bh) 1367 brelse(old_inode_de_bh);
1382 brelse(new_dir_bh); 1368 brelse(orphan_entry_bh);
1383 if (new_de_bh) 1369 brelse(insert_entry_bh);
1384 brelse(new_de_bh);
1385 if (old_de_bh)
1386 brelse(old_de_bh);
1387 if (old_inode_de_bh)
1388 brelse(old_inode_de_bh);
1389 if (orphan_entry_bh)
1390 brelse(orphan_entry_bh);
1391 if (insert_entry_bh)
1392 brelse(insert_entry_bh);
1393 1370
1394 mlog_exit(status); 1371 mlog_exit(status);
1395 1372
@@ -1492,8 +1469,7 @@ bail:
1492 1469
1493 if (bhs) { 1470 if (bhs) {
1494 for(i = 0; i < blocks; i++) 1471 for(i = 0; i < blocks; i++)
1495 if (bhs[i]) 1472 brelse(bhs[i]);
1496 brelse(bhs[i]);
1497 kfree(bhs); 1473 kfree(bhs);
1498 } 1474 }
1499 1475
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
1598 u32 offset = 0; 1574 u32 offset = 0;
1599 1575
1600 inode->i_op = &ocfs2_symlink_inode_operations; 1576 inode->i_op = &ocfs2_symlink_inode_operations;
1601 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, 1577 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1602 new_fe_bh, 1578 new_fe_bh,
1603 handle, data_ac, NULL, 1579 handle, data_ac, NULL,
1604 NULL); 1580 NULL);
1605 if (status < 0) { 1581 if (status < 0) {
1606 if (status != -ENOSPC && status != -EINTR) { 1582 if (status != -ENOSPC && status != -EINTR) {
1607 mlog(ML_ERROR, 1583 mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
1659 1635
1660 ocfs2_inode_unlock(dir, 1); 1636 ocfs2_inode_unlock(dir, 1);
1661 1637
1662 if (new_fe_bh) 1638 brelse(new_fe_bh);
1663 brelse(new_fe_bh); 1639 brelse(parent_fe_bh);
1664 if (parent_fe_bh) 1640 brelse(de_bh);
1665 brelse(parent_fe_bh);
1666 if (de_bh)
1667 brelse(de_bh);
1668 if (inode_ac) 1641 if (inode_ac)
1669 ocfs2_free_alloc_context(inode_ac); 1642 ocfs2_free_alloc_context(inode_ac);
1670 if (data_ac) 1643 if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
1759 iput(orphan_dir_inode); 1732 iput(orphan_dir_inode);
1760 } 1733 }
1761 1734
1762 if (orphan_dir_bh) 1735 brelse(orphan_dir_bh);
1763 brelse(orphan_dir_bh);
1764 1736
1765 mlog_exit(status); 1737 mlog_exit(status);
1766 return status; 1738 return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1780 1752
1781 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1753 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1782 1754
1783 status = ocfs2_read_block(osb, 1755 status = ocfs2_read_block(orphan_dir_inode,
1784 OCFS2_I(orphan_dir_inode)->ip_blkno, 1756 OCFS2_I(orphan_dir_inode)->ip_blkno,
1785 &orphan_dir_bh, OCFS2_BH_CACHED, 1757 &orphan_dir_bh);
1786 orphan_dir_inode);
1787 if (status < 0) { 1758 if (status < 0) {
1788 mlog_errno(status); 1759 mlog_errno(status);
1789 goto leave; 1760 goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1829 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1800 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1830 1801
1831leave: 1802leave:
1832 if (orphan_dir_bh) 1803 brelse(orphan_dir_bh);
1833 brelse(orphan_dir_bh);
1834 1804
1835 mlog_exit(status); 1805 mlog_exit(status);
1836 return status; 1806 return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1898 } 1868 }
1899 1869
1900leave: 1870leave:
1901 if (target_de_bh) 1871 brelse(target_de_bh);
1902 brelse(target_de_bh);
1903 1872
1904 mlog_exit(status); 1873 mlog_exit(status);
1905 return status; 1874 return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
1918 .setattr = ocfs2_setattr, 1887 .setattr = ocfs2_setattr,
1919 .getattr = ocfs2_getattr, 1888 .getattr = ocfs2_getattr,
1920 .permission = ocfs2_permission, 1889 .permission = ocfs2_permission,
1890 .setxattr = generic_setxattr,
1891 .getxattr = generic_getxattr,
1892 .listxattr = ocfs2_listxattr,
1893 .removexattr = generic_removexattr,
1921}; 1894};
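
Every cleanup hunk in namei.c above leans on the same fact: brelse() is a no-op when handed a NULL buffer head (it only calls __brelse() for a non-NULL pointer), so the guarding NULL checks were dead weight. The idiom in miniature:

/* Before: NULL check duplicated at every exit path. */
if (bh)
	brelse(bh);

/* After: brelse() already ignores NULL buffer heads,
 * so cleanup paths may call it unconditionally. */
brelse(bh);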
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b1117..a21a465490c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#ifndef CONFIG_OCFS2_COMPAT_JBD
38# include <linux/jbd2.h>
39#else
40# include <linux/jbd.h>
41# include "ocfs2_jbd_compat.h"
42#endif
38 43
39/* For union ocfs2_dlm_lksb */ 44/* For union ocfs2_dlm_lksb */
40#include "stackglue.h" 45#include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
171 176
172enum ocfs2_local_alloc_state 177enum ocfs2_local_alloc_state
173{ 178{
174 OCFS2_LA_UNUSED = 0, 179 OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
175 OCFS2_LA_ENABLED, 180 * this mountpoint. */
176 OCFS2_LA_DISABLED 181 OCFS2_LA_ENABLED, /* Local alloc is in use. */
182 OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
183 * of bits has been reduced. */
184 OCFS2_LA_DISABLED /* Local alloc has temporarily been
185 * disabled. */
177}; 186};
178 187
179enum ocfs2_mount_options 188enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
184 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 193 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
185 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 194 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
186 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
187}; 198};
188 199
189#define OCFS2_OSB_SOFT_RO 0x0001 200#define OCFS2_OSB_SOFT_RO 0x0001
@@ -214,6 +225,7 @@ struct ocfs2_super
214 u32 bitmap_cpg; 225 u32 bitmap_cpg;
215 u8 *uuid; 226 u8 *uuid;
216 char *uuid_str; 227 char *uuid_str;
228 u32 uuid_hash;
217 u8 *vol_label; 229 u8 *vol_label;
218 u64 first_cluster_group_blkno; 230 u64 first_cluster_group_blkno;
219 u32 fs_generation; 231 u32 fs_generation;
@@ -241,6 +253,7 @@ struct ocfs2_super
241 int s_sectsize_bits; 253 int s_sectsize_bits;
242 int s_clustersize; 254 int s_clustersize;
243 int s_clustersize_bits; 255 int s_clustersize_bits;
256 unsigned int s_xattr_inline_size;
244 257
245 atomic_t vol_state; 258 atomic_t vol_state;
246 struct mutex recovery_lock; 259 struct mutex recovery_lock;
@@ -252,11 +265,27 @@ struct ocfs2_super
252 struct ocfs2_journal *journal; 265 struct ocfs2_journal *journal;
253 unsigned long osb_commit_interval; 266 unsigned long osb_commit_interval;
254 267
255 int local_alloc_size; 268 struct delayed_work la_enable_wq;
256 enum ocfs2_local_alloc_state local_alloc_state; 269
270 /*
271 * Must hold local alloc i_mutex and osb->osb_lock to change
272 * local_alloc_bits. Reads can be done under either lock.
273 */
274 unsigned int local_alloc_bits;
275 unsigned int local_alloc_default_bits;
276
277 enum ocfs2_local_alloc_state local_alloc_state; /* protected
278 * by osb_lock */
279
257 struct buffer_head *local_alloc_bh; 280 struct buffer_head *local_alloc_bh;
281
258 u64 la_last_gd; 282 u64 la_last_gd;
259 283
284#ifdef CONFIG_OCFS2_FS_STATS
285 struct dentry *local_alloc_debug;
286 char *local_alloc_debug_buf;
287#endif
288
260 /* Next two fields are for local node slot recovery during 289 /* Next two fields are for local node slot recovery during
261 * mount. */ 290 * mount. */
262 int dirty; 291 int dirty;
@@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
340 return 0; 369 return 0;
341} 370}
342 371
372static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
373{
374 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
375 return 1;
376 return 0;
377}
378
343/* set / clear functions because cluster events can make these happen 379/* set / clear functions because cluster events can make these happen
344 * in parallel so we want the transitions to be atomic. this also 380 * in parallel so we want the transitions to be atomic. this also
345 * means that any future flags osb_flags must be protected by spinlock 381 * means that any future flags osb_flags must be protected by spinlock
@@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
554 return pages_per_cluster; 590 return pages_per_cluster;
555} 591}
556 592
593static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
594 unsigned int megs)
595{
596 BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
597
598 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
599}
600
557static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 601static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
558{ 602{
559 spin_lock(&osb->osb_lock); 603 spin_lock(&osb->osb_lock);
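
The comment added to struct ocfs2_super spells out the rule for local_alloc_bits: hold both the local alloc inode's i_mutex and osb->osb_lock to change it, and either lock alone to read it. A sketch of that discipline; both helper names below are hypothetical:

/* Hypothetical helpers illustrating the documented locking rule. */
static void example_set_la_bits(struct ocfs2_super *osb,
				struct inode *la_inode,
				unsigned int bits)
{
	mutex_lock(&la_inode->i_mutex);  /* writers take both locks */
	spin_lock(&osb->osb_lock);
	osb->local_alloc_bits = bits;
	spin_unlock(&osb->osb_lock);
	mutex_unlock(&la_inode->i_mutex);
}

static unsigned int example_get_la_bits(struct ocfs2_super *osb)
{
	unsigned int bits;

	spin_lock(&osb->osb_lock); /* either lock suffices for readers */
	bits = osb->local_alloc_bits;
	spin_unlock(&osb->osb_lock);

	return bits;
}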
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf7..f24ce3d3f956 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
64#define OCFS2_INODE_SIGNATURE "INODE01" 64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
67 68
68/* Compatibility flags */ 69/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -90,7 +91,8 @@
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 91 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 92 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 93 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) 94 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
95 | OCFS2_FEATURE_INCOMPAT_XATTR)
94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 96#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
95 97
96/* 98/*
@@ -127,10 +129,6 @@
127/* Support for data packed into inode blocks */ 129/* Support for data packed into inode blocks */
128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 130#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
129 131
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/* 132/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock 133 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as 134 * field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
142 */ 140 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 141#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144 142
143/* Support for the extended slot map */
144#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
145
146/* Support for extended attributes */
147#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
148
145/* 149/*
146 * backup superblock flag is used to indicate that this volume 150 * backup superblock flag is used to indicate that this volume
147 * has backup superblocks. 151 * has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
299 */ 303 */
300#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 304#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
301 305
306/*
307 * Inline extended attribute size (in bytes)
308 * The value chosen should be aligned to 16 byte boundaries.
309 */
310#define OCFS2_MIN_XATTR_INLINE_SIZE 256
311
302struct ocfs2_system_inode_info { 312struct ocfs2_system_inode_info {
303 char *si_name; 313 char *si_name;
304 int si_iflags; 314 int si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
563/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts 573/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
564 before tunefs required */ 574 before tunefs required */
565 __le16 s_tunefs_flag; 575 __le16 s_tunefs_flag;
566 __le32 s_reserved1; 576 __le32 s_uuid_hash; /* hash value of uuid */
567 __le64 s_first_cluster_group; /* Block offset of 1st cluster 577 __le64 s_first_cluster_group; /* Block offset of 1st cluster
568 * group header */ 578 * group header */
569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 579/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 581/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid 582 stack. Only valid
573 with INCOMPAT flag. */ 583 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ 584/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
585 for this fs*/
586 __le16 s_reserved0;
587 __le32 s_reserved1;
588/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
575/*140*/ 589/*140*/
576 590
577 /* 591 /*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
621 belongs to */ 635 belongs to */
622 __le16 i_suballoc_bit; /* Bit offset in suballocator 636 __le16 i_suballoc_bit; /* Bit offset in suballocator
623 block group */ 637 block group */
624/*10*/ __le32 i_reserved0; 638/*10*/ __le16 i_reserved0;
639 __le16 i_xattr_inline_size;
625 __le32 i_clusters; /* Cluster count */ 640 __le32 i_clusters; /* Cluster count */
626 __le32 i_uid; /* Owner UID */ 641 __le32 i_uid; /* Owner UID */
627 __le32 i_gid; /* Owning GID */ 642 __le32 i_gid; /* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
640 __le32 i_atime_nsec; 655 __le32 i_atime_nsec;
641 __le32 i_ctime_nsec; 656 __le32 i_ctime_nsec;
642 __le32 i_mtime_nsec; 657 __le32 i_mtime_nsec;
643 __le32 i_attr; 658/*70*/ __le32 i_attr;
644 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL 659 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
645 was set in i_flags */ 660 was set in i_flags */
646 __le16 i_dyn_features; 661 __le16 i_dyn_features;
647/*70*/ __le64 i_reserved2[8]; 662 __le64 i_xattr_loc;
663/*80*/ __le64 i_reserved2[7];
648/*B8*/ union { 664/*B8*/ union {
649 __le64 i_pad1; /* Generic way to refer to this 665 __le64 i_pad1; /* Generic way to refer to this
650 64bit union */ 666 64bit union */
@@ -715,6 +731,136 @@ struct ocfs2_group_desc
715/*40*/ __u8 bg_bitmap[0]; 731/*40*/ __u8 bg_bitmap[0];
716}; 732};
717 733
734/*
735 * On disk extended attribute structure for OCFS2.
736 */
737
738/*
 739 * ocfs2_xattr_entry describes one extended attribute.
 740 *
 741 * Note that it can be stored in the inode, one block, or one xattr bucket.
742 */
743struct ocfs2_xattr_entry {
744 __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
 745 __le16 xe_name_offset; /* byte offset from the 1st entry in the
 746 local xattr storage (inode, xattr block or
747 xattr bucket). */
 748 __u8 xe_name_len; /* xattr name len, doesn't include the prefix. */
 749 __u8 xe_type; /* the low 7 bits indicate the name prefix's
 750 * type and the highest bit indicates whether
 751 * the EA is stored in local storage. */
752 __le64 xe_value_size; /* real xattr value length. */
753};
754
755/*
756 * On disk structure for xattr header.
757 *
 758 * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in
759 * the local xattr storage.
760 */
761struct ocfs2_xattr_header {
762 __le16 xh_count; /* contains the count of how
763 many records are in the
764 local xattr storage. */
765 __le16 xh_free_start; /* current offset for storing
766 xattr. */
 767 __le16 xh_name_value_len; /* total length of name/value
 768 pairs in this bucket. */
 769 __le16 xh_num_buckets; /* number of buckets in one
 770 extent record, only valid
 771 in the first bucket. */
772 __le64 xh_csum;
773 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
774};
775
776/*
777 * On disk structure for xattr value root.
778 *
 779 * It is used when an extended attribute's value is too large, and we save it
 780 * in an outside cluster. It will be stored in a b-tree, like file content.
781 */
782struct ocfs2_xattr_value_root {
783/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
784 __le32 xr_reserved0;
785 __le64 xr_last_eb_blk; /* Pointer to last extent block */
786/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
787};
788
789/*
790 * On disk structure for xattr tree root.
791 *
792 * It is used when there are too many extended attributes for one file. These
793 * attributes will be organized and stored in an indexed-btree.
794 */
795struct ocfs2_xattr_tree_root {
796/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
797 __le32 xt_reserved0;
798 __le64 xt_last_eb_blk; /* Pointer to last extent block */
799/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
800};
801
802#define OCFS2_XATTR_INDEXED 0x1
803#define OCFS2_HASH_SHIFT 5
804#define OCFS2_XATTR_ROUND 3
805#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
806 ~(OCFS2_XATTR_ROUND))
807
808#define OCFS2_XATTR_BUCKET_SIZE 4096
809#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
810 / OCFS2_MIN_BLOCKSIZE)
811
812/*
813 * On disk structure for xattr block.
814 */
815struct ocfs2_xattr_block {
816/*00*/ __u8 xb_signature[8]; /* Signature for verification */
817 __le16 xb_suballoc_slot; /* Slot suballocator this
818 block belongs to. */
819 __le16 xb_suballoc_bit; /* Bit offset in suballocator
820 block group */
821 __le32 xb_fs_generation; /* Must match super block */
822/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
823 __le64 xb_csum;
824/*20*/ __le16 xb_flags; /* Indicates whether this block contains
825 real xattr or a xattr tree. */
826 __le16 xb_reserved0;
827 __le32 xb_reserved1;
828 __le64 xb_reserved2;
829/*30*/ union {
830 struct ocfs2_xattr_header xb_header; /* xattr header if this
831 block contains xattr */
832 struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
 833 block contains xattr
834 tree. */
835 } xb_attrs;
836};
837
838#define OCFS2_XATTR_ENTRY_LOCAL 0x80
839#define OCFS2_XATTR_TYPE_MASK 0x7F
840static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
841 int local)
842{
843 if (local)
844 xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
845 else
846 xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
847}
848
849static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
850{
851 return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
852}
853
854static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
855{
856 xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
857}
858
859static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
860{
861 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
862}
863
718#ifdef __KERNEL__ 864#ifdef __KERNEL__
719static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 865static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
720{ 866{
@@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
728 offsetof(struct ocfs2_dinode, id2.i_data.id_data); 874 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729} 875}
730 876
877static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
878 struct ocfs2_dinode *di)
879{
880 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
881
882 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
883 return sb->s_blocksize -
884 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
885 xattrsize;
886 else
887 return sb->s_blocksize -
888 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
889}
890
731static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) 891static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
732{ 892{
733 int size; 893 int size;
@@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
738 return size / sizeof(struct ocfs2_extent_rec); 898 return size / sizeof(struct ocfs2_extent_rec);
739} 899}
740 900
901static inline int ocfs2_extent_recs_per_inode_with_xattr(
902 struct super_block *sb,
903 struct ocfs2_dinode *di)
904{
905 int size;
906 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
907
908 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
909 size = sb->s_blocksize -
910 offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
911 xattrsize;
912 else
913 size = sb->s_blocksize -
914 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
915
916 return size / sizeof(struct ocfs2_extent_rec);
917}
918
741static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 919static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
742{ 920{
743 int size; 921 int size;
@@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
801 return 0; 979 return 0;
802 980
803} 981}
982
983static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
984{
985 int size;
986
987 size = sb->s_blocksize -
988 offsetof(struct ocfs2_xattr_block,
989 xb_attrs.xb_root.xt_list.l_recs);
990
991 return size / sizeof(struct ocfs2_extent_rec);
992}
804#else 993#else
805static inline int ocfs2_fast_symlink_chars(int blocksize) 994static inline int ocfs2_fast_symlink_chars(int blocksize)
806{ 995{
@@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
884 1073
885 return 0; 1074 return 0;
886} 1075}
1076
1077static inline int ocfs2_xattr_recs_per_xb(int blocksize)
1078{
1079 int size;
1080
1081 size = blocksize -
1082 offsetof(struct ocfs2_xattr_block,
1083 xb_attrs.xb_root.xt_list.l_recs);
1084
1085 return size / sizeof(struct ocfs2_extent_rec);
1086}
887#endif /* __KERNEL__ */ 1087#endif /* __KERNEL__ */
888 1088
889 1089
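
Two of the helpers above are easiest to see with concrete values: OCFS2_XATTR_SIZE() rounds a name or value length up to the next 4-byte boundary, and xe_type packs the prefix index into the low 7 bits with the "stored locally" flag in the high bit. An illustrative fragment (the prefix index value is an arbitrary example):

	struct ocfs2_xattr_entry xe = { 0 };

	ocfs2_xattr_set_type(&xe, 1);  /* example prefix index */
	ocfs2_xattr_set_local(&xe, 1); /* value stored with the entry */

	/* Both fields now live in xe_type: */
	BUG_ON(ocfs2_xattr_get_type(&xe) != 1);
	BUG_ON(!ocfs2_xattr_is_local(&xe));

	/* OCFS2_XATTR_SIZE() rounds up to a multiple of 4: */
	BUG_ON(OCFS2_XATTR_SIZE(5) != 8); /* (5 + 3) & ~3 */
	BUG_ON(OCFS2_XATTR_SIZE(8) != 8);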
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 000000000000..b91c78f8f558
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
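
The compat header lets the rest of ocfs2 use the jbd2_* names unconditionally; with CONFIG_OCFS2_COMPAT_JBD set, the defines above simply expand them to the classic JBD calls. An illustrative fragment of the calling pattern this enables ('journal', 'credits' and 'bh' are assumed to be set up by the caller):

	handle_t *handle;
	int ret;

	handle = jbd2_journal_start(journal, credits); /* journal_start() on JBD */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = jbd2_journal_get_write_access(handle, bh);
	if (!ret)
		ret = jbd2_journal_dirty_metadata(handle, bh);

	jbd2_journal_stop(handle); /* journal_stop() on JBD */
	return ret;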
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e9015..ffd48db229a7 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
200 if (cluster > clusters) 200 if (cluster > clusters)
201 break; 201 break;
202 202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); 203 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
204 if (ret < 0) { 204 if (ret < 0) {
205 mlog_errno(ret); 205 mlog_errno(ret);
206 break; 206 break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
236 * update the superblock last. 236 * update the superblock last.
237 * It doesn't matter if the write failed. 237 * It doesn't matter if the write failed.
238 */ 238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, 239 ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
240 &super_bh, 0, NULL); 240 &super_bh);
241 if (ret < 0) { 241 if (ret < 0) {
242 mlog_errno(ret); 242 mlog_errno(ret);
243 goto out; 243 goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, 332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1); 333 first_new_cluster - 1);
334 334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, 335 ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
336 main_bm_inode);
337 if (ret < 0) { 336 if (ret < 0) {
338 mlog_errno(ret); 337 mlog_errno(ret);
339 goto out_unlock; 338 goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
540 goto out_unlock; 539 goto out_unlock;
541 } 540 }
542 541
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); 542 ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
544 if (ret < 0) { 543 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu " 544 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group); 545 "from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf1..bdda2d8f8508 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If 150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, 153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
154 si->si_inode); 154 OCFS2_BH_IGNORE_CACHE);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
404 (unsigned long long)blkno); 404 (unsigned long long)blkno);
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); 407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
408 OCFS2_BH_IGNORE_CACHE);
408 if (status < 0) { 409 if (status < 0) {
409 mlog_errno(status); 410 mlog_errno(status);
410 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c6748..faec2d879357 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
28#include "ocfs2.h" /* For struct ocfs2_lock_res */ 28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h>
31 32
32/* 33/*
33 * The control protocol starts with a handshake. Until the handshake 34 * The control protocol starts with a handshake. Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
746{ 747{
747} 748}
748 749
750static int user_plock(struct ocfs2_cluster_connection *conn,
751 u64 ino,
752 struct file *file,
753 int cmd,
754 struct file_lock *fl)
755{
756 /*
757 * This more or less just demuxes the plock request into any
758 * one of three dlm calls.
759 *
760 * Internally, fs/dlm will pass these to a misc device, which
761 * a userspace daemon will read and write to.
762 *
 763 * For now, cancel requests (which happen internally only)
 764 * are turned into unlocks. Most of this function is taken from
765 * gfs2_lock.
766 */
767
768 if (cmd == F_CANCELLK) {
769 cmd = F_SETLK;
770 fl->fl_type = F_UNLCK;
771 }
772
773 if (IS_GETLK(cmd))
774 return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
775 else if (fl->fl_type == F_UNLCK)
776 return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
777 else
778 return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
779}
780
749/* 781/*
750 * Compare a requested locking protocol version against the current one. 782 * Compare a requested locking protocol version against the current one.
751 * 783 *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
839 .dlm_unlock = user_dlm_unlock, 871 .dlm_unlock = user_dlm_unlock,
840 .lock_status = user_dlm_lock_status, 872 .lock_status = user_dlm_lock_status,
841 .lock_lvb = user_dlm_lvb, 873 .lock_lvb = user_dlm_lvb,
874 .plock = user_plock,
842 .dump_lksb = user_dlm_dump_lksb, 875 .dump_lksb = user_dlm_dump_lksb,
843}; 876};
844 877
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3a..68b668b0e60a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
97 goto out; 97 goto out;
98 } 98 }
99 99
100 /* Ok, the stack is pinned */
101 p->sp_count++;
102 active_stack = p; 100 active_stack = p;
103
104 rc = 0; 101 rc = 0;
105 102
106out: 103out:
104 /* If we found it, pin it */
105 if (!rc)
106 active_stack->sp_count++;
107
107 spin_unlock(&ocfs2_stack_lock); 108 spin_unlock(&ocfs2_stack_lock);
108 return rc; 109 return rc;
109} 110}
@@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
287} 288}
288EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); 289EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
289 290
291int ocfs2_stack_supports_plocks(void)
292{
293 return active_stack && active_stack->sp_ops->plock;
294}
295EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
296
297/*
298 * ocfs2_plock() can only be safely called if
299 * ocfs2_stack_supports_plocks() returned true
300 */
301int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
302 struct file *file, int cmd, struct file_lock *fl)
303{
304 WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
305 if (active_stack->sp_ops->plock)
306 return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
307 return -EOPNOTSUPP;
308}
309EXPORT_SYMBOL_GPL(ocfs2_plock);
310
290int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
291 const char *group, 312 const char *group,
292 int grouplen, 313 int grouplen,
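
As the comment above warns, ocfs2_plock() is only safe after ocfs2_stack_supports_plocks() has returned true. A sketch of a guarded caller; the guard placement is illustrative (in practice the check can equally happen once, e.g. at mount time):

	if (!ocfs2_stack_supports_plocks())
		return -ENOLCK; /* no cluster-aware posix locks available */

	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno,
			   file, cmd, fl);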
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1be..c571af375ef8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
28#include "dlm/dlmapi.h" 28#include "dlm/dlmapi.h"
29#include <linux/dlm.h> 29#include <linux/dlm.h>
30 30
31/* Needed for plock-related prototypes */
32struct file;
33struct file_lock;
34
31/* 35/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it 36 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger 37 * some day, but right now we need it. Let's fake it. This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
187 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 191 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
188 192
189 /* 193 /*
194 * Cluster-aware posix locks
195 *
196 * This is NULL for stacks which do not support posix locks.
197 */
198 int (*plock)(struct ocfs2_cluster_connection *conn,
199 u64 ino,
200 struct file *file,
201 int cmd,
202 struct file_lock *fl);
203
204 /*
 190 * This is an optional debugging hook. If provided, the 205
191 * stack can dump debugging information about this lock. 206 * stack can dump debugging information about this lock.
192 */ 207 */
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
240void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 255void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
241void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 256void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
242 257
258int ocfs2_stack_supports_plocks(void);
259int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
260 struct file *file, int cmd, struct file_lock *fl);
261
243void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 262void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
244 263
245 264
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb9819..c5ff18b46b57 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
62 struct ocfs2_chain_list *cl); 62 struct ocfs2_chain_list *cl);
63static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 63static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
64 struct inode *alloc_inode, 64 struct inode *alloc_inode,
65 struct buffer_head *bh); 65 struct buffer_head *bh,
66 u64 max_block);
66 67
67static int ocfs2_cluster_group_search(struct inode *inode, 68static int ocfs2_cluster_group_search(struct inode *inode,
68 struct buffer_head *group_bh, 69 struct buffer_head *group_bh,
69 u32 bits_wanted, u32 min_bits, 70 u32 bits_wanted, u32 min_bits,
71 u64 max_block,
70 u16 *bit_off, u16 *bits_found); 72 u16 *bit_off, u16 *bits_found);
71static int ocfs2_block_group_search(struct inode *inode, 73static int ocfs2_block_group_search(struct inode *inode,
72 struct buffer_head *group_bh, 74 struct buffer_head *group_bh,
73 u32 bits_wanted, u32 min_bits, 75 u32 bits_wanted, u32 min_bits,
76 u64 max_block,
74 u16 *bit_off, u16 *bits_found); 77 u16 *bit_off, u16 *bits_found);
75static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 78static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
76 struct ocfs2_alloc_context *ac, 79 struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110 u64 data_blkno, 113 u64 data_blkno,
111 u64 *bg_blkno, 114 u64 *bg_blkno,
112 u16 *bg_bit_off); 115 u16 *bg_bit_off);
116static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
117 u32 bits_wanted, u64 max_block,
118 struct ocfs2_alloc_context **ac);
113 119
114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 120void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
115{ 121{
116 struct inode *inode = ac->ac_inode; 122 struct inode *inode = ac->ac_inode;
117 123
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
124 iput(inode); 130 iput(inode);
125 ac->ac_inode = NULL; 131 ac->ac_inode = NULL;
126 } 132 }
127 if (ac->ac_bh) { 133 brelse(ac->ac_bh);
128 brelse(ac->ac_bh); 134 ac->ac_bh = NULL;
129 ac->ac_bh = NULL;
130 }
131} 135}
132 136
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 137void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
276 */ 280 */
277static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 281static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
278 struct inode *alloc_inode, 282 struct inode *alloc_inode,
279 struct buffer_head *bh) 283 struct buffer_head *bh,
284 u64 max_block)
280{ 285{
281 int status, credits; 286 int status, credits;
282 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 287 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
294 mlog_entry_void(); 299 mlog_entry_void();
295 300
296 cl = &fe->id2.i_chain; 301 cl = &fe->id2.i_chain;
297 status = ocfs2_reserve_clusters(osb, 302 status = ocfs2_reserve_clusters_with_limit(osb,
298 le16_to_cpu(cl->cl_cpg), 303 le16_to_cpu(cl->cl_cpg),
299 &ac); 304 max_block, &ac);
300 if (status < 0) { 305 if (status < 0) {
301 if (status != -ENOSPC) 306 if (status != -ENOSPC)
302 mlog_errno(status); 307 mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
394 if (ac) 399 if (ac)
395 ocfs2_free_alloc_context(ac); 400 ocfs2_free_alloc_context(ac);
396 401
397 if (bg_bh) 402 brelse(bg_bh);
398 brelse(bg_bh);
399 403
400 mlog_exit(status); 404 mlog_exit(status);
401 return status; 405 return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
469 goto bail; 473 goto bail;
470 } 474 }
471 475
472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 476 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
477 ac->ac_max_block);
473 if (status < 0) { 478 if (status < 0) {
474 if (status != -ENOSPC) 479 if (status != -ENOSPC)
475 mlog_errno(status); 480 mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
486 get_bh(bh); 491 get_bh(bh);
487 ac->ac_bh = bh; 492 ac->ac_bh = bh;
488bail: 493bail:
489 if (bh) 494 brelse(bh);
490 brelse(bh);
491 495
492 mlog_exit(status); 496 mlog_exit(status);
493 return status; 497 return status;
494} 498}
495 499
496int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 500int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
497 struct ocfs2_dinode *fe, 501 int blocks,
498 struct ocfs2_alloc_context **ac) 502 struct ocfs2_alloc_context **ac)
499{ 503{
500 int status; 504 int status;
501 u32 slot; 505 u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
507 goto bail; 511 goto bail;
508 } 512 }
509 513
510 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 514 (*ac)->ac_bits_wanted = blocks;
511 (*ac)->ac_which = OCFS2_AC_USE_META; 515 (*ac)->ac_which = OCFS2_AC_USE_META;
512 slot = osb->slot_num; 516 slot = osb->slot_num;
513 (*ac)->ac_group_search = ocfs2_block_group_search; 517 (*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
532 return status; 536 return status;
533} 537}
534 538
539int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
540 struct ocfs2_extent_list *root_el,
541 struct ocfs2_alloc_context **ac)
542{
543 return ocfs2_reserve_new_metadata_blocks(osb,
544 ocfs2_extend_meta_needed(root_el),
545 ac);
546}
547
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, 548static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac) 549 struct ocfs2_alloc_context *ac)
537{ 550{
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
582 (*ac)->ac_group_search = ocfs2_block_group_search; 595 (*ac)->ac_group_search = ocfs2_block_group_search;
583 596
584 /* 597 /*
598 * stat(2) can't handle i_ino > 32bits, so we tell the
599 * lower levels not to allocate us a block group past that
600 * limit. The 'inode64' mount option avoids this behavior.
601 */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
603 (*ac)->ac_max_block = (u32)~0U;
604
605 /*
585 * slot is set when we successfully steal inode from other nodes. 606 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places: 607 * It is reset in 3 places:
587 * 1. when we flush the truncate log 608 * 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
661/* Callers don't need to care which bitmap (local alloc or main) to 682/* Callers don't need to care which bitmap (local alloc or main) to
662 * use so we figure it out for them, but unfortunately this clutters 683 * use so we figure it out for them, but unfortunately this clutters
663 * things a bit. */ 684 * things a bit. */
664int ocfs2_reserve_clusters(struct ocfs2_super *osb, 685static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
665 u32 bits_wanted, 686 u32 bits_wanted, u64 max_block,
666 struct ocfs2_alloc_context **ac) 687 struct ocfs2_alloc_context **ac)
667{ 688{
668 int status; 689 int status;
669 690
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
677 } 698 }
678 699
679 (*ac)->ac_bits_wanted = bits_wanted; 700 (*ac)->ac_bits_wanted = bits_wanted;
701 (*ac)->ac_max_block = max_block;
680 702
681 status = -ENOSPC; 703 status = -ENOSPC;
682 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 704 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
683 status = ocfs2_reserve_local_alloc_bits(osb, 705 status = ocfs2_reserve_local_alloc_bits(osb,
684 bits_wanted, 706 bits_wanted,
685 *ac); 707 *ac);
686 if ((status < 0) && (status != -ENOSPC)) { 708 if (status == -EFBIG) {
709 /* The local alloc window is outside ac_max_block.
710 * use the main bitmap. */
711 status = -ENOSPC;
712 } else if ((status < 0) && (status != -ENOSPC)) {
687 mlog_errno(status); 713 mlog_errno(status);
688 goto bail; 714 goto bail;
689 } else if (status == -ENOSPC) {
690 /* reserve_local_bits will return enospc with
691 * the local alloc inode still locked, so we
692 * can change this safely here. */
693 mlog(0, "Disabling local alloc\n");
694 /* We set to OCFS2_LA_DISABLED so that umount
695 * can clean up what's left of the local
696 * allocation */
697 osb->local_alloc_state = OCFS2_LA_DISABLED;
698 } 715 }
699 } 716 }
700 717
@@ -718,6 +735,13 @@ bail:
718 return status; 735 return status;
719} 736}
720 737
738int ocfs2_reserve_clusters(struct ocfs2_super *osb,
739 u32 bits_wanted,
740 struct ocfs2_alloc_context **ac)
741{
742 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
743}
744
721/* 745/*
722 * More or less lifted from ext3. I'll leave their description below: 746 * More or less lifted from ext3. I'll leave their description below:
723 * 747 *
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
1000static int ocfs2_cluster_group_search(struct inode *inode, 1024static int ocfs2_cluster_group_search(struct inode *inode,
1001 struct buffer_head *group_bh, 1025 struct buffer_head *group_bh,
1002 u32 bits_wanted, u32 min_bits, 1026 u32 bits_wanted, u32 min_bits,
1027 u64 max_block,
1003 u16 *bit_off, u16 *bits_found) 1028 u16 *bit_off, u16 *bits_found)
1004{ 1029{
1005 int search = -ENOSPC; 1030 int search = -ENOSPC;
1006 int ret; 1031 int ret;
1032 u64 blkoff;
1007 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1033 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1008 u16 tmp_off, tmp_found; 1035 u16 tmp_off, tmp_found;
1009 unsigned int max_bits, gd_cluster_off; 1036 unsigned int max_bits, gd_cluster_off;
1010 1037
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1037 if (ret) 1064 if (ret)
1038 return ret; 1065 return ret;
1039 1066
1067 if (max_block) {
1068 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1069 gd_cluster_off +
1070 tmp_off + tmp_found);
1071 mlog(0, "Checking %llu against %llu\n",
1072 (unsigned long long)blkoff,
1073 (unsigned long long)max_block);
1074 if (blkoff > max_block)
1075 return -ENOSPC;
1076 }
1077
1040 /* ocfs2_block_group_find_clear_bits() might 1078 /* ocfs2_block_group_find_clear_bits() might
1041 * return success, but we still want to return 1079 * return success, but we still want to return
1042 * -ENOSPC unless it found the minimum number 1080 * -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1045 *bit_off = tmp_off; 1083 *bit_off = tmp_off;
1046 *bits_found = tmp_found; 1084 *bits_found = tmp_found;
1047 search = 0; /* success */ 1085 search = 0; /* success */
1086 } else if (tmp_found) {
1087 /*
1088 * Don't show bits which we'll be returning
1089 * for allocation to the local alloc bitmap.
1090 */
1091 ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1048 } 1092 }
1049 } 1093 }
1050 1094
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1054static int ocfs2_block_group_search(struct inode *inode, 1098static int ocfs2_block_group_search(struct inode *inode,
1055 struct buffer_head *group_bh, 1099 struct buffer_head *group_bh,
1056 u32 bits_wanted, u32 min_bits, 1100 u32 bits_wanted, u32 min_bits,
1101 u64 max_block,
1057 u16 *bit_off, u16 *bits_found) 1102 u16 *bit_off, u16 *bits_found)
1058{ 1103{
1059 int ret = -ENOSPC; 1104 int ret = -ENOSPC;
1105 u64 blkoff;
1060 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1106 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1061 1107
1062 BUG_ON(min_bits != 1); 1108 BUG_ON(min_bits != 1);
1063 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1109 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1064 1110
1065 if (bg->bg_free_bits_count)
1111 if (bg->bg_free_bits_count) {
1066 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1112 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1067 group_bh, bits_wanted, 1113 group_bh, bits_wanted,
1068 le16_to_cpu(bg->bg_bits), 1114 le16_to_cpu(bg->bg_bits),
1069 bit_off, bits_found); 1115 bit_off, bits_found);
1116 if (!ret && max_block) {
1117 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1118 *bits_found;
1119 mlog(0, "Checking %llu against %llu\n",
1120 (unsigned long long)blkoff,
1121 (unsigned long long)max_block);
1122 if (blkoff > max_block)
1123 ret = -ENOSPC;
1124 }
1125 }
1070 1126
1071 return ret; 1127 return ret;
1072} 1128}
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1116 struct ocfs2_group_desc *gd; 1172 struct ocfs2_group_desc *gd;
1117 struct inode *alloc_inode = ac->ac_inode; 1173 struct inode *alloc_inode = ac->ac_inode;
1118 1174
1119 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1120 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1175 ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
1121 if (ret < 0) { 1176 if (ret < 0) {
1122 mlog_errno(ret); 1177 mlog_errno(ret);
1123 return ret; 1178 return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1131 } 1186 }
1132 1187
1133 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1188 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1134 bit_off, &found);
1189 ac->ac_max_block, bit_off, &found);
1135 if (ret < 0) { 1190 if (ret < 0) {
1136 if (ret != -ENOSPC) 1191 if (ret != -ENOSPC)
1137 mlog_errno(ret); 1192 mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1186 bits_wanted, chain, 1241 bits_wanted, chain,
1187 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1242 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1188 1243
1189 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1190 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1191 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1244 status = ocfs2_read_block(alloc_inode,
1245 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1246 &group_bh);
1192 if (status < 0) { 1247 if (status < 0) {
1193 mlog_errno(status); 1248 mlog_errno(status);
1194 goto bail; 1249 goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1204 /* for now, the chain search is a bit simplistic. We just use 1259 /* for now, the chain search is a bit simplistic. We just use
1205 * the 1st group with any empty bits. */ 1260 * the 1st group with any empty bits. */
1206 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1261 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1207 bits_wanted, min_bits, bit_off,
1262 bits_wanted, min_bits,
1263 ac->ac_max_block, bit_off,
1208 &tmp_bits)) == -ENOSPC) { 1264 &tmp_bits)) == -ENOSPC) {
1209 if (!bg->bg_next_group) 1265 if (!bg->bg_next_group)
1210 break; 1266 break;
1211 1267
1212 if (prev_group_bh) {
1213 brelse(prev_group_bh);
1214 prev_group_bh = NULL;
1215 }
1268 brelse(prev_group_bh);
1269 prev_group_bh = NULL;
1270
1216 next_group = le64_to_cpu(bg->bg_next_group); 1271 next_group = le64_to_cpu(bg->bg_next_group);
1217 prev_group_bh = group_bh; 1272 prev_group_bh = group_bh;
1218 group_bh = NULL; 1273 group_bh = NULL;
1219 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1220 next_group, &group_bh,
1221 OCFS2_BH_CACHED, alloc_inode);
1274 status = ocfs2_read_block(alloc_inode,
1275 next_group, &group_bh);
1222 if (status < 0) { 1276 if (status < 0) {
1223 mlog_errno(status); 1277 mlog_errno(status);
1224 goto bail; 1278 goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1307 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1361 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1308 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1362 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1309bail: 1363bail:
1310 if (group_bh)
1311 brelse(group_bh);
1312 if (prev_group_bh)
1313 brelse(prev_group_bh);
1364 brelse(group_bh);
1365 brelse(prev_group_bh);
1314 1366
1315 mlog_exit(status); 1367 mlog_exit(status);
1316 return status; 1368 return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1723{ 1775{
1724 int status = 0; 1776 int status = 0;
1725 u32 tmp_used; 1777 u32 tmp_used;
1726 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1727 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 1778 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1728 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 1779 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1729 struct buffer_head *group_bh = NULL; 1780 struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1742 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1793 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1743 (unsigned long long)bg_blkno, start_bit); 1794 (unsigned long long)bg_blkno, start_bit);
1744 1795
1745 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1746 alloc_inode);
1796 status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
1747 if (status < 0) { 1797 if (status < 0) {
1748 mlog_errno(status); 1798 mlog_errno(status);
1749 goto bail; 1799 goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1784 } 1834 }
1785 1835
1786bail: 1836bail:
1787 if (group_bh)
1788 brelse(group_bh);
1837 brelse(group_bh);
1789 1838
1790 mlog_exit(status); 1839 mlog_exit(status);
1791 return status; 1840 return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
1838 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 1887 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1839 bg_start_bit, bg_blkno, 1888 bg_start_bit, bg_blkno,
1840 num_clusters); 1889 num_clusters);
1841 if (status < 0)
1890 if (status < 0) {
1842 mlog_errno(status); 1891 mlog_errno(status);
1892 goto out;
1893 }
1843 1894
1895 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1896 num_clusters);
1897
1898out:
1844 mlog_exit(status); 1899 mlog_exit(status);
1845 return status; 1900 return status;
1846} 1901}
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1891 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 1946 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1892 } 1947 }
1893} 1948}
1949
1950/*
1951 * For a given allocation, determine which allocators will need to be
1952 * accessed, and lock them, reserving the appropriate number of bits.
1953 *
1954 * Sparse file systems call this from ocfs2_write_begin_nolock()
1955 * and ocfs2_allocate_unwritten_extents().
1956 *
1957 * File systems which don't support holes call this from
1958 * ocfs2_extend_allocation().
1959 */
1960int ocfs2_lock_allocators(struct inode *inode,
1961 struct ocfs2_extent_tree *et,
1962 u32 clusters_to_add, u32 extents_to_split,
1963 struct ocfs2_alloc_context **data_ac,
1964 struct ocfs2_alloc_context **meta_ac)
1965{
1966 int ret = 0, num_free_extents;
1967 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
1968 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1969
1970 *meta_ac = NULL;
1971 if (data_ac)
1972 *data_ac = NULL;
1973
1974 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
1975
1976 num_free_extents = ocfs2_num_free_extents(osb, inode, et);
1977 if (num_free_extents < 0) {
1978 ret = num_free_extents;
1979 mlog_errno(ret);
1980 goto out;
1981 }
1982
1983 /*
1984 * Sparse allocation file systems need to be more conservative
1985 * with reserving room for expansion - the actual allocation
1986 * happens while we've got a journal handle open so re-taking
1987 * a cluster lock (because we ran out of room for another
1988 * extent) will violate ordering rules.
1989 *
1990 * Most of the time we'll only be seeing this 1 cluster at a time
1991 * anyway.
1992 *
1993 * Always lock for any unwritten extents - we might want to
1994 * add blocks during a split.
1995 */
1996 if (!num_free_extents ||
1997 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
1998 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
1999 if (ret < 0) {
2000 if (ret != -ENOSPC)
2001 mlog_errno(ret);
2002 goto out;
2003 }
2004 }
2005
2006 if (clusters_to_add == 0)
2007 goto out;
2008
2009 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2010 if (ret < 0) {
2011 if (ret != -ENOSPC)
2012 mlog_errno(ret);
2013 goto out;
2014 }
2015
2016out:
2017 if (ret) {
2018 if (*meta_ac) {
2019 ocfs2_free_alloc_context(*meta_ac);
2020 *meta_ac = NULL;
2021 }
2022
2023 /*
2024 * We cannot have an error and a non null *data_ac.
2025 */
2026 }
2027
2028 return ret;
2029}
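
A minimal caller sketch for ocfs2_lock_allocators(), mirroring ocfs2_xattr_extend_allocation() later in this patch (inode, et and clusters_to_add are assumed to be set up by the caller):

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	int ret;

	/* Reserve both allocators before opening a journal handle; on
	 * failure, *meta_ac is freed and *data_ac is left NULL, per the
	 * cleanup path above. */
	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				    &data_ac, &meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	/* ... ocfs2_start_trans(), ocfs2_add_clusters_in_btree(), ... */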
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662bd..4df159d8f450 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
28 28
29typedef int (group_search_t)(struct inode *, 29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 30 struct buffer_head *,
31 u32,
32 u32,
33 u16 *,
34 u16 *);
31 u32, /* bits_wanted */
32 u32, /* min_bits */
33 u64, /* max_block */
34 u16 *, /* *bit_off */
35 u16 *); /* *bits_found */
35 36
36struct ocfs2_alloc_context { 37struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 38 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
51 group_search_t *ac_group_search; 52 group_search_t *ac_group_search;
52 53
53 u64 ac_last_group; 54 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 the same as ~0 - unlimited */
54}; 57};
55 58
56void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
59 return ac->ac_bits_wanted - ac->ac_bits_given; 62 return ac->ac_bits_wanted - ac->ac_bits_given;
60} 63}
61 64
65/*
66 * Please note that the caller must make sure that root_el is the root
67 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
68 * the result may be wrong.
69 */
62int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 70int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
63 struct ocfs2_dinode *fe,
71 struct ocfs2_extent_list *root_el,
64 struct ocfs2_alloc_context **ac); 72 struct ocfs2_alloc_context **ac);
73int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
74 int blocks,
75 struct ocfs2_alloc_context **ac);
65int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 76int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
66 struct ocfs2_alloc_context **ac); 77 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb, 78int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147 * apis above. */ 158 * apis above. */
148int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 159int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
149 struct ocfs2_alloc_context *ac); 160 struct ocfs2_alloc_context *ac);
161void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
150 162
151/* given a cluster offset, calculate which block group it belongs to 163/* given a cluster offset, calculate which block group it belongs to
152 * and return that block offset. */ 164 * and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
156int ocfs2_check_group_descriptor(struct super_block *sb, 168int ocfs2_check_group_descriptor(struct super_block *sb,
157 struct ocfs2_dinode *di, 169 struct ocfs2_dinode *di,
158 struct ocfs2_group_desc *gd); 170 struct ocfs2_group_desc *gd);
171int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
172 u32 clusters_to_add, u32 extents_to_split,
173 struct ocfs2_alloc_context **data_ac,
174 struct ocfs2_alloc_context **meta_ac);
159#endif /* _CHAINALLOC_H_ */ 175#endif /* _CHAINALLOC_H_ */
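
Both group-search callbacks in this patch open-code the ac_max_block test, where 0 means "no limit". A hypothetical helper (not part of the patch) that captures the convention:

	/* Hypothetical: ac_max_block == 0 behaves like ~0 (unlimited). */
	static inline int ocfs2_blkoff_within_max(u64 blkoff, u64 max_block)
	{
		return !max_block || blkoff <= max_block;
	}

The searches return -ENOSPC when the candidate range ends past the limit.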
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 88255d3f52b4..304b63ac78cf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
64#include "sysfile.h" 64#include "sysfile.h"
65#include "uptodate.h" 65#include "uptodate.h"
66#include "ver.h" 66#include "ver.h"
67#include "xattr.h"
67 68
68#include "buffer_head_io.h" 69#include "buffer_head_io.h"
69 70
@@ -154,10 +155,13 @@ enum {
154 Opt_localalloc, 155 Opt_localalloc,
155 Opt_localflocks, 156 Opt_localflocks,
156 Opt_stack, 157 Opt_stack,
158 Opt_user_xattr,
159 Opt_nouser_xattr,
160 Opt_inode64,
157 Opt_err, 161 Opt_err,
158}; 162};
159 163
160static match_table_t tokens = {
164static const match_table_t tokens = {
161 {Opt_barrier, "barrier=%u"}, 165 {Opt_barrier, "barrier=%u"},
162 {Opt_err_panic, "errors=panic"}, 166 {Opt_err_panic, "errors=panic"},
163 {Opt_err_ro, "errors=remount-ro"}, 167 {Opt_err_ro, "errors=remount-ro"},
@@ -173,6 +177,9 @@ static match_table_t tokens = {
173 {Opt_localalloc, "localalloc=%d"}, 177 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 178 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"}, 179 {Opt_stack, "cluster_stack=%s"},
180 {Opt_user_xattr, "user_xattr"},
181 {Opt_nouser_xattr, "nouser_xattr"},
182 {Opt_inode64, "inode64"},
176 {Opt_err, NULL} 183 {Opt_err, NULL}
177}; 184};
178 185
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
205 ocfs2_schedule_truncate_log_flush(osb, 0); 212 ocfs2_schedule_truncate_log_flush(osb, 0);
206 } 213 }
207 214
208 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
215 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
216 &target)) {
209 if (wait) 217 if (wait)
210 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
218 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
211 target); 219 target);
212 } 220 }
213 return 0; 221 return 0;
214} 222}
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
325 if (!oi) 333 if (!oi)
326 return NULL; 334 return NULL;
327 335
336 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
328 return &oi->vfs_inode; 337 return &oi->vfs_inode;
329} 338}
330 339
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
406 goto out; 415 goto out;
407 } 416 }
408 417
418 /* Probably don't want this on remount; it might
419 * mess with other nodes */
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
421 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
422 ret = -EINVAL;
423 mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
424 goto out;
425 }
426
409 /* We're going to/from readonly mode. */ 427 /* We're going to/from readonly mode. */
410 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 428 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
411 /* Lock here so the check of HARD_RO and the potential 429 /* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
637 osb->s_atime_quantum = parsed_options.atime_quantum; 655 osb->s_atime_quantum = parsed_options.atime_quantum;
638 osb->preferred_slot = parsed_options.slot; 656 osb->preferred_slot = parsed_options.slot;
639 osb->osb_commit_interval = parsed_options.commit_interval; 657 osb->osb_commit_interval = parsed_options.commit_interval;
640 osb->local_alloc_size = parsed_options.localalloc_opt;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits;
641 660
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 661 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status) 662 if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
743 return status; 762 return status;
744 763
745read_super_error: 764read_super_error:
746 if (bh != NULL)
747 brelse(bh);
765 brelse(bh);
748 766
749 if (inode) 767 if (inode)
750 iput(inode); 768 iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
847 case Opt_data_writeback: 865 case Opt_data_writeback:
848 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 866 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
849 break; 867 break;
868 case Opt_user_xattr:
869 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
870 break;
871 case Opt_nouser_xattr:
872 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
873 break;
850 case Opt_atime_quantum: 874 case Opt_atime_quantum:
851 if (match_int(&args[0], &option)) { 875 if (match_int(&args[0], &option)) {
852 status = 0; 876 status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
873 if (option < 0) 897 if (option < 0)
874 return 0; 898 return 0;
875 if (option == 0) 899 if (option == 0)
876 option = JBD_DEFAULT_MAX_COMMIT_AGE;
900 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
877 mopt->commit_interval = HZ * option; 901 mopt->commit_interval = HZ * option;
878 break; 902 break;
879 case Opt_localalloc: 903 case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
918 OCFS2_STACK_LABEL_LEN); 942 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 943 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break; 944 break;
945 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break;
921 default: 948 default:
922 mlog(ML_ERROR, 949 mlog(ML_ERROR,
923 "Unrecognized mount option \"%s\" " 950 "Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
938{ 965{
939 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); 966 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
940 unsigned long opts = osb->s_mount_opt; 967 unsigned long opts = osb->s_mount_opt;
968 unsigned int local_alloc_megs;
941 969
942 if (opts & OCFS2_MOUNT_HB_LOCAL) 970 if (opts & OCFS2_MOUNT_HB_LOCAL)
943 seq_printf(s, ",_netdev,heartbeat=local"); 971 seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
970 seq_printf(s, ",commit=%u", 998 seq_printf(s, ",commit=%u",
971 (unsigned) (osb->osb_commit_interval / HZ)); 999 (unsigned) (osb->osb_commit_interval / HZ));
972 1000
973 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
974 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
1001 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1002 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
1003 seq_printf(s, ",localalloc=%d", local_alloc_megs);
975 1004
976 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1005 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
977 seq_printf(s, ",localflocks,"); 1006 seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack); 1010 osb->osb_cluster_stack);
982 1011
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr");
1014 else
1015 seq_printf(s, ",user_xattr");
1016
1017 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64");
1019
983 return 0; 1020 return 0;
984} 1021}
985 1022
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
1132 oi->ip_dir_start_lookup = 0; 1169 oi->ip_dir_start_lookup = 0;
1133 1170
1134 init_rwsem(&oi->ip_alloc_sem); 1171 init_rwsem(&oi->ip_alloc_sem);
1172 init_rwsem(&oi->ip_xattr_sem);
1135 mutex_init(&oi->ip_io_mutex); 1173 mutex_init(&oi->ip_io_mutex);
1136 1174
1137 oi->ip_blkno = 0ULL; 1175 oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1375 sb->s_fs_info = osb; 1413 sb->s_fs_info = osb;
1376 sb->s_op = &ocfs2_sops; 1414 sb->s_op = &ocfs2_sops;
1377 sb->s_export_op = &ocfs2_export_ops; 1415 sb->s_export_op = &ocfs2_export_ops;
1416 sb->s_xattr = ocfs2_xattr_handlers;
1378 sb->s_time_gran = 1; 1417 sb->s_time_gran = 1;
1379 sb->s_flags |= MS_NOATIME; 1418 sb->s_flags |= MS_NOATIME;
1380 /* this is needed to support O_LARGEFILE */ 1419 /* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1421 1460
1422 osb->slot_num = OCFS2_INVALID_SLOT; 1461 osb->slot_num = OCFS2_INVALID_SLOT;
1423 1462
1463 osb->s_xattr_inline_size = le16_to_cpu(
1464 di->id2.i_super.s_xattr_inline_size);
1465
1424 osb->local_alloc_state = OCFS2_LA_UNUSED; 1466 osb->local_alloc_state = OCFS2_LA_UNUSED;
1425 osb->local_alloc_bh = NULL; 1467 osb->local_alloc_bh = NULL;
1468 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
1426 1469
1427 init_waitqueue_head(&osb->osb_mount_event); 1470 init_waitqueue_head(&osb->osb_mount_event);
1428 1471
@@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1568 osb->first_cluster_group_blkno = 1611 osb->first_cluster_group_blkno =
1569 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 1612 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1570 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 1613 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1614 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1571 mlog(0, "vol_label: %s\n", osb->vol_label); 1615 mlog(0, "vol_label: %s\n", osb->vol_label);
1572 mlog(0, "uuid: %s\n", osb->uuid_str); 1616 mlog(0, "uuid: %s\n", osb->uuid_str);
1573 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 1617 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
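
The localalloc option is now stored internally in clusters and converted back to megabytes only for ocfs2_show_options(). A worked example of the round trip, assuming 4KB clusters (s_clustersize_bits == 12), localalloc=8, and that ocfs2_megabytes_to_clusters() is the inverse left shift:

	/* mount path: megabytes -> clusters */
	local_alloc_default_bits = 8 << (20 - 12);	/* 2048 clusters */
	/* show_options path: clusters -> megabytes */
	local_alloc_megs = 2048 >> (20 - 12);		/* 8 MB again */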
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25b..cbd03dfdc7b9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
50#include "inode.h" 50#include "inode.h"
51#include "journal.h" 51#include "journal.h"
52#include "symlink.h" 52#include "symlink.h"
53#include "xattr.h"
53 54
54#include "buffer_head_io.h" 55#include "buffer_head_io.h"
55 56
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
83 84
84 mlog_entry_void(); 85 mlog_entry_void();
85 86
86 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
87 OCFS2_I(inode)->ip_blkno,
88 bh,
89 OCFS2_BH_CACHED,
90 inode);
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
91 if (status < 0) { 88 if (status < 0) {
92 mlog_errno(status); 89 mlog_errno(status);
93 link = ERR_PTR(status); 90 link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
157 kunmap(page); 154 kunmap(page);
158 page_cache_release(page); 155 page_cache_release(page);
159 } 156 }
160 if (bh)
161 brelse(bh);
157 brelse(bh);
162 158
163 return ERR_PTR(status); 159 return ERR_PTR(status);
164} 160}
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
168 .follow_link = ocfs2_follow_link, 164 .follow_link = ocfs2_follow_link,
169 .getattr = ocfs2_getattr, 165 .getattr = ocfs2_getattr,
170 .setattr = ocfs2_setattr, 166 .setattr = ocfs2_setattr,
167 .setxattr = generic_setxattr,
168 .getxattr = generic_getxattr,
169 .listxattr = ocfs2_listxattr,
170 .removexattr = generic_removexattr,
171}; 171};
172const struct inode_operations ocfs2_fast_symlink_inode_operations = { 172const struct inode_operations ocfs2_fast_symlink_inode_operations = {
173 .readlink = ocfs2_readlink, 173 .readlink = ocfs2_readlink,
174 .follow_link = ocfs2_follow_link, 174 .follow_link = ocfs2_follow_link,
175 .getattr = ocfs2_getattr, 175 .getattr = ocfs2_getattr,
176 .setattr = ocfs2_setattr, 176 .setattr = ocfs2_setattr,
177 .setxattr = generic_setxattr,
178 .getxattr = generic_getxattr,
179 .listxattr = ocfs2_listxattr,
180 .removexattr = generic_removexattr,
177}; 181};
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b23..187b99ff0368 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56#include <linux/jbd.h>
56#ifndef CONFIG_OCFS2_COMPAT_JBD
57# include <linux/jbd2.h>
58#else
59# include <linux/jbd.h>
60#endif
57 61
58#define MLOG_MASK_PREFIX ML_UPTODATE 62#define MLOG_MASK_PREFIX ML_UPTODATE
59 63
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
511 ci->ci_num_cached--; 515 ci->ci_num_cached--;
512} 516}
513 517
514/* Called when we remove a chunk of metadata from an inode. We don't
515 * bother reverting things to an inlined array in the case of a remove
516 * which moves us back under the limit. */
517void ocfs2_remove_from_cache(struct inode *inode,
518 struct buffer_head *bh)
518static void ocfs2_remove_block_from_cache(struct inode *inode,
519 sector_t block)
519{ 520{
520 int index; 521 int index;
521 sector_t block = bh->b_blocknr;
522 struct ocfs2_meta_cache_item *item = NULL; 522 struct ocfs2_meta_cache_item *item = NULL;
523 struct ocfs2_inode_info *oi = OCFS2_I(inode); 523 struct ocfs2_inode_info *oi = OCFS2_I(inode);
524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
544 kmem_cache_free(ocfs2_uptodate_cachep, item); 544 kmem_cache_free(ocfs2_uptodate_cachep, item);
545} 545}
546 546
547/*
548 * Called when we remove a chunk of metadata from an inode. We don't
549 * bother reverting things to an inlined array in the case of a remove
550 * which moves us back under the limit.
551 */
552void ocfs2_remove_from_cache(struct inode *inode,
553 struct buffer_head *bh)
554{
555 sector_t block = bh->b_blocknr;
556
557 ocfs2_remove_block_from_cache(inode, block);
558}
559
560/* Called when we remove xattr clusters from an inode. */
561void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
562 sector_t block,
563 u32 c_len)
564{
565 unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
566
567 for (i = 0; i < b_len; i++, block++)
568 ocfs2_remove_block_from_cache(inode, block);
569}
570
547int __init init_ocfs2_uptodate_cache(void) 571int __init init_ocfs2_uptodate_cache(void)
548{ 572{
549 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", 573 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
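
The new ocfs2_remove_xattr_clusters_from_cache() walks a cluster run block by block. Its only caller in this patch, ocfs2_xattr_shrink_size() in xattr.c below, uses it like this:

	/* From the xattr truncate path later in this patch: map the
	 * freed run's first cluster to a block number, then purge every
	 * block of the run from the metadata cache. */
	block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
	ocfs2_remove_xattr_clusters_from_cache(inode, block, alloc_size);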
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a8..531b4b3a0c47 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh); 40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh); 42 struct buffer_head *bh);
43void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
44 sector_t block,
45 u32 c_len);
43int ocfs2_buffer_read_ahead(struct inode *inode, 46int ocfs2_buffer_read_ahead(struct inode *inode,
44 struct buffer_head *bh); 47 struct buffer_head *bh);
45 48
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 000000000000..c25780a70dfd
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4834 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.c
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * CREDITS:
9 * Lots of code in this file is taken from ext3.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/capability.h>
28#include <linux/fs.h>
29#include <linux/types.h>
30#include <linux/slab.h>
31#include <linux/highmem.h>
32#include <linux/pagemap.h>
33#include <linux/uio.h>
34#include <linux/sched.h>
35#include <linux/splice.h>
36#include <linux/mount.h>
37#include <linux/writeback.h>
38#include <linux/falloc.h>
39#include <linux/sort.h>
40#include <linux/init.h>
41#include <linux/module.h>
42#include <linux/string.h>
43
44#define MLOG_MASK_PREFIX ML_XATTR
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48#include "alloc.h"
49#include "dlmglue.h"
50#include "file.h"
51#include "symlink.h"
52#include "sysfile.h"
53#include "inode.h"
54#include "journal.h"
55#include "ocfs2_fs.h"
56#include "suballoc.h"
57#include "uptodate.h"
58#include "buffer_head_io.h"
59#include "super.h"
60#include "xattr.h"
61
62
63struct ocfs2_xattr_def_value_root {
64 struct ocfs2_xattr_value_root xv;
65 struct ocfs2_extent_rec er;
66};
67
68struct ocfs2_xattr_bucket {
69 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
70 struct ocfs2_xattr_header *xh;
71};
72
73#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
74#define OCFS2_XATTR_INLINE_SIZE 80
75
76static struct ocfs2_xattr_def_value_root def_xv = {
77 .xv.xr_list.l_count = cpu_to_le16(1),
78};
79
80struct xattr_handler *ocfs2_xattr_handlers[] = {
81 &ocfs2_xattr_user_handler,
82 &ocfs2_xattr_trusted_handler,
83 NULL
84};
85
86static struct xattr_handler *ocfs2_xattr_handler_map[] = {
87 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
88 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
89};
90
91struct ocfs2_xattr_info {
92 int name_index;
93 const char *name;
94 const void *value;
95 size_t value_len;
96};
97
98struct ocfs2_xattr_search {
99 struct buffer_head *inode_bh;
100 /*
101 * xattr_bh point to the block buffer head which has extended attribute
102 * when extended attribute in inode, xattr_bh is equal to inode_bh.
103 */
104 struct buffer_head *xattr_bh;
105 struct ocfs2_xattr_header *header;
106 struct ocfs2_xattr_bucket bucket;
107 void *base;
108 void *end;
109 struct ocfs2_xattr_entry *here;
110 int not_found;
111};
112
113static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
114 struct ocfs2_xattr_header *xh,
115 int index,
116 int *block_off,
117 int *new_offset);
118
119static int ocfs2_xattr_index_block_find(struct inode *inode,
120 struct buffer_head *root_bh,
121 int name_index,
122 const char *name,
123 struct ocfs2_xattr_search *xs);
124
125static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
126 struct ocfs2_xattr_tree_root *xt,
127 char *buffer,
128 size_t buffer_size);
129
130static int ocfs2_xattr_create_index_block(struct inode *inode,
131 struct ocfs2_xattr_search *xs);
132
133static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
134 struct ocfs2_xattr_info *xi,
135 struct ocfs2_xattr_search *xs);
136
137static int ocfs2_delete_xattr_index_block(struct inode *inode,
138 struct buffer_head *xb_bh);
139
140static inline const char *ocfs2_xattr_prefix(int name_index)
141{
142 struct xattr_handler *handler = NULL;
143
144 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
145 handler = ocfs2_xattr_handler_map[name_index];
146
147 return handler ? handler->prefix : NULL;
148}
149
150static u32 ocfs2_xattr_name_hash(struct inode *inode,
151 const char *name,
152 int name_len)
153{
154 /* Get hash value of uuid from super block */
155 u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
156 int i;
157
158 /* hash extended attribute name */
159 for (i = 0; i < name_len; i++) {
160 hash = (hash << OCFS2_HASH_SHIFT) ^
161 (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
162 *name++;
163 }
164
165 return hash;
166}
167
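
The pair of shifts above amounts to a 32-bit left rotation XOR-folded with each name byte. An equivalent per-byte step using the kernel's rol32() from <linux/bitops.h> (a sketch, assuming 0 < OCFS2_HASH_SHIFT < 32):

	/* Rotate the running hash, then fold in the next name byte. */
	hash = rol32(hash, OCFS2_HASH_SHIFT) ^ *name++;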
168/*
169 * ocfs2_xattr_hash_entry()
170 *
171 * Compute the hash of an extended attribute.
172 */
173static void ocfs2_xattr_hash_entry(struct inode *inode,
174 struct ocfs2_xattr_header *header,
175 struct ocfs2_xattr_entry *entry)
176{
177 u32 hash = 0;
178 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
179
180 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
181 entry->xe_name_hash = cpu_to_le32(hash);
182
183 return;
184}
185
186static int ocfs2_xattr_extend_allocation(struct inode *inode,
187 u32 clusters_to_add,
188 struct buffer_head *xattr_bh,
189 struct ocfs2_xattr_value_root *xv)
190{
191 int status = 0;
192 int restart_func = 0;
193 int credits = 0;
194 handle_t *handle = NULL;
195 struct ocfs2_alloc_context *data_ac = NULL;
196 struct ocfs2_alloc_context *meta_ac = NULL;
197 enum ocfs2_alloc_restarted why;
198 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
199 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
200 struct ocfs2_extent_tree et;
201
202 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
203
204 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
205
206restart_all:
207
208 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
209 &data_ac, &meta_ac);
210 if (status) {
211 mlog_errno(status);
212 goto leave;
213 }
214
215 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
216 clusters_to_add);
217 handle = ocfs2_start_trans(osb, credits);
218 if (IS_ERR(handle)) {
219 status = PTR_ERR(handle);
220 handle = NULL;
221 mlog_errno(status);
222 goto leave;
223 }
224
225restarted_transaction:
226 status = ocfs2_journal_access(handle, inode, xattr_bh,
227 OCFS2_JOURNAL_ACCESS_WRITE);
228 if (status < 0) {
229 mlog_errno(status);
230 goto leave;
231 }
232
233 prev_clusters = le32_to_cpu(xv->xr_clusters);
234 status = ocfs2_add_clusters_in_btree(osb,
235 inode,
236 &logical_start,
237 clusters_to_add,
238 0,
239 &et,
240 handle,
241 data_ac,
242 meta_ac,
243 &why);
244 if ((status < 0) && (status != -EAGAIN)) {
245 if (status != -ENOSPC)
246 mlog_errno(status);
247 goto leave;
248 }
249
250 status = ocfs2_journal_dirty(handle, xattr_bh);
251 if (status < 0) {
252 mlog_errno(status);
253 goto leave;
254 }
255
256 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
257
258 if (why != RESTART_NONE && clusters_to_add) {
259 if (why == RESTART_META) {
260 mlog(0, "restarting function.\n");
261 restart_func = 1;
262 } else {
263 BUG_ON(why != RESTART_TRANS);
264
265 mlog(0, "restarting transaction.\n");
266 /* TODO: This can be more intelligent. */
267 credits = ocfs2_calc_extend_credits(osb->sb,
268 et.et_root_el,
269 clusters_to_add);
270 status = ocfs2_extend_trans(handle, credits);
271 if (status < 0) {
272 /* handle still has to be committed at
273 * this point. */
274 status = -ENOMEM;
275 mlog_errno(status);
276 goto leave;
277 }
278 goto restarted_transaction;
279 }
280 }
281
282leave:
283 if (handle) {
284 ocfs2_commit_trans(osb, handle);
285 handle = NULL;
286 }
287 if (data_ac) {
288 ocfs2_free_alloc_context(data_ac);
289 data_ac = NULL;
290 }
291 if (meta_ac) {
292 ocfs2_free_alloc_context(meta_ac);
293 meta_ac = NULL;
294 }
295 if ((!status) && restart_func) {
296 restart_func = 0;
297 goto restart_all;
298 }
299
300 return status;
301}
302
303static int __ocfs2_remove_xattr_range(struct inode *inode,
304 struct buffer_head *root_bh,
305 struct ocfs2_xattr_value_root *xv,
306 u32 cpos, u32 phys_cpos, u32 len,
307 struct ocfs2_cached_dealloc_ctxt *dealloc)
308{
309 int ret;
310 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
312 struct inode *tl_inode = osb->osb_tl_inode;
313 handle_t *handle;
314 struct ocfs2_alloc_context *meta_ac = NULL;
315 struct ocfs2_extent_tree et;
316
317 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
318
319 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
320 if (ret) {
321 mlog_errno(ret);
322 return ret;
323 }
324
325 mutex_lock(&tl_inode->i_mutex);
326
327 if (ocfs2_truncate_log_needs_flush(osb)) {
328 ret = __ocfs2_flush_truncate_log(osb);
329 if (ret < 0) {
330 mlog_errno(ret);
331 goto out;
332 }
333 }
334
335 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
336 if (IS_ERR(handle)) {
337 ret = PTR_ERR(handle);
338 mlog_errno(ret);
339 goto out;
340 }
341
342 ret = ocfs2_journal_access(handle, inode, root_bh,
343 OCFS2_JOURNAL_ACCESS_WRITE);
344 if (ret) {
345 mlog_errno(ret);
346 goto out_commit;
347 }
348
349 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
350 dealloc);
351 if (ret) {
352 mlog_errno(ret);
353 goto out_commit;
354 }
355
356 le32_add_cpu(&xv->xr_clusters, -len);
357
358 ret = ocfs2_journal_dirty(handle, root_bh);
359 if (ret) {
360 mlog_errno(ret);
361 goto out_commit;
362 }
363
364 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
365 if (ret)
366 mlog_errno(ret);
367
368out_commit:
369 ocfs2_commit_trans(osb, handle);
370out:
371 mutex_unlock(&tl_inode->i_mutex);
372
373 if (meta_ac)
374 ocfs2_free_alloc_context(meta_ac);
375
376 return ret;
377}
378
379static int ocfs2_xattr_shrink_size(struct inode *inode,
380 u32 old_clusters,
381 u32 new_clusters,
382 struct buffer_head *root_bh,
383 struct ocfs2_xattr_value_root *xv)
384{
385 int ret = 0;
386 u32 trunc_len, cpos, phys_cpos, alloc_size;
387 u64 block;
388 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
389 struct ocfs2_cached_dealloc_ctxt dealloc;
390
391 ocfs2_init_dealloc_ctxt(&dealloc);
392
393 if (old_clusters <= new_clusters)
394 return 0;
395
396 cpos = new_clusters;
397 trunc_len = old_clusters - new_clusters;
398 while (trunc_len) {
399 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
400 &alloc_size, &xv->xr_list);
401 if (ret) {
402 mlog_errno(ret);
403 goto out;
404 }
405
406 if (alloc_size > trunc_len)
407 alloc_size = trunc_len;
408
409 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
410 phys_cpos, alloc_size,
411 &dealloc);
412 if (ret) {
413 mlog_errno(ret);
414 goto out;
415 }
416
417 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
418 ocfs2_remove_xattr_clusters_from_cache(inode, block,
419 alloc_size);
420 cpos += alloc_size;
421 trunc_len -= alloc_size;
422 }
423
424out:
425 ocfs2_schedule_truncate_log_flush(osb, 1);
426 ocfs2_run_deallocs(osb, &dealloc);
427
428 return ret;
429}
430
431static int ocfs2_xattr_value_truncate(struct inode *inode,
432 struct buffer_head *root_bh,
433 struct ocfs2_xattr_value_root *xv,
434 int len)
435{
436 int ret;
437 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
438 u32 old_clusters = le32_to_cpu(xv->xr_clusters);
439
440 if (new_clusters == old_clusters)
441 return 0;
442
443 if (new_clusters > old_clusters)
444 ret = ocfs2_xattr_extend_allocation(inode,
445 new_clusters - old_clusters,
446 root_bh, xv);
447 else
448 ret = ocfs2_xattr_shrink_size(inode,
449 old_clusters, new_clusters,
450 root_bh, xv);
451
452 return ret;
453}
454
455static int ocfs2_xattr_list_entry(char *buffer, size_t size,
456 size_t *result, const char *prefix,
457 const char *name, int name_len)
458{
459 char *p = buffer + *result;
460 int prefix_len = strlen(prefix);
461 int total_len = prefix_len + name_len + 1;
462
463 *result += total_len;
464
465 /* we are just looking for how big our buffer needs to be */
466 if (!size)
467 return 0;
468
469 if (*result > size)
470 return -ERANGE;
471
472 memcpy(p, prefix, prefix_len);
473 memcpy(p + prefix_len, name, name_len);
474 p[prefix_len + name_len] = '\0';
475
476 return 0;
477}
478
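The size == 0 probe above is what lets callers size a buffer before fetching the list. At the syscall level this becomes the usual two-pass pattern (userspace sketch, error handling elided):

	#include <sys/xattr.h>
	#include <stdlib.h>

	/* Pass 1: a NULL buffer returns the required length. */
	ssize_t len = listxattr(path, NULL, 0);
	char *names = malloc(len);
	/* Pass 2: fill the buffer with NUL-separated attribute names. */
	len = listxattr(path, names, len);
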
479static int ocfs2_xattr_list_entries(struct inode *inode,
480 struct ocfs2_xattr_header *header,
481 char *buffer, size_t buffer_size)
482{
483 size_t result = 0;
484 int i, type, ret;
485 const char *prefix, *name;
486
487 for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
488 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
489 type = ocfs2_xattr_get_type(entry);
490 prefix = ocfs2_xattr_prefix(type);
491
492 if (prefix) {
493 name = (const char *)header +
494 le16_to_cpu(entry->xe_name_offset);
495
496 ret = ocfs2_xattr_list_entry(buffer, buffer_size,
497 &result, prefix, name,
498 entry->xe_name_len);
499 if (ret)
500 return ret;
501 }
502 }
503
504 return result;
505}
506
507static int ocfs2_xattr_ibody_list(struct inode *inode,
508 struct ocfs2_dinode *di,
509 char *buffer,
510 size_t buffer_size)
511{
512 struct ocfs2_xattr_header *header = NULL;
513 struct ocfs2_inode_info *oi = OCFS2_I(inode);
514 int ret = 0;
515
516 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
517 return ret;
518
519 header = (struct ocfs2_xattr_header *)
520 ((void *)di + inode->i_sb->s_blocksize -
521 le16_to_cpu(di->i_xattr_inline_size));
522
523 ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
524
525 return ret;
526}
527
528static int ocfs2_xattr_block_list(struct inode *inode,
529 struct ocfs2_dinode *di,
530 char *buffer,
531 size_t buffer_size)
532{
533 struct buffer_head *blk_bh = NULL;
534 struct ocfs2_xattr_block *xb;
535 int ret = 0;
536
537 if (!di->i_xattr_loc)
538 return ret;
539
540 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
541 if (ret < 0) {
542 mlog_errno(ret);
543 return ret;
544 }
545 /* Verify the signature of the xattr block. */
546 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
547 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
548 ret = -EFAULT;
549 goto cleanup;
550 }
551
552 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
553
554 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
555 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
556 ret = ocfs2_xattr_list_entries(inode, header,
557 buffer, buffer_size);
558 } else {
559 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
560 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
561 buffer, buffer_size);
562 }
563cleanup:
564 brelse(blk_bh);
565
566 return ret;
567}
568
569ssize_t ocfs2_listxattr(struct dentry *dentry,
570 char *buffer,
571 size_t size)
572{
573 int ret = 0, i_ret = 0, b_ret = 0;
574 struct buffer_head *di_bh = NULL;
575 struct ocfs2_dinode *di = NULL;
576 struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
577
578 if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
579 return -EOPNOTSUPP;
580
581 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
582 return ret;
583
584 ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
585 if (ret < 0) {
586 mlog_errno(ret);
587 return ret;
588 }
589
590 di = (struct ocfs2_dinode *)di_bh->b_data;
591
592 down_read(&oi->ip_xattr_sem);
593 i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
594 if (i_ret < 0)
595 b_ret = 0;
596 else {
597 if (buffer) {
598 buffer += i_ret;
599 size -= i_ret;
600 }
601 b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
602 buffer, size);
603 if (b_ret < 0)
604 i_ret = 0;
605 }
606 up_read(&oi->ip_xattr_sem);
607 ocfs2_inode_unlock(dentry->d_inode, 0);
608
609 brelse(di_bh);
610
611 return i_ret + b_ret;
612}
613
614static int ocfs2_xattr_find_entry(int name_index,
615 const char *name,
616 struct ocfs2_xattr_search *xs)
617{
618 struct ocfs2_xattr_entry *entry;
619 size_t name_len;
620 int i, cmp = 1;
621
622 if (name == NULL)
623 return -EINVAL;
624
625 name_len = strlen(name);
626 entry = xs->here;
627 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
628 cmp = name_index - ocfs2_xattr_get_type(entry);
629 if (!cmp)
630 cmp = name_len - entry->xe_name_len;
631 if (!cmp)
632 cmp = memcmp(name, (xs->base +
633 le16_to_cpu(entry->xe_name_offset)),
634 name_len);
635 if (cmp == 0)
636 break;
637 entry += 1;
638 }
639 xs->here = entry;
640
641 return cmp ? -ENODATA : 0;
642}
643
644static int ocfs2_xattr_get_value_outside(struct inode *inode,
645 struct ocfs2_xattr_value_root *xv,
646 void *buffer,
647 size_t len)
648{
649 u32 cpos, p_cluster, num_clusters, bpc, clusters;
650 u64 blkno;
651 int i, ret = 0;
652 size_t cplen, blocksize;
653 struct buffer_head *bh = NULL;
654 struct ocfs2_extent_list *el;
655
656 el = &xv->xr_list;
657 clusters = le32_to_cpu(xv->xr_clusters);
658 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
659 blocksize = inode->i_sb->s_blocksize;
660
661 cpos = 0;
662 while (cpos < clusters) {
663 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
664 &num_clusters, el);
665 if (ret) {
666 mlog_errno(ret);
667 goto out;
668 }
669
670 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
671 /* Copy ocfs2_xattr_value */
672 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
673 ret = ocfs2_read_block(inode, blkno, &bh);
674 if (ret) {
675 mlog_errno(ret);
676 goto out;
677 }
678
679 cplen = len >= blocksize ? blocksize : len;
680 memcpy(buffer, bh->b_data, cplen);
681 len -= cplen;
682 buffer += cplen;
683
684 brelse(bh);
685 bh = NULL;
686 if (len == 0)
687 break;
688 }
689 cpos += num_clusters;
690 }
691out:
692 return ret;
693}
694
695static int ocfs2_xattr_ibody_get(struct inode *inode,
696 int name_index,
697 const char *name,
698 void *buffer,
699 size_t buffer_size,
700 struct ocfs2_xattr_search *xs)
701{
702 struct ocfs2_inode_info *oi = OCFS2_I(inode);
703 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
704 struct ocfs2_xattr_value_root *xv;
705 size_t size;
706 int ret = 0;
707
708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
709 return -ENODATA;
710
711 xs->end = (void *)di + inode->i_sb->s_blocksize;
712 xs->header = (struct ocfs2_xattr_header *)
713 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
714 xs->base = (void *)xs->header;
715 xs->here = xs->header->xh_entries;
716
717 ret = ocfs2_xattr_find_entry(name_index, name, xs);
718 if (ret)
719 return ret;
720 size = le64_to_cpu(xs->here->xe_value_size);
721 if (buffer) {
722 if (size > buffer_size)
723 return -ERANGE;
724 if (ocfs2_xattr_is_local(xs->here)) {
725 memcpy(buffer, (void *)xs->base +
726 le16_to_cpu(xs->here->xe_name_offset) +
727 OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
728 } else {
729 xv = (struct ocfs2_xattr_value_root *)
730 (xs->base + le16_to_cpu(
731 xs->here->xe_name_offset) +
732 OCFS2_XATTR_SIZE(xs->here->xe_name_len));
733 ret = ocfs2_xattr_get_value_outside(inode, xv,
734 buffer, size);
735 if (ret < 0) {
736 mlog_errno(ret);
737 return ret;
738 }
739 }
740 }
741
742 return size;
743}
744
745static int ocfs2_xattr_block_get(struct inode *inode,
746 int name_index,
747 const char *name,
748 void *buffer,
749 size_t buffer_size,
750 struct ocfs2_xattr_search *xs)
751{
752 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
753 struct buffer_head *blk_bh = NULL;
754 struct ocfs2_xattr_block *xb;
755 struct ocfs2_xattr_value_root *xv;
756 size_t size;
757 int ret = -ENODATA, name_offset, name_len, block_off, i;
758
759 if (!di->i_xattr_loc)
760 return ret;
761
762 memset(&xs->bucket, 0, sizeof(xs->bucket));
763
764 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
765 if (ret < 0) {
766 mlog_errno(ret);
767 return ret;
768 }
769 /* Verify the signature of the xattr block. */
770 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
771 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
772 ret = -EFAULT;
773 goto cleanup;
774 }
775
776 xs->xattr_bh = blk_bh;
777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
778
779 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
780 xs->header = &xb->xb_attrs.xb_header;
781 xs->base = (void *)xs->header;
782 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
783 xs->here = xs->header->xh_entries;
784
785 ret = ocfs2_xattr_find_entry(name_index, name, xs);
786 } else
787 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
788 name_index,
789 name, xs);
790
791 if (ret)
792 goto cleanup;
793 size = le64_to_cpu(xs->here->xe_value_size);
794 if (buffer) {
795 ret = -ERANGE;
796 if (size > buffer_size)
797 goto cleanup;
798
799 name_offset = le16_to_cpu(xs->here->xe_name_offset);
800 name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
801 i = xs->here - xs->header->xh_entries;
802
803 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
804 ret = ocfs2_xattr_bucket_get_name_value(inode,
805 xs->bucket.xh,
806 i,
807 &block_off,
808 &name_offset);
809 xs->base = xs->bucket.bhs[block_off]->b_data;
810 }
811 if (ocfs2_xattr_is_local(xs->here)) {
812 memcpy(buffer, (void *)xs->base +
813 name_offset + name_len, size);
814 } else {
815 xv = (struct ocfs2_xattr_value_root *)
816 (xs->base + name_offset + name_len);
817 ret = ocfs2_xattr_get_value_outside(inode, xv,
818 buffer, size);
819 if (ret < 0) {
820 mlog_errno(ret);
821 goto cleanup;
822 }
823 }
824 }
825 ret = size;
826cleanup:
827 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
828 brelse(xs->bucket.bhs[i]);
829 memset(&xs->bucket, 0, sizeof(xs->bucket));
830
831 brelse(blk_bh);
832 return ret;
833}
834
835/* ocfs2_xattr_get()
836 *
837 * Copy an extended attribute into the buffer provided.
838 * Buffer is NULL to compute the size of buffer required.
839 */
840int ocfs2_xattr_get(struct inode *inode,
841 int name_index,
842 const char *name,
843 void *buffer,
844 size_t buffer_size)
845{
846 int ret;
847 struct ocfs2_dinode *di = NULL;
848 struct buffer_head *di_bh = NULL;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850 struct ocfs2_xattr_search xis = {
851 .not_found = -ENODATA,
852 };
853 struct ocfs2_xattr_search xbs = {
854 .not_found = -ENODATA,
855 };
856
857 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
858 return -EOPNOTSUPP;
859
860 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
861 return -ENODATA;
862
863 ret = ocfs2_inode_lock(inode, &di_bh, 0);
864 if (ret < 0) {
865 mlog_errno(ret);
866 return ret;
867 }
868 xis.inode_bh = xbs.inode_bh = di_bh;
869 di = (struct ocfs2_dinode *)di_bh->b_data;
870
871 down_read(&oi->ip_xattr_sem);
872 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
873 buffer_size, &xis);
874 if (ret == -ENODATA)
875 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
876 buffer_size, &xbs);
877 up_read(&oi->ip_xattr_sem);
878 ocfs2_inode_unlock(inode, 0);
879
880 brelse(di_bh);
881
882 return ret;
883}
884
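Per the header comment, a NULL buffer sizes the value first. An in-kernel caller sketch (the attribute name and GFP flags are illustrative):

	/* Pass 1: NULL buffer returns the value size. */
	int size = ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER,
				   "foo", NULL, 0);
	if (size >= 0) {
		void *val = kmalloc(size, GFP_NOFS);

		/* Pass 2: copy the value into the sized buffer. */
		if (val)
			size = ocfs2_xattr_get(inode,
					       OCFS2_XATTR_INDEX_USER,
					       "foo", val, size);
	}
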
885static int __ocfs2_xattr_set_value_outside(struct inode *inode,
886 struct ocfs2_xattr_value_root *xv,
887 const void *value,
888 int value_len)
889{
890 int ret = 0, i, cp_len, credits;
891 u16 blocksize = inode->i_sb->s_blocksize;
892 u32 p_cluster, num_clusters;
893 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
894 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
895 u64 blkno;
896 struct buffer_head *bh = NULL;
897 handle_t *handle;
898
899 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
900
901 credits = clusters * bpc;
902 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
903 if (IS_ERR(handle)) {
904 ret = PTR_ERR(handle);
905 mlog_errno(ret);
906 goto out;
907 }
908
909 while (cpos < clusters) {
910 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
911 &num_clusters, &xv->xr_list);
912 if (ret) {
913 mlog_errno(ret);
914 goto out_commit;
915 }
916
917 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
918
919 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
920 ret = ocfs2_read_block(inode, blkno, &bh);
921 if (ret) {
922 mlog_errno(ret);
923 goto out_commit;
924 }
925
926 ret = ocfs2_journal_access(handle,
927 inode,
928 bh,
929 OCFS2_JOURNAL_ACCESS_WRITE);
930 if (ret < 0) {
931 mlog_errno(ret);
932 goto out_commit;
933 }
934
935 cp_len = value_len > blocksize ? blocksize : value_len;
936 memcpy(bh->b_data, value, cp_len);
937 value_len -= cp_len;
938 value += cp_len;
939 if (cp_len < blocksize)
940 memset(bh->b_data + cp_len, 0,
941 blocksize - cp_len);
942
943 ret = ocfs2_journal_dirty(handle, bh);
944 if (ret < 0) {
945 mlog_errno(ret);
946 goto out_commit;
947 }
948 brelse(bh);
949 bh = NULL;
950
951 /*
952 * XXX: do we need to empty all the following
953 * blocks in this cluster?
954 */
955 if (!value_len)
956 break;
957 }
958 cpos += num_clusters;
959 }
960out_commit:
961 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
962out:
963 brelse(bh);
964
965 return ret;
966}
967
968static int ocfs2_xattr_cleanup(struct inode *inode,
969 struct ocfs2_xattr_info *xi,
970 struct ocfs2_xattr_search *xs,
971 size_t offs)
972{
973 handle_t *handle = NULL;
974 int ret = 0;
975 size_t name_len = strlen(xi->name);
976 void *val = xs->base + offs;
977 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
978
979 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
980 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
981 if (IS_ERR(handle)) {
982 ret = PTR_ERR(handle);
983 mlog_errno(ret);
984 goto out;
985 }
986 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
987 OCFS2_JOURNAL_ACCESS_WRITE);
988 if (ret) {
989 mlog_errno(ret);
990 goto out_commit;
991 }
992 /* Decrease xattr count */
993 le16_add_cpu(&xs->header->xh_count, -1);
994 /* Remove the xattr entry and the tree root which have already been set. */
995 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
996 memset(val, 0, size);
997
998 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
999 if (ret < 0)
1000 mlog_errno(ret);
1001out_commit:
1002 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1003out:
1004 return ret;
1005}
1006
1007static int ocfs2_xattr_update_entry(struct inode *inode,
1008 struct ocfs2_xattr_info *xi,
1009 struct ocfs2_xattr_search *xs,
1010 size_t offs)
1011{
1012 handle_t *handle = NULL;
1013 int ret = 0;
1014
1015 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1016 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1017 if (IS_ERR(handle)) {
1018 ret = PTR_ERR(handle);
1019 mlog_errno(ret);
1020 goto out;
1021 }
1022 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1023 OCFS2_JOURNAL_ACCESS_WRITE);
1024 if (ret) {
1025 mlog_errno(ret);
1026 goto out_commit;
1027 }
1028
1029 xs->here->xe_name_offset = cpu_to_le16(offs);
1030 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1031 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
1032 ocfs2_xattr_set_local(xs->here, 1);
1033 else
1034 ocfs2_xattr_set_local(xs->here, 0);
1035 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1036
1037 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1038 if (ret < 0)
1039 mlog_errno(ret);
1040out_commit:
1041 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1042out:
1043 return ret;
1044}
1045
1046/*
1047 * ocfs2_xattr_set_value_outside()
1048 *
1049 * Set a large-size value in the B-tree.
1050 */
1051static int ocfs2_xattr_set_value_outside(struct inode *inode,
1052 struct ocfs2_xattr_info *xi,
1053 struct ocfs2_xattr_search *xs,
1054 size_t offs)
1055{
1056 size_t name_len = strlen(xi->name);
1057 void *val = xs->base + offs;
1058 struct ocfs2_xattr_value_root *xv = NULL;
1059 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1060 int ret = 0;
1061
1062 memset(val, 0, size);
1063 memcpy(val, xi->name, name_len);
1064 xv = (struct ocfs2_xattr_value_root *)
1065 (val + OCFS2_XATTR_SIZE(name_len));
1066 xv->xr_clusters = 0;
1067 xv->xr_last_eb_blk = 0;
1068 xv->xr_list.l_tree_depth = 0;
1069 xv->xr_list.l_count = cpu_to_le16(1);
1070 xv->xr_list.l_next_free_rec = 0;
1071
1072 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
1073 xi->value_len);
1074 if (ret < 0) {
1075 mlog_errno(ret);
1076 return ret;
1077 }
1078 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
1079 xi->value_len);
1080 if (ret < 0) {
1081 mlog_errno(ret);
1082 return ret;
1083 }
1084 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
1085 if (ret < 0)
1086 mlog_errno(ret);
1087
1088 return ret;
1089}
1090
1091/*
1092 * ocfs2_xattr_set_entry_local()
1093 *
1094 * Set, replace or remove an extended attribute stored locally (inline).
1095 */
1096static void ocfs2_xattr_set_entry_local(struct inode *inode,
1097 struct ocfs2_xattr_info *xi,
1098 struct ocfs2_xattr_search *xs,
1099 struct ocfs2_xattr_entry *last,
1100 size_t min_offs)
1101{
1102 size_t name_len = strlen(xi->name);
1103 int i;
1104
1105 if (xi->value && xs->not_found) {
1106 /* Insert the new xattr entry. */
1107 le16_add_cpu(&xs->header->xh_count, 1);
1108 ocfs2_xattr_set_type(last, xi->name_index);
1109 ocfs2_xattr_set_local(last, 1);
1110 last->xe_name_len = name_len;
1111 } else {
1112 void *first_val;
1113 void *val;
1114 size_t offs, size;
1115
1116 first_val = xs->base + min_offs;
1117 offs = le16_to_cpu(xs->here->xe_name_offset);
1118 val = xs->base + offs;
1119
1120 if (le64_to_cpu(xs->here->xe_value_size) >
1121 OCFS2_XATTR_INLINE_SIZE)
1122 size = OCFS2_XATTR_SIZE(name_len) +
1123 OCFS2_XATTR_ROOT_SIZE;
1124 else
1125 size = OCFS2_XATTR_SIZE(name_len) +
1126 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1127
1128 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1129 OCFS2_XATTR_SIZE(xi->value_len)) {
1130 /* The old and the new value have the
1131 same size. Just replace the value. */
1132 ocfs2_xattr_set_local(xs->here, 1);
1133 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1134 /* Clear value bytes. */
1135 memset(val + OCFS2_XATTR_SIZE(name_len),
1136 0,
1137 OCFS2_XATTR_SIZE(xi->value_len));
1138 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1139 xi->value,
1140 xi->value_len);
1141 return;
1142 }
1143 /* Remove the old name+value. */
1144 memmove(first_val + size, first_val, val - first_val);
1145 memset(first_val, 0, size);
1146 xs->here->xe_name_hash = 0;
1147 xs->here->xe_name_offset = 0;
1148 ocfs2_xattr_set_local(xs->here, 1);
1149 xs->here->xe_value_size = 0;
1150
1151 min_offs += size;
1152
1153 /* Adjust all value offsets. */
1154 last = xs->header->xh_entries;
1155 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1156 size_t o = le16_to_cpu(last->xe_name_offset);
1157
1158 if (o < offs)
1159 last->xe_name_offset = cpu_to_le16(o + size);
1160 last += 1;
1161 }
1162
1163 if (!xi->value) {
1164 /* Remove the old entry. */
1165 last -= 1;
1166 memmove(xs->here, xs->here + 1,
1167 (void *)last - (void *)xs->here);
1168 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
1169 le16_add_cpu(&xs->header->xh_count, -1);
1170 }
1171 }
1172 if (xi->value) {
1173 /* Insert the new name+value. */
1174 size_t size = OCFS2_XATTR_SIZE(name_len) +
1175 OCFS2_XATTR_SIZE(xi->value_len);
1176 void *val = xs->base + min_offs - size;
1177
1178 xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
1179 memset(val, 0, size);
1180 memcpy(val, xi->name, name_len);
1181 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1182 xi->value,
1183 xi->value_len);
1184 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1185 ocfs2_xattr_set_local(xs->here, 1);
1186 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1187 }
1188
1189 return;
1190}
1191
1192/*
1193 * ocfs2_xattr_set_entry()
1194 *
1195 * Set extended attribute entry into inode or block.
1196 *
1197 * If the extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
1198 * we first insert a tree root (ocfs2_xattr_value_root) with set_entry_local(),
1199 * then set the value in the B-tree with set_value_outside().
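 *
 * If the second step fails, ocfs2_xattr_cleanup() removes the stale
 * tree root left behind by the first step.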
1200 */
1201static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_xattr_info *xi,
1203 struct ocfs2_xattr_search *xs,
1204 int flag)
1205{
1206 struct ocfs2_xattr_entry *last;
1207 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1208 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1209 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1210 size_t size_l = 0;
1211 handle_t *handle = NULL;
1212 int free, i, ret;
1213 struct ocfs2_xattr_info xi_l = {
1214 .name_index = xi->name_index,
1215 .name = xi->name,
1216 .value = xi->value,
1217 .value_len = xi->value_len,
1218 };
1219
1220 /* Compute min_offs, last and free space. */
1221 last = xs->header->xh_entries;
1222
1223 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1224 size_t offs = le16_to_cpu(last->xe_name_offset);
1225 if (offs < min_offs)
1226 min_offs = offs;
1227 last += 1;
1228 }
1229
1230 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
1231 if (free < 0)
1232 return -EFAULT;
1233
1234 if (!xs->not_found) {
1235 size_t size = 0;
1236 if (ocfs2_xattr_is_local(xs->here))
1237 size = OCFS2_XATTR_SIZE(name_len) +
1238 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1239 else
1240 size = OCFS2_XATTR_SIZE(name_len) +
1241 OCFS2_XATTR_ROOT_SIZE;
1242 free += (size + sizeof(struct ocfs2_xattr_entry));
1243 }
1244 /* Check free space in inode or block */
1245 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1246 if (free < sizeof(struct ocfs2_xattr_entry) +
1247 OCFS2_XATTR_SIZE(name_len) +
1248 OCFS2_XATTR_ROOT_SIZE) {
1249 ret = -ENOSPC;
1250 goto out;
1251 }
1252 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1253 xi_l.value = (void *)&def_xv;
1254 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
1255 } else if (xi->value) {
1256 if (free < sizeof(struct ocfs2_xattr_entry) +
1257 OCFS2_XATTR_SIZE(name_len) +
1258 OCFS2_XATTR_SIZE(xi->value_len)) {
1259 ret = -ENOSPC;
1260 goto out;
1261 }
1262 }
1263
1264 if (!xs->not_found) {
1265 /* For existing extended attribute */
1266 size_t size = OCFS2_XATTR_SIZE(name_len) +
1267 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1268 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1269 void *val = xs->base + offs;
1270
1271 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1272 /* Replace existing local xattr with tree root */
1273 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1274 offs);
1275 if (ret < 0)
1276 mlog_errno(ret);
1277 goto out;
1278 } else if (!ocfs2_xattr_is_local(xs->here)) {
1279	/* For an existing xattr whose value is stored outside */
1280 struct ocfs2_xattr_value_root *xv = NULL;
1281 xv = (struct ocfs2_xattr_value_root *)(val +
1282 OCFS2_XATTR_SIZE(name_len));
1283
1284 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1285 /*
1286	 * If the new value also needs to be stored outside,
1287	 * first truncate the old value to the new length,
1288	 * then write the new value with set_value_outside().
1289 */
1290 ret = ocfs2_xattr_value_truncate(inode,
1291 xs->xattr_bh,
1292 xv,
1293 xi->value_len);
1294 if (ret < 0) {
1295 mlog_errno(ret);
1296 goto out;
1297 }
1298
1299 ret = __ocfs2_xattr_set_value_outside(inode,
1300 xv,
1301 xi->value,
1302 xi->value_len);
1303 if (ret < 0) {
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_xattr_update_entry(inode,
1309 xi,
1310 xs,
1311 offs);
1312 if (ret < 0)
1313 mlog_errno(ret);
1314 goto out;
1315 } else {
1316 /*
1317	 * If the new value will be stored locally,
1318	 * just truncate the old value to zero.
1319 */
1320 ret = ocfs2_xattr_value_truncate(inode,
1321 xs->xattr_bh,
1322 xv,
1323 0);
1324 if (ret < 0)
1325 mlog_errno(ret);
1326 }
1327 }
1328 }
1329
1330 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1331 OCFS2_INODE_UPDATE_CREDITS);
1332 if (IS_ERR(handle)) {
1333 ret = PTR_ERR(handle);
1334 mlog_errno(ret);
1335 goto out;
1336 }
1337
1338 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out_commit;
1343 }
1344
1345 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1346 /* set extended attribute in external block. */
1347 ret = ocfs2_extend_trans(handle,
1348 OCFS2_INODE_UPDATE_CREDITS +
1349 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1350 if (ret) {
1351 mlog_errno(ret);
1352 goto out_commit;
1353 }
1354 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1355 OCFS2_JOURNAL_ACCESS_WRITE);
1356 if (ret) {
1357 mlog_errno(ret);
1358 goto out_commit;
1359 }
1360 }
1361
1362 /*
1363	 * Set the value locally; for a large value this stores the tree
1364	 * root locally and is the first step for value size > INLINE_SIZE.
1365 */
1366 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1367
1368 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1369 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1370 if (ret < 0) {
1371 mlog_errno(ret);
1372 goto out_commit;
1373 }
1374 }
1375
1376 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
1377 (flag & OCFS2_INLINE_XATTR_FL)) {
1378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1379 unsigned int xattrsize = osb->s_xattr_inline_size;
1380
1381 /*
1382 * Adjust extent record count or inline data size
1383 * to reserve space for extended attribute.
1384 */
1385 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1386 struct ocfs2_inline_data *idata = &di->id2.i_data;
1387 le16_add_cpu(&idata->id_count, -xattrsize);
1388 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1389 struct ocfs2_extent_list *el = &di->id2.i_list;
1390 le16_add_cpu(&el->l_count, -(xattrsize /
1391 sizeof(struct ocfs2_extent_rec)));
1392 }
1393 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1394 }
1395 /* Update xattr flag */
1396 spin_lock(&oi->ip_lock);
1397 oi->ip_dyn_features |= flag;
1398 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1399 spin_unlock(&oi->ip_lock);
1400 /* Update inode ctime */
1401 inode->i_ctime = CURRENT_TIME;
1402 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1403 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1404
1405 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1406 if (ret < 0)
1407 mlog_errno(ret);
1408
1409out_commit:
1410 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1411
1412 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1413 /*
1414	 * Set the value outside, in the B-tree.
1415 * This is the second step for value size > INLINE_SIZE.
1416 */
1417 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1418 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
1419 if (ret < 0) {
1420 int ret2;
1421
1422 mlog_errno(ret);
1423 /*
1424	 * If setting the value outside failed, we have to clean
1425	 * up the junk tree root we already set locally.
1426 */
1427 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
1428 if (ret2 < 0)
1429 mlog_errno(ret2);
1430 }
1431 }
1432out:
1433 return ret;
1434
1435}
1436
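/*
 * Walk every entry in @header and truncate to zero length any value
 * stored outside (in its own ocfs2_xattr_value_root), so that the
 * clusters it occupies are released before the container goes away.
 */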
1437static int ocfs2_remove_value_outside(struct inode *inode,
1438 struct buffer_head *bh,
1439 struct ocfs2_xattr_header *header)
1440{
1441 int ret = 0, i;
1442
1443 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1444 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1445
1446 if (!ocfs2_xattr_is_local(entry)) {
1447 struct ocfs2_xattr_value_root *xv;
1448 void *val;
1449
1450 val = (void *)header +
1451 le16_to_cpu(entry->xe_name_offset);
1452 xv = (struct ocfs2_xattr_value_root *)
1453 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1454 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
1455 if (ret < 0) {
1456 mlog_errno(ret);
1457 return ret;
1458 }
1459 }
1460 }
1461
1462 return ret;
1463}
1464
1465static int ocfs2_xattr_ibody_remove(struct inode *inode,
1466 struct buffer_head *di_bh)
1467{
1468
1469 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1470 struct ocfs2_xattr_header *header;
1471 int ret;
1472
1473 header = (struct ocfs2_xattr_header *)
1474 ((void *)di + inode->i_sb->s_blocksize -
1475 le16_to_cpu(di->i_xattr_inline_size));
1476
1477 ret = ocfs2_remove_value_outside(inode, di_bh, header);
1478
1479 return ret;
1480}
1481
1482static int ocfs2_xattr_block_remove(struct inode *inode,
1483 struct buffer_head *blk_bh)
1484{
1485 struct ocfs2_xattr_block *xb;
1486 int ret = 0;
1487
1488 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1489 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1490 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1491 ret = ocfs2_remove_value_outside(inode, blk_bh, header);
1492 } else
1493 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1494
1495 return ret;
1496}
1497
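/*
 * Tear down the external xattr block at @block: truncate any values
 * stored outside of it, then give its bit back to the suballocator so
 * that the block itself is freed.
 */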
1498static int ocfs2_xattr_free_block(struct inode *inode,
1499 u64 block)
1500{
1501 struct inode *xb_alloc_inode;
1502 struct buffer_head *xb_alloc_bh = NULL;
1503 struct buffer_head *blk_bh = NULL;
1504 struct ocfs2_xattr_block *xb;
1505 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1506 handle_t *handle;
1507 int ret = 0;
1508 u64 blk, bg_blkno;
1509 u16 bit;
1510
1511 ret = ocfs2_read_block(inode, block, &blk_bh);
1512 if (ret < 0) {
1513 mlog_errno(ret);
1514 goto out;
1515 }
1516
1517	/* Verify the signature of the xattr block */
1518 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1519 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1520 ret = -EFAULT;
1521 goto out;
1522 }
1523
1524 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1525 if (ret < 0) {
1526 mlog_errno(ret);
1527 goto out;
1528 }
1529
1530 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1531 blk = le64_to_cpu(xb->xb_blkno);
1532 bit = le16_to_cpu(xb->xb_suballoc_bit);
1533 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1534
1535 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
1536 EXTENT_ALLOC_SYSTEM_INODE,
1537 le16_to_cpu(xb->xb_suballoc_slot));
1538 if (!xb_alloc_inode) {
1539 ret = -ENOMEM;
1540 mlog_errno(ret);
1541 goto out;
1542 }
1543 mutex_lock(&xb_alloc_inode->i_mutex);
1544
1545 ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
1546 if (ret < 0) {
1547 mlog_errno(ret);
1548 goto out_mutex;
1549 }
1550
1551 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
1552 if (IS_ERR(handle)) {
1553 ret = PTR_ERR(handle);
1554 mlog_errno(ret);
1555 goto out_unlock;
1556 }
1557
1558 ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
1559 bit, bg_blkno, 1);
1560 if (ret < 0)
1561 mlog_errno(ret);
1562
1563 ocfs2_commit_trans(osb, handle);
1564out_unlock:
1565 ocfs2_inode_unlock(xb_alloc_inode, 1);
1566 brelse(xb_alloc_bh);
1567out_mutex:
1568 mutex_unlock(&xb_alloc_inode->i_mutex);
1569 iput(xb_alloc_inode);
1570out:
1571 brelse(blk_bh);
1572 return ret;
1573}
1574
1575/*
1576 * ocfs2_xattr_remove()
1577 *
1578 * Free extended attribute resources associated with this inode.
1579 */
1580int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1581{
1582 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1584 handle_t *handle;
1585 int ret;
1586
1587 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1588 return 0;
1589
1590 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1591 return 0;
1592
1593 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1594 ret = ocfs2_xattr_ibody_remove(inode, di_bh);
1595 if (ret < 0) {
1596 mlog_errno(ret);
1597 goto out;
1598 }
1599 }
1600
1601 if (di->i_xattr_loc) {
1602 ret = ocfs2_xattr_free_block(inode,
1603 le64_to_cpu(di->i_xattr_loc));
1604 if (ret < 0) {
1605 mlog_errno(ret);
1606 goto out;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1611 OCFS2_INODE_UPDATE_CREDITS);
1612 if (IS_ERR(handle)) {
1613 ret = PTR_ERR(handle);
1614 mlog_errno(ret);
1615 goto out;
1616 }
1617 ret = ocfs2_journal_access(handle, inode, di_bh,
1618 OCFS2_JOURNAL_ACCESS_WRITE);
1619 if (ret) {
1620 mlog_errno(ret);
1621 goto out_commit;
1622 }
1623
1624 di->i_xattr_loc = 0;
1625
1626 spin_lock(&oi->ip_lock);
1627 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
1628 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1629 spin_unlock(&oi->ip_lock);
1630
1631 ret = ocfs2_journal_dirty(handle, di_bh);
1632 if (ret < 0)
1633 mlog_errno(ret);
1634out_commit:
1635 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1636out:
1637 return ret;
1638}
1639
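/*
 * Return 1 if s_xattr_inline_size bytes can still be carved out of the
 * inode's id2 area (inline data, fast symlink target or extent list)
 * for an inline xattr region, 0 otherwise.
 */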
1640static int ocfs2_xattr_has_space_inline(struct inode *inode,
1641 struct ocfs2_dinode *di)
1642{
1643 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1644 unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
1645 int free;
1646
1647 if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
1648 return 0;
1649
1650 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1651 struct ocfs2_inline_data *idata = &di->id2.i_data;
1652 free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
1653 } else if (ocfs2_inode_is_fast_symlink(inode)) {
1654 free = ocfs2_fast_symlink_chars(inode->i_sb) -
1655 le64_to_cpu(di->i_size);
1656 } else {
1657 struct ocfs2_extent_list *el = &di->id2.i_list;
1658 free = (le16_to_cpu(el->l_count) -
1659 le16_to_cpu(el->l_next_free_rec)) *
1660 sizeof(struct ocfs2_extent_rec);
1661 }
1662 if (free >= xattrsize)
1663 return 1;
1664
1665 return 0;
1666}
1667
1668/*
1669 * ocfs2_xattr_ibody_find()
1670 *
1671 * Find extended attribute in inode block and
1672 * fill search info into struct ocfs2_xattr_search.
1673 */
1674static int ocfs2_xattr_ibody_find(struct inode *inode,
1675 int name_index,
1676 const char *name,
1677 struct ocfs2_xattr_search *xs)
1678{
1679 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1681 int ret;
1682 int has_space = 0;
1683
1684 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1685 return 0;
1686
1687 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1688 down_read(&oi->ip_alloc_sem);
1689 has_space = ocfs2_xattr_has_space_inline(inode, di);
1690 up_read(&oi->ip_alloc_sem);
1691 if (!has_space)
1692 return 0;
1693 }
1694
1695 xs->xattr_bh = xs->inode_bh;
1696 xs->end = (void *)di + inode->i_sb->s_blocksize;
1697 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
1698 xs->header = (struct ocfs2_xattr_header *)
1699 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
1700 else
1701 xs->header = (struct ocfs2_xattr_header *)
1702 (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
1703 xs->base = (void *)xs->header;
1704 xs->here = xs->header->xh_entries;
1705
1706 /* Find the named attribute. */
1707 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1708 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1709 if (ret && ret != -ENODATA)
1710 return ret;
1711 xs->not_found = ret;
1712 }
1713
1714 return 0;
1715}
1716
1717/*
1718 * ocfs2_xattr_ibody_set()
1719 *
1720 * Set, replace or remove an extended attribute in the inode block.
1721 *
1722 */
1723static int ocfs2_xattr_ibody_set(struct inode *inode,
1724 struct ocfs2_xattr_info *xi,
1725 struct ocfs2_xattr_search *xs)
1726{
1727 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1728 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1729 int ret;
1730
1731 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1732 return -ENOSPC;
1733
1734 down_write(&oi->ip_alloc_sem);
1735 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1736 if (!ocfs2_xattr_has_space_inline(inode, di)) {
1737 ret = -ENOSPC;
1738 goto out;
1739 }
1740 }
1741
1742 ret = ocfs2_xattr_set_entry(inode, xi, xs,
1743 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1744out:
1745 up_write(&oi->ip_alloc_sem);
1746
1747 return ret;
1748}
1749
1750/*
1751 * ocfs2_xattr_block_find()
1752 *
1753 * Find extended attribute in external block and
1754 * fill search info into struct ocfs2_xattr_search.
1755 */
1756static int ocfs2_xattr_block_find(struct inode *inode,
1757 int name_index,
1758 const char *name,
1759 struct ocfs2_xattr_search *xs)
1760{
1761 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1762 struct buffer_head *blk_bh = NULL;
1763 struct ocfs2_xattr_block *xb;
1764 int ret = 0;
1765
1766 if (!di->i_xattr_loc)
1767 return ret;
1768
1769 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
1770 if (ret < 0) {
1771 mlog_errno(ret);
1772 return ret;
1773 }
1774	/* Verify the signature of the xattr block */
1775 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1776 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1777 ret = -EFAULT;
1778 goto cleanup;
1779 }
1780
1781 xs->xattr_bh = blk_bh;
1782 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1783
1784 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1785 xs->header = &xb->xb_attrs.xb_header;
1786 xs->base = (void *)xs->header;
1787 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
1788 xs->here = xs->header->xh_entries;
1789
1790 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1791 } else
1792 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
1793 name_index,
1794 name, xs);
1795
1796 if (ret && ret != -ENODATA) {
1797 xs->xattr_bh = NULL;
1798 goto cleanup;
1799 }
1800 xs->not_found = ret;
1801 return 0;
1802cleanup:
1803 brelse(blk_bh);
1804
1805 return ret;
1806}
1807
1808/*
1809 * When all the xattrs have been deleted from the index btree, the
1810 * ocfs2_xattr_tree will be erased and the ocfs2_xattr_block will have
1811 * its ocfs2_xattr_header re-initialized.
1812 */
1813static int ocfs2_restore_xattr_block(struct inode *inode,
1814 struct ocfs2_xattr_search *xs)
1815{
1816 int ret;
1817 handle_t *handle;
1818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1819 struct ocfs2_xattr_block *xb =
1820 (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1821 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
1822 u16 xb_flags = le16_to_cpu(xb->xb_flags);
1823
1824 BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
1825 le16_to_cpu(el->l_next_free_rec) != 0);
1826
1827 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1828 if (IS_ERR(handle)) {
1829 ret = PTR_ERR(handle);
1830		mlog_errno(ret);
1831 goto out;
1832 }
1833
1834 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1835 OCFS2_JOURNAL_ACCESS_WRITE);
1836 if (ret < 0) {
1837 mlog_errno(ret);
1838 goto out_commit;
1839 }
1840
1841 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
1842 offsetof(struct ocfs2_xattr_block, xb_attrs));
1843
1844 xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
1845
1846 ocfs2_journal_dirty(handle, xs->xattr_bh);
1847
1848out_commit:
1849 ocfs2_commit_trans(osb, handle);
1850out:
1851 return ret;
1852}
1853
1854/*
1855 * ocfs2_xattr_block_set()
1856 *
1857 * Set, replace or remove an extended attribute into external block.
1858 *
1859 */
1860static int ocfs2_xattr_block_set(struct inode *inode,
1861 struct ocfs2_xattr_info *xi,
1862 struct ocfs2_xattr_search *xs)
1863{
1864 struct buffer_head *new_bh = NULL;
1865 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1866 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1867 struct ocfs2_alloc_context *meta_ac = NULL;
1868 handle_t *handle = NULL;
1869 struct ocfs2_xattr_block *xblk = NULL;
1870 u16 suballoc_bit_start;
1871 u32 num_got;
1872 u64 first_blkno;
1873 int ret;
1874
1875 if (!xs->xattr_bh) {
1876 /*
1877		 * Allocate one external block for extended attributes
1878		 * outside of the inode.
1879 */
1880 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1881 if (ret < 0) {
1882 mlog_errno(ret);
1883 goto out;
1884 }
1885 handle = ocfs2_start_trans(osb,
1886 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1887 if (IS_ERR(handle)) {
1888 ret = PTR_ERR(handle);
1889 mlog_errno(ret);
1890 goto out;
1891 }
1892 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1893 OCFS2_JOURNAL_ACCESS_CREATE);
1894 if (ret < 0) {
1895 mlog_errno(ret);
1896 goto out_commit;
1897 }
1898
1899 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
1900 &suballoc_bit_start, &num_got,
1901 &first_blkno);
1902 if (ret < 0) {
1903 mlog_errno(ret);
1904 goto out_commit;
1905 }
1906
1907		new_bh = sb_getblk(inode->i_sb, first_blkno);
		/* sb_getblk() may fail; bail out like the other allocation paths */
		if (!new_bh) {
			ret = -EIO;
			mlog_errno(ret);
			goto out_commit;
		}
1908		ocfs2_set_new_buffer_uptodate(inode, new_bh);
1909
1910 ret = ocfs2_journal_access(handle, inode, new_bh,
1911 OCFS2_JOURNAL_ACCESS_CREATE);
1912 if (ret < 0) {
1913 mlog_errno(ret);
1914 goto out_commit;
1915 }
1916
1917 /* Initialize ocfs2_xattr_block */
1918 xs->xattr_bh = new_bh;
1919 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
1920 memset(xblk, 0, inode->i_sb->s_blocksize);
1921 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
1922 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
1923 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1924 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
1925 xblk->xb_blkno = cpu_to_le64(first_blkno);
1926
1927 xs->header = &xblk->xb_attrs.xb_header;
1928 xs->base = (void *)xs->header;
1929 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1930 xs->here = xs->header->xh_entries;
1931
1932
1933 ret = ocfs2_journal_dirty(handle, new_bh);
1934 if (ret < 0) {
1935 mlog_errno(ret);
1936 goto out_commit;
1937 }
1938 di->i_xattr_loc = cpu_to_le64(first_blkno);
1939 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1940 if (ret < 0)
1941 mlog_errno(ret);
1942out_commit:
1943 ocfs2_commit_trans(osb, handle);
1944out:
1945 if (meta_ac)
1946 ocfs2_free_alloc_context(meta_ac);
1947 if (ret < 0)
1948 return ret;
1949 } else
1950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1951
1952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1953 /* Set extended attribute into external block */
1954 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
1955 if (!ret || ret != -ENOSPC)
1956 goto end;
1957
1958 ret = ocfs2_xattr_create_index_block(inode, xs);
1959 if (ret)
1960 goto end;
1961 }
1962
1963 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
1964 if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
1965 ret = ocfs2_restore_xattr_block(inode, xs);
1966
1967end:
1968
1969 return ret;
1970}
1971
1972/*
1973 * ocfs2_xattr_set()
1974 *
1975 * Set, replace or remove an extended attribute for this inode.
1976 * Pass a NULL value to remove an existing extended attribute;
1977 * otherwise the attribute is created or replaced.
1978 */
1979int ocfs2_xattr_set(struct inode *inode,
1980 int name_index,
1981 const char *name,
1982 const void *value,
1983 size_t value_len,
1984 int flags)
1985{
1986 struct buffer_head *di_bh = NULL;
1987 struct ocfs2_dinode *di;
1988 int ret;
1989 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
1990
1991 struct ocfs2_xattr_info xi = {
1992 .name_index = name_index,
1993 .name = name,
1994 .value = value,
1995 .value_len = value_len,
1996 };
1997
1998 struct ocfs2_xattr_search xis = {
1999 .not_found = -ENODATA,
2000 };
2001
2002 struct ocfs2_xattr_search xbs = {
2003 .not_found = -ENODATA,
2004 };
2005
2006 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2007 return -EOPNOTSUPP;
2008
2009 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2010 if (ret < 0) {
2011 mlog_errno(ret);
2012 return ret;
2013 }
2014 xis.inode_bh = xbs.inode_bh = di_bh;
2015 di = (struct ocfs2_dinode *)di_bh->b_data;
2016
2017 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2018 /*
2019	 * Scan the inode and the external block for an extended attribute
2020	 * with the same name, and collect the search information.
2021 */
2022 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2023 if (ret)
2024 goto cleanup;
2025 if (xis.not_found) {
2026 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2027 if (ret)
2028 goto cleanup;
2029 }
2030
2031 if (xis.not_found && xbs.not_found) {
2032 ret = -ENODATA;
2033 if (flags & XATTR_REPLACE)
2034 goto cleanup;
2035 ret = 0;
2036 if (!value)
2037 goto cleanup;
2038 } else {
2039 ret = -EEXIST;
2040 if (flags & XATTR_CREATE)
2041 goto cleanup;
2042 }
2043
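	/*
	 * Removal is issued against wherever the attribute currently lives.
	 * For a set, the inode body is always tried first; on -ENOSPC we
	 * fall back to the external block, and whichever old copy becomes
	 * redundant is then removed by re-issuing the set with a NULL value.
	 */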
2044 if (!value) {
2045 /* Remove existing extended attribute */
2046 if (!xis.not_found)
2047 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2048 else if (!xbs.not_found)
2049 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2050 } else {
2051		/* We always try to set the extended attribute in the inode first */
2052 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2053 if (!ret && !xbs.not_found) {
2054 /*
2055			 * If that succeeded and the extended attribute also
2056			 * exists in the external block, remove it from there.
2057 */
2058 xi.value = NULL;
2059 xi.value_len = 0;
2060 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2061 } else if (ret == -ENOSPC) {
2062 if (di->i_xattr_loc && !xbs.xattr_bh) {
2063 ret = ocfs2_xattr_block_find(inode, name_index,
2064 name, &xbs);
2065 if (ret)
2066 goto cleanup;
2067 }
2068 /*
2069			 * If there is no space in the inode, we set the
2070			 * extended attribute in the external block.
2071 */
2072 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2073 if (ret)
2074 goto cleanup;
2075 if (!xis.not_found) {
2076 /*
2077				 * If that succeeded and the extended attribute
2078				 * also exists in the inode, remove it from there.
2079 */
2080 xi.value = NULL;
2081 xi.value_len = 0;
2082 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2083 }
2084 }
2085 }
2086cleanup:
2087 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2088 ocfs2_inode_unlock(inode, 1);
2089 brelse(di_bh);
2090 brelse(xbs.xattr_bh);
2091 for (i = 0; i < blk_per_bucket; i++)
2092 brelse(xbs.bucket.bhs[i]);
2093
2094 return ret;
2095}
2096
2097/*
2098 * Find the xattr extent rec which may contain name_hash.
2099 * e_cpos will be the first name hash of the xattr rec.
2100 * el must be the ocfs2_xattr_block.xb_attrs.xb_root.xt_list.
2101 */
2102static int ocfs2_xattr_get_rec(struct inode *inode,
2103 u32 name_hash,
2104 u64 *p_blkno,
2105 u32 *e_cpos,
2106 u32 *num_clusters,
2107 struct ocfs2_extent_list *el)
2108{
2109 int ret = 0, i;
2110 struct buffer_head *eb_bh = NULL;
2111 struct ocfs2_extent_block *eb;
2112 struct ocfs2_extent_rec *rec = NULL;
2113 u64 e_blkno = 0;
2114
2115 if (el->l_tree_depth) {
2116 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
2117 if (ret) {
2118 mlog_errno(ret);
2119 goto out;
2120 }
2121
2122 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2123 el = &eb->h_list;
2124
2125 if (el->l_tree_depth) {
2126 ocfs2_error(inode->i_sb,
2127 "Inode %lu has non zero tree depth in "
2128 "xattr tree block %llu\n", inode->i_ino,
2129 (unsigned long long)eb_bh->b_blocknr);
2130 ret = -EROFS;
2131 goto out;
2132 }
2133 }
2134
2135 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
2136 rec = &el->l_recs[i];
2137
2138 if (le32_to_cpu(rec->e_cpos) <= name_hash) {
2139 e_blkno = le64_to_cpu(rec->e_blkno);
2140 break;
2141 }
2142 }
2143
2144 if (!e_blkno) {
2145 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
2146 "record (%u, %u, 0) in xattr", inode->i_ino,
2147 le32_to_cpu(rec->e_cpos),
2148 ocfs2_rec_clusters(el, rec));
2149 ret = -EROFS;
2150 goto out;
2151 }
2152
2153 *p_blkno = le64_to_cpu(rec->e_blkno);
2154 *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
2155 if (e_cpos)
2156 *e_cpos = le32_to_cpu(rec->e_cpos);
2157out:
2158 brelse(eb_bh);
2159 return ret;
2160}
2161
2162typedef int (xattr_bucket_func)(struct inode *inode,
2163 struct ocfs2_xattr_bucket *bucket,
2164 void *para);
2165
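/*
 * Scan one bucket for the entry matching (name_index, name). Entries
 * are sorted by xe_name_hash, so we walk forward until the hashes
 * match and then compare the type, the name length and finally the
 * name bytes themselves, read from the block the name lives in.
 */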
2166static int ocfs2_find_xe_in_bucket(struct inode *inode,
2167 struct buffer_head *header_bh,
2168 int name_index,
2169 const char *name,
2170 u32 name_hash,
2171 u16 *xe_index,
2172 int *found)
2173{
2174 int i, ret = 0, cmp = 1, block_off, new_offset;
2175 struct ocfs2_xattr_header *xh =
2176 (struct ocfs2_xattr_header *)header_bh->b_data;
2177 size_t name_len = strlen(name);
2178 struct ocfs2_xattr_entry *xe = NULL;
2179 struct buffer_head *name_bh = NULL;
2180 char *xe_name;
2181
2182 /*
2183 * We don't use binary search in the bucket because there
2184 * may be multiple entries with the same name hash.
2185 */
2186 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
2187 xe = &xh->xh_entries[i];
2188
2189 if (name_hash > le32_to_cpu(xe->xe_name_hash))
2190 continue;
2191 else if (name_hash < le32_to_cpu(xe->xe_name_hash))
2192 break;
2193
2194 cmp = name_index - ocfs2_xattr_get_type(xe);
2195 if (!cmp)
2196 cmp = name_len - xe->xe_name_len;
2197 if (cmp)
2198 continue;
2199
2200 ret = ocfs2_xattr_bucket_get_name_value(inode,
2201 xh,
2202 i,
2203 &block_off,
2204 &new_offset);
2205 if (ret) {
2206 mlog_errno(ret);
2207 break;
2208 }
2209
2210 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2211 &name_bh);
2212 if (ret) {
2213 mlog_errno(ret);
2214 break;
2215 }
2216 xe_name = name_bh->b_data + new_offset;
2217
2218 cmp = memcmp(name, xe_name, name_len);
2219 brelse(name_bh);
2220 name_bh = NULL;
2221
2222 if (cmp == 0) {
2223 *xe_index = i;
2224 *found = 1;
2225 ret = 0;
2226 break;
2227 }
2228 }
2229
2230 return ret;
2231}
2232
2233/*
2234 * Find the specified xattr entry in a series of buckets.
2235 * The series starts at p_blkno and lasts for num_clusters.
2236 * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
2237 * the number of valid buckets.
2238 *
2239 * Return the buffer_head this xattr should reside in. If the xattr's
2240 * hash falls in the gap between two buckets, return the lower bucket.
2241 */
2242static int ocfs2_xattr_bucket_find(struct inode *inode,
2243 int name_index,
2244 const char *name,
2245 u32 name_hash,
2246 u64 p_blkno,
2247 u32 first_hash,
2248 u32 num_clusters,
2249 struct ocfs2_xattr_search *xs)
2250{
2251 int ret, found = 0;
2252 struct buffer_head *bh = NULL;
2253 struct buffer_head *lower_bh = NULL;
2254 struct ocfs2_xattr_header *xh = NULL;
2255 struct ocfs2_xattr_entry *xe = NULL;
2256 u16 index = 0;
2257 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2258 int low_bucket = 0, bucket, high_bucket;
2259 u32 last_hash;
2260 u64 blkno;
2261
2262 ret = ocfs2_read_block(inode, p_blkno, &bh);
2263 if (ret) {
2264 mlog_errno(ret);
2265 goto out;
2266 }
2267
2268 xh = (struct ocfs2_xattr_header *)bh->b_data;
2269 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2270
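	/*
	 * Binary search over the buckets, keyed by each bucket's first
	 * name_hash. lower_bh tracks the last bucket whose first hash is
	 * <= the one we want, so it doubles as the insertion point when
	 * no exact bucket is found.
	 */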
2271 while (low_bucket <= high_bucket) {
2272 brelse(bh);
2273 bh = NULL;
2274 bucket = (low_bucket + high_bucket) / 2;
2275
2276 blkno = p_blkno + bucket * blk_per_bucket;
2277
2278 ret = ocfs2_read_block(inode, blkno, &bh);
2279 if (ret) {
2280 mlog_errno(ret);
2281 goto out;
2282 }
2283
2284 xh = (struct ocfs2_xattr_header *)bh->b_data;
2285 xe = &xh->xh_entries[0];
2286 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2287 high_bucket = bucket - 1;
2288 continue;
2289 }
2290
2291 /*
2292		 * Check whether the hash of the last entry in our
2293		 * bucket is larger than the one we are searching for.
2294		 * For an empty bucket, the last one is also the first one.
2295 */
2296 if (xh->xh_count)
2297 xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
2298
2299 last_hash = le32_to_cpu(xe->xe_name_hash);
2300
2301		/* Record lower_bh, which may be the insertion place. */
2302 brelse(lower_bh);
2303 lower_bh = bh;
2304 bh = NULL;
2305
2306 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2307 low_bucket = bucket + 1;
2308 continue;
2309 }
2310
2311		/* The xattr we search for should reside in this bucket, if it exists. */
2312 ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
2313 name_index, name, name_hash,
2314 &index, &found);
2315 if (ret) {
2316 mlog_errno(ret);
2317 goto out;
2318 }
2319 break;
2320 }
2321
2322 /*
2323 * Record the bucket we have found.
2324 * When the xattr's hash value is in the gap of 2 buckets, we will
2325 * always set it to the previous bucket.
2326 */
2327 if (!lower_bh) {
2328 /*
2329		 * We can't find any bucket whose first name_hash is less
2330		 * than the name_hash we are searching for.
2331 */
2332 BUG_ON(bh->b_blocknr != p_blkno);
2333 lower_bh = bh;
2334 bh = NULL;
2335 }
2336 xs->bucket.bhs[0] = lower_bh;
2337 xs->bucket.xh = (struct ocfs2_xattr_header *)
2338 xs->bucket.bhs[0]->b_data;
2339 lower_bh = NULL;
2340
2341 xs->header = xs->bucket.xh;
2342 xs->base = xs->bucket.bhs[0]->b_data;
2343 xs->end = xs->base + inode->i_sb->s_blocksize;
2344
2345 if (found) {
2346 /*
2347		 * If we have found the xattr entry, read all the blocks in
2348		 * this bucket.
2349 */
2350 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2351 blk_per_bucket - 1, &xs->bucket.bhs[1],
2352 OCFS2_BH_CACHED);
2353 if (ret) {
2354 mlog_errno(ret);
2355 goto out;
2356 }
2357
2358 xs->here = &xs->header->xh_entries[index];
2359 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2360 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
2361 } else
2362 ret = -ENODATA;
2363
2364out:
2365 brelse(bh);
2366 brelse(lower_bh);
2367 return ret;
2368}
2369
2370static int ocfs2_xattr_index_block_find(struct inode *inode,
2371 struct buffer_head *root_bh,
2372 int name_index,
2373 const char *name,
2374 struct ocfs2_xattr_search *xs)
2375{
2376 int ret;
2377 struct ocfs2_xattr_block *xb =
2378 (struct ocfs2_xattr_block *)root_bh->b_data;
2379 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
2380 struct ocfs2_extent_list *el = &xb_root->xt_list;
2381 u64 p_blkno = 0;
2382 u32 first_hash, num_clusters = 0;
2383 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
2384
2385 if (le16_to_cpu(el->l_next_free_rec) == 0)
2386 return -ENODATA;
2387
2388 mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
2389 name, name_hash, name_index);
2390
2391 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
2392 &num_clusters, el);
2393 if (ret) {
2394 mlog_errno(ret);
2395 goto out;
2396 }
2397
2398 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
2399
2400 mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
2401 "in the rec is %u\n", num_clusters, p_blkno, first_hash);
2402
2403 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
2404 p_blkno, first_hash, num_clusters, xs);
2405
2406out:
2407 return ret;
2408}
2409
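/*
 * Walk every bucket in the extent starting at @blkno, calling @func on
 * each one. Only the first bucket's header knows the true bucket
 * count, so the loop bound is corrected after bucket 0 is read.
 */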
2410static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2411 u64 blkno,
2412 u32 clusters,
2413 xattr_bucket_func *func,
2414 void *para)
2415{
2416 int i, j, ret = 0;
2417 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2418 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2419 u32 num_buckets = clusters * bpc;
2420 struct ocfs2_xattr_bucket bucket;
2421
2422 memset(&bucket, 0, sizeof(bucket));
2423
2424 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2425 clusters, blkno);
2426
2427 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
2428 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
2429 bucket.bhs, OCFS2_BH_CACHED);
2430 if (ret) {
2431 mlog_errno(ret);
2432 goto out;
2433 }
2434
2435 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2436 /*
2437		 * The real number of buckets in this series of blocks is
2438		 * stored in the 1st bucket.
2439 */
2440 if (i == 0)
2441 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
2442
2443 mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
2444 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
2445 if (func) {
2446 ret = func(inode, &bucket, para);
2447 if (ret) {
2448 mlog_errno(ret);
2449 break;
2450 }
2451 }
2452
2453 for (j = 0; j < blk_per_bucket; j++)
2454 brelse(bucket.bhs[j]);
2455 memset(&bucket, 0, sizeof(bucket));
2456 }
2457
2458out:
2459 for (j = 0; j < blk_per_bucket; j++)
2460 brelse(bucket.bhs[j]);
2461
2462 return ret;
2463}
2464
2465struct ocfs2_xattr_tree_list {
2466 char *buffer;
2467 size_t buffer_size;
2468 size_t result;
2469};
2470
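/*
 * Translate an entry's xe_name_offset, which is relative to the start
 * of the bucket, into a block index within the bucket plus the offset
 * inside that block.
 */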
2471static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
2472 struct ocfs2_xattr_header *xh,
2473 int index,
2474 int *block_off,
2475 int *new_offset)
2476{
2477 u16 name_offset;
2478
2479 if (index < 0 || index >= le16_to_cpu(xh->xh_count))
2480 return -EINVAL;
2481
2482 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
2483
2484 *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
2485 *new_offset = name_offset % inode->i_sb->s_blocksize;
2486
2487 return 0;
2488}
2489
2490static int ocfs2_list_xattr_bucket(struct inode *inode,
2491 struct ocfs2_xattr_bucket *bucket,
2492 void *para)
2493{
2494 int ret = 0, type;
2495 struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
2496 int i, block_off, new_offset;
2497 const char *prefix, *name;
2498
2499 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
2500 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
2501 type = ocfs2_xattr_get_type(entry);
2502 prefix = ocfs2_xattr_prefix(type);
2503
2504 if (prefix) {
2505 ret = ocfs2_xattr_bucket_get_name_value(inode,
2506 bucket->xh,
2507 i,
2508 &block_off,
2509 &new_offset);
2510 if (ret)
2511 break;
2512
2513 name = (const char *)bucket->bhs[block_off]->b_data +
2514 new_offset;
2515 ret = ocfs2_xattr_list_entry(xl->buffer,
2516 xl->buffer_size,
2517 &xl->result,
2518 prefix, name,
2519 entry->xe_name_len);
2520 if (ret)
2521 break;
2522 }
2523 }
2524
2525 return ret;
2526}
2527
2528static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
2529 struct ocfs2_xattr_tree_root *xt,
2530 char *buffer,
2531 size_t buffer_size)
2532{
2533 struct ocfs2_extent_list *el = &xt->xt_list;
2534 int ret = 0;
2535 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
2536 u64 p_blkno = 0;
2537 struct ocfs2_xattr_tree_list xl = {
2538 .buffer = buffer,
2539 .buffer_size = buffer_size,
2540 .result = 0,
2541 };
2542
2543 if (le16_to_cpu(el->l_next_free_rec) == 0)
2544 return 0;
2545
2546 while (name_hash > 0) {
2547 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
2548 &e_cpos, &num_clusters, el);
2549 if (ret) {
2550 mlog_errno(ret);
2551 goto out;
2552 }
2553
2554 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
2555 ocfs2_list_xattr_bucket,
2556 &xl);
2557 if (ret) {
2558 mlog_errno(ret);
2559 goto out;
2560 }
2561
2562 if (e_cpos == 0)
2563 break;
2564
2565 name_hash = e_cpos - 1;
2566 }
2567
2568 ret = xl.result;
2569out:
2570 return ret;
2571}
2572
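/*
 * sort() callbacks: order xattr entries by ascending name hash.
 * swap_xe() copies whole entries, since they are fixed-size structs.
 */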
2573static int cmp_xe(const void *a, const void *b)
2574{
2575 const struct ocfs2_xattr_entry *l = a, *r = b;
2576 u32 l_hash = le32_to_cpu(l->xe_name_hash);
2577 u32 r_hash = le32_to_cpu(r->xe_name_hash);
2578
2579 if (l_hash > r_hash)
2580 return 1;
2581 if (l_hash < r_hash)
2582 return -1;
2583 return 0;
2584}
2585
2586static void swap_xe(void *a, void *b, int size)
2587{
2588 struct ocfs2_xattr_entry *l = a, *r = b, tmp;
2589
2590 tmp = *l;
2591 memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
2592 memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
2593}
2594
2595/*
2596 * When the ocfs2_xattr_block is filled up, a new bucket will be created
2597 * and all the xattr entries will be moved to the new bucket.
2598 * Note: we need to sort the entries since they are not saved in order
2599 * in the ocfs2_xattr_block.
2600 */
2601static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2602 struct buffer_head *xb_bh,
2603 struct buffer_head *xh_bh,
2604 struct buffer_head *data_bh)
2605{
2606 int i, blocksize = inode->i_sb->s_blocksize;
2607 u16 offset, size, off_change;
2608 struct ocfs2_xattr_entry *xe;
2609 struct ocfs2_xattr_block *xb =
2610 (struct ocfs2_xattr_block *)xb_bh->b_data;
2611 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2612 struct ocfs2_xattr_header *xh =
2613 (struct ocfs2_xattr_header *)xh_bh->b_data;
2614 u16 count = le16_to_cpu(xb_xh->xh_count);
2615 char *target = xh_bh->b_data, *src = xb_bh->b_data;
2616
2617 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2618 (unsigned long long)xb_bh->b_blocknr,
2619 (unsigned long long)xh_bh->b_blocknr);
2620
2621 memset(xh_bh->b_data, 0, blocksize);
2622 if (data_bh)
2623 memset(data_bh->b_data, 0, blocksize);
2624 /*
2625 * Since the xe_name_offset is based on ocfs2_xattr_header,
2626	 * there is an offset change corresponding to the change of
2627 * ocfs2_xattr_header's position.
2628 */
2629 off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2630 xe = &xb_xh->xh_entries[count - 1];
2631 offset = le16_to_cpu(xe->xe_name_offset) + off_change;
2632 size = blocksize - offset;
2633
2634 /* copy all the names and values. */
2635 if (data_bh)
2636 target = data_bh->b_data;
2637 memcpy(target + offset, src + offset, size);
2638
2639 /* Init new header now. */
2640 xh->xh_count = xb_xh->xh_count;
2641 xh->xh_num_buckets = cpu_to_le16(1);
2642 xh->xh_name_value_len = cpu_to_le16(size);
2643 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2644
2645 /* copy all the entries. */
2646 target = xh_bh->b_data;
2647 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2648 size = count * sizeof(struct ocfs2_xattr_entry);
2649 memcpy(target + offset, (char *)xb_xh + offset, size);
2650
2651 /* Change the xe offset for all the xe because of the move. */
2652 off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
2653 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2654 for (i = 0; i < count; i++)
2655 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
2656
2657 mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
2658 offset, size, off_change);
2659
2660 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
2661 cmp_xe, swap_xe);
2662}
2663
2664/*
2665 * After we move xattrs from the block to the index btree, we have to
2666 * update ocfs2_xattr_search to the new xe and base.
2667 *
2668 * When the entry is in the xattr block, xattr_bh indicates the storage
2669 * place; when the entry is in the index b-tree, "bucket" indicates the
2670 * real place of the xattr.
2671 */
2672static int ocfs2_xattr_update_xattr_search(struct inode *inode,
2673 struct ocfs2_xattr_search *xs,
2674 struct buffer_head *old_bh,
2675 struct buffer_head *new_bh)
2676{
2677 int ret = 0;
2678 char *buf = old_bh->b_data;
2679 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2680 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2681 int i, blocksize = inode->i_sb->s_blocksize;
2682 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2683
2684 xs->bucket.bhs[0] = new_bh;
2685 get_bh(new_bh);
2686 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2687 xs->header = xs->bucket.xh;
2688
2689 xs->base = new_bh->b_data;
2690 xs->end = xs->base + inode->i_sb->s_blocksize;
2691
2692 if (!xs->not_found) {
2693 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
2694 ret = ocfs2_read_blocks(inode,
2695 xs->bucket.bhs[0]->b_blocknr + 1,
2696 blk_per_bucket - 1, &xs->bucket.bhs[1],
2697 OCFS2_BH_CACHED);
2698 if (ret) {
2699 mlog_errno(ret);
2700 return ret;
2701 }
2702
2703 i = xs->here - old_xh->xh_entries;
2704 xs->here = &xs->header->xh_entries[i];
2705 }
2706 }
2707
2708 return ret;
2709}
2710
2711static int ocfs2_xattr_create_index_block(struct inode *inode,
2712 struct ocfs2_xattr_search *xs)
2713{
2714 int ret, credits = OCFS2_SUBALLOC_ALLOC;
2715 u32 bit_off, len;
2716 u64 blkno;
2717 handle_t *handle;
2718 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2720 struct ocfs2_alloc_context *data_ac;
2721 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2722 struct buffer_head *xb_bh = xs->xattr_bh;
2723 struct ocfs2_xattr_block *xb =
2724 (struct ocfs2_xattr_block *)xb_bh->b_data;
2725 struct ocfs2_xattr_tree_root *xr;
2726 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2727 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2728
2729 mlog(0, "create xattr index block for %llu\n",
2730 (unsigned long long)xb_bh->b_blocknr);
2731
2732 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2733
2734 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /*
2741 * XXX:
2742 * We can use this lock for now, and maybe move to a dedicated mutex
2743 * if performance becomes a problem later.
2744 */
2745 down_write(&oi->ip_alloc_sem);
2746
2747 /*
2748 * 3 more credits, one for xattr block update, one for the 1st block
2749 * of the new xattr bucket and one for the value/data.
2750 */
2751 credits += 3;
2752 handle = ocfs2_start_trans(osb, credits);
2753 if (IS_ERR(handle)) {
2754 ret = PTR_ERR(handle);
2755 mlog_errno(ret);
2756 goto out_sem;
2757 }
2758
2759 ret = ocfs2_journal_access(handle, inode, xb_bh,
2760 OCFS2_JOURNAL_ACCESS_WRITE);
2761 if (ret) {
2762 mlog_errno(ret);
2763 goto out_commit;
2764 }
2765
2766 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
2767 if (ret) {
2768 mlog_errno(ret);
2769 goto out_commit;
2770 }
2771
2772 /*
2773	 * The bucket may span many blocks, and we will only touch
2774	 * the 1st block and the last block of the whole bucket
2775	 * (one for the entries and one for the name/value data).
2776 */
2777 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
2778
2779 mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
2780
2781 xh_bh = sb_getblk(inode->i_sb, blkno);
2782 if (!xh_bh) {
2783 ret = -EIO;
2784 mlog_errno(ret);
2785 goto out_commit;
2786 }
2787
2788 ocfs2_set_new_buffer_uptodate(inode, xh_bh);
2789
2790 ret = ocfs2_journal_access(handle, inode, xh_bh,
2791 OCFS2_JOURNAL_ACCESS_CREATE);
2792 if (ret) {
2793 mlog_errno(ret);
2794 goto out_commit;
2795 }
2796
2797 if (bpb > 1) {
2798 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2799 if (!data_bh) {
2800 ret = -EIO;
2801 mlog_errno(ret);
2802 goto out_commit;
2803 }
2804
2805 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2806
2807 ret = ocfs2_journal_access(handle, inode, data_bh,
2808 OCFS2_JOURNAL_ACCESS_CREATE);
2809 if (ret) {
2810 mlog_errno(ret);
2811 goto out_commit;
2812 }
2813 }
2814
2815 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
2816
2817 ocfs2_journal_dirty(handle, xh_bh);
2818 if (data_bh)
2819 ocfs2_journal_dirty(handle, data_bh);
2820
2821 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2822
2823 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2824 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
2825 offsetof(struct ocfs2_xattr_block, xb_attrs));
2826
2827 xr = &xb->xb_attrs.xb_root;
2828 xr->xt_clusters = cpu_to_le32(1);
2829 xr->xt_last_eb_blk = 0;
2830 xr->xt_list.l_tree_depth = 0;
2831 xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
2832 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2833
2834 xr->xt_list.l_recs[0].e_cpos = 0;
2835 xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
2836 xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
2837
2838 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2839
2840 ret = ocfs2_journal_dirty(handle, xb_bh);
2841 if (ret) {
2842 mlog_errno(ret);
2843 goto out_commit;
2844 }
2845
2846out_commit:
2847 ocfs2_commit_trans(osb, handle);
2848
2849out_sem:
2850 up_write(&oi->ip_alloc_sem);
2851
2852out:
2853 if (data_ac)
2854 ocfs2_free_alloc_context(data_ac);
2855
2856 brelse(xh_bh);
2857 brelse(data_bh);
2858
2859 return ret;
2860}
2861
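/*
 * sort() callback ordering entries by descending name offset, so that
 * during defragmentation the pair closest to the end of the bucket is
 * moved first.
 */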
2862static int cmp_xe_offset(const void *a, const void *b)
2863{
2864 const struct ocfs2_xattr_entry *l = a, *r = b;
2865 u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
2866 u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
2867
2868 if (l_name_offset < r_name_offset)
2869 return 1;
2870 if (l_name_offset > r_name_offset)
2871 return -1;
2872 return 0;
2873}
2874
2875/*
2876 * Defragment an xattr bucket if we find that the bucket has some
2877 * holes between name/value pairs.
2878 * We will move all the name/value pairs to the end of the bucket
2879 * so that we can free up some space for insertion.
2880 */
2881static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2882 struct ocfs2_xattr_bucket *bucket)
2883{
2884 int ret, i;
2885 size_t end, offset, len, value_len;
2886 struct ocfs2_xattr_header *xh;
2887 char *entries, *buf, *bucket_buf = NULL;
2888 u64 blkno = bucket->bhs[0]->b_blocknr;
2889 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2890 u16 xh_free_start;
2891 size_t blocksize = inode->i_sb->s_blocksize;
2892 handle_t *handle;
2893 struct buffer_head **bhs;
2894 struct ocfs2_xattr_entry *xe;
2895
2896 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2897 GFP_NOFS);
2898 if (!bhs)
2899 return -ENOMEM;
2900
2901 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
2902 OCFS2_BH_CACHED);
2903 if (ret)
2904 goto out;
2905
2906 /*
2907	 * In order to make the operation more efficient and generic,
2908	 * we copy all the blocks into one contiguous buffer and do the
2909	 * defragmentation there, so that if anything goes wrong we will
2910	 * not have touched the real blocks.
2911 */
2912 bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
2913 if (!bucket_buf) {
2914		ret = -ENOMEM;
2915 goto out;
2916 }
2917
2918 buf = bucket_buf;
2919 for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
2920 memcpy(buf, bhs[i]->b_data, blocksize);
2921
2922 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
2923 if (IS_ERR(handle)) {
2924 ret = PTR_ERR(handle);
2925 handle = NULL;
2926 mlog_errno(ret);
2927 goto out;
2928 }
2929
2930 for (i = 0; i < blk_per_bucket; i++) {
2931 ret = ocfs2_journal_access(handle, inode, bhs[i],
2932 OCFS2_JOURNAL_ACCESS_WRITE);
2933 if (ret < 0) {
2934 mlog_errno(ret);
2935 goto commit;
2936 }
2937 }
2938
2939 xh = (struct ocfs2_xattr_header *)bucket_buf;
2940 entries = (char *)xh->xh_entries;
2941 xh_free_start = le16_to_cpu(xh->xh_free_start);
2942
2943 mlog(0, "adjust xattr bucket in %llu, count = %u, "
2944 "xh_free_start = %u, xh_name_value_len = %u.\n",
2945 blkno, le16_to_cpu(xh->xh_count), xh_free_start,
2946 le16_to_cpu(xh->xh_name_value_len));
2947
2948 /*
2949	 * Sort all the entries by their offset: the largest offset
2950	 * comes first, so that we can move the pairs to the end
2951	 * one by one.
2952 */
2953 sort(entries, le16_to_cpu(xh->xh_count),
2954 sizeof(struct ocfs2_xattr_entry),
2955 cmp_xe_offset, swap_xe);
2956
2957 /* Move all name/values to the end of the bucket. */
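	/*
	 * E.g. (illustrative): pairs at scattered offsets with holes
	 * between them end up packed back to back against
	 * OCFS2_XATTR_BUCKET_SIZE, leaving one contiguous free region
	 * starting at the new xh_free_start.
	 */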
2958 xe = xh->xh_entries;
2959 end = OCFS2_XATTR_BUCKET_SIZE;
2960 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
2961 offset = le16_to_cpu(xe->xe_name_offset);
2962 if (ocfs2_xattr_is_local(xe))
2963 value_len = OCFS2_XATTR_SIZE(
2964 le64_to_cpu(xe->xe_value_size));
2965 else
2966 value_len = OCFS2_XATTR_ROOT_SIZE;
2967 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
2968
2969 /*
2970		 * We must make sure that a name/value pair stays
2971		 * within a single block, so adjust end down to the
2972		 * previous block boundary if needed.
2973 */
2974 if (((end - len) / blocksize !=
2975 (end - 1) / blocksize))
2976 end = end - end % blocksize;
2977
2978 if (end > offset + len) {
2979 memmove(bucket_buf + end - len,
2980 bucket_buf + offset, len);
2981 xe->xe_name_offset = cpu_to_le16(end - len);
2982 }
2983
2984 mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
2985 "bucket %llu\n", (unsigned long long)blkno);
2986
2987 end -= len;
2988 }
2989
2990 mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
2991 "bucket %llu\n", (unsigned long long)blkno);
2992
2993 if (xh_free_start == end)
2994 goto commit;
2995
2996 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2997 xh->xh_free_start = cpu_to_le16(end);
2998
2999	/* Sort the entries back by their name_hash. */
3000 sort(entries, le16_to_cpu(xh->xh_count),
3001 sizeof(struct ocfs2_xattr_entry),
3002 cmp_xe, swap_xe);
3003
3004 buf = bucket_buf;
3005 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
3006 memcpy(bhs[i]->b_data, buf, blocksize);
3007 ocfs2_journal_dirty(handle, bhs[i]);
3008 }
3009
3010commit:
3011 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3012out:
3013
3014 if (bhs) {
3015 for (i = 0; i < blk_per_bucket; i++)
3016 brelse(bhs[i]);
3017 }
3018 kfree(bhs);
3019
3020 kfree(bucket_buf);
3021 return ret;
3022}
3023
3024/*
3025 * Move half of the xattr buckets in the previous cluster to this new
3026 * cluster. We only touch the last cluster of the previous extent record.
3027 *
3028 * first_bh is the first buffer_head of a series of buckets in the same
3029 * extent rec and header_bh is the header of one bucket in this cluster.
3030 * They will be updated if we move the data header_bh contains to the new
3031 * cluster. first_hash will be set to the 1st xe's name_hash of the new cluster.
3032 */
3033static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
3034 handle_t *handle,
3035 struct buffer_head **first_bh,
3036 struct buffer_head **header_bh,
3037 u64 new_blkno,
3038 u64 prev_blkno,
3039 u32 num_clusters,
3040 u32 *first_hash)
3041{
3042 int i, ret, credits;
3043 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3044 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3045 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3046 int blocksize = inode->i_sb->s_blocksize;
3047 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
3048 struct ocfs2_xattr_header *new_xh;
3049 struct ocfs2_xattr_header *xh =
3050 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3051
3052 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3053 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3054
3055 prev_bh = *first_bh;
3056 get_bh(prev_bh);
3057 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3058
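	/* Step to the midpoint of the last cluster of the old extent. */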
3059 prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
3060
3061 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3062 prev_blkno, new_blkno);
3063
3064 /*
3065 * We need to update the 1st half of the new cluster and
3066 * 1 more for the update of the 1st bucket of the previous
3067 * extent record.
3068 */
3069 credits = bpc / 2 + 1;
3070 ret = ocfs2_extend_trans(handle, credits);
3071 if (ret) {
3072 mlog_errno(ret);
3073 goto out;
3074 }
3075
3076 ret = ocfs2_journal_access(handle, inode, prev_bh,
3077 OCFS2_JOURNAL_ACCESS_WRITE);
3078 if (ret) {
3079 mlog_errno(ret);
3080 goto out;
3081 }
3082
3083 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
3084 old_bh = new_bh = NULL;
3085 new_bh = sb_getblk(inode->i_sb, new_blkno);
3086 if (!new_bh) {
3087 ret = -EIO;
3088 mlog_errno(ret);
3089 goto out;
3090 }
3091
3092 ocfs2_set_new_buffer_uptodate(inode, new_bh);
3093
3094 ret = ocfs2_journal_access(handle, inode, new_bh,
3095 OCFS2_JOURNAL_ACCESS_CREATE);
3096 if (ret < 0) {
3097 mlog_errno(ret);
3098 brelse(new_bh);
3099 goto out;
3100 }
3101
3102 ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
3103 if (ret < 0) {
3104 mlog_errno(ret);
3105 brelse(new_bh);
3106 goto out;
3107 }
3108
3109 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3110
3111 if (i == 0) {
3112 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3113 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3114
3115 if (first_hash)
3116 *first_hash = le32_to_cpu(
3117 new_xh->xh_entries[0].xe_name_hash);
3118 new_first_bh = new_bh;
3119 get_bh(new_first_bh);
3120 }
3121
3122 ocfs2_journal_dirty(handle, new_bh);
3123
3124 if (*header_bh == old_bh) {
3125 brelse(*header_bh);
3126 *header_bh = new_bh;
3127 get_bh(*header_bh);
3128
3129 brelse(*first_bh);
3130 *first_bh = new_first_bh;
3131 get_bh(*first_bh);
3132 }
3133 brelse(new_bh);
3134 brelse(old_bh);
3135 }
3136
3137 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3138
3139 ocfs2_journal_dirty(handle, prev_bh);
3140out:
3141 brelse(prev_bh);
3142 brelse(new_first_bh);
3143 return ret;
3144}
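
/*
 * Editor's note: an illustrative user-space sketch (not kernel code) of the
 * split arithmetic used above: prev_blkno is advanced past num_clusters - 1
 * whole clusters and then half a cluster, so the copy loop starts at the
 * second half of the last cluster. The names and sizes are assumptions for
 * the example (4K blocks, 32K clusters, so bpc = 8).
 */
#include <stdio.h>

static unsigned long long xattr_split_point(unsigned long long prev_blkno,
					    unsigned int num_clusters,
					    unsigned int bpc)
{
	/* Skip to the last cluster, then to its second half. */
	return prev_blkno + (unsigned long long)(num_clusters - 1) * bpc
	       + bpc / 2;
}

int main(void)
{
	/* prev_blkno = 1000, 3 clusters of 8 blocks: split at block 1020. */
	printf("%llu\n", xattr_split_point(1000, 3, 8));
	return 0;
}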
3145
3146static int ocfs2_read_xattr_bucket(struct inode *inode,
3147 u64 blkno,
3148 struct buffer_head **bhs,
3149 int new)
3150{
3151 int ret = 0;
3152 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3153
3154 if (!new)
3155 return ocfs2_read_blocks(inode, blkno,
3156 blk_per_bucket, bhs,
3157 OCFS2_BH_CACHED);
3158
3159 for (i = 0; i < blk_per_bucket; i++) {
3160 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3161 if (bhs[i] == NULL) {
3162 ret = -EIO;
3163 mlog_errno(ret);
3164 break;
3165 }
3166 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3167 }
3168
3169 return ret;
3170}
3171
3172/*
3173 * Move half of the xattrs in the old bucket (blk) to the new bucket (new_blk).
3174 * first_hash will record the 1st hash of the new bucket.
3175 */
3176static int ocfs2_half_xattr_bucket(struct inode *inode,
3177 handle_t *handle,
3178 u64 blk,
3179 u64 new_blk,
3180 u32 *first_hash,
3181 int new_bucket_head)
3182{
3183 int ret, i;
3184 u16 count, start, len, name_value_len, xe_len, name_offset;
3185 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3186 struct buffer_head **s_bhs, **t_bhs = NULL;
3187 struct ocfs2_xattr_header *xh;
3188 struct ocfs2_xattr_entry *xe;
3189 int blocksize = inode->i_sb->s_blocksize;
3190
3191 mlog(0, "move half of xattrs from bucket %llu to %llu\n",
3192 blk, new_blk);
3193
3194 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3195 if (!s_bhs)
3196 return -ENOMEM;
3197
3198 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3199 if (ret) {
3200 mlog_errno(ret);
3201 goto out;
3202 }
3203
3204 ret = ocfs2_journal_access(handle, inode, s_bhs[0],
3205 OCFS2_JOURNAL_ACCESS_WRITE);
3206 if (ret) {
3207 mlog_errno(ret);
3208 goto out;
3209 }
3210
3211 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3212 if (!t_bhs) {
3213 ret = -ENOMEM;
3214 goto out;
3215 }
3216
3217 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
3218 if (ret) {
3219 mlog_errno(ret);
3220 goto out;
3221 }
3222
3223 for (i = 0; i < blk_per_bucket; i++) {
3224 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3225 OCFS2_JOURNAL_ACCESS_CREATE);
3226 if (ret) {
3227 mlog_errno(ret);
3228 goto out;
3229 }
3230 }
3231
3232 /* Copy the whole bucket to the new one first. */
3233 for (i = 0; i < blk_per_bucket; i++)
3234 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3235
3236 /* update the new bucket. */
3237 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
3238 count = le16_to_cpu(xh->xh_count);
3239 start = count / 2;
3240
3241 /*
3242 * Calculate the total name/value len and xh_free_start for
3243 * the old bucket first.
3244 */
3245 name_offset = OCFS2_XATTR_BUCKET_SIZE;
3246 name_value_len = 0;
3247 for (i = 0; i < start; i++) {
3248 xe = &xh->xh_entries[i];
3249 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3250 if (ocfs2_xattr_is_local(xe))
3251 xe_len +=
3252 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3253 else
3254 xe_len += OCFS2_XATTR_ROOT_SIZE;
3255 name_value_len += xe_len;
3256 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
3257 name_offset = le16_to_cpu(xe->xe_name_offset);
3258 }
3259
3260 /*
3261 * Now begin the modification to the new bucket.
3262 *
3263 * In the new bucket, we just move the xattr entries to the beginning
3264 * and don't touch the names/values. So there will be some holes in the
3265 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
3266 * called.
3267 */
3268 xe = &xh->xh_entries[start];
3269 len = sizeof(struct ocfs2_xattr_entry) * (count - start);
3270 mlog(0, "mv xattr entry len %d from %d to %d\n", len,
3271 (int)((char *)xe - (char *)xh),
3272 (int)((char *)xh->xh_entries - (char *)xh));
3273 memmove((char *)xh->xh_entries, (char *)xe, len);
3274 xe = &xh->xh_entries[count - start];
3275 len = sizeof(struct ocfs2_xattr_entry) * start;
3276 memset((char *)xe, 0, len);
3277
3278 le16_add_cpu(&xh->xh_count, -start);
3279 le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
3280
3281 /* Calculate xh_free_start for the new bucket. */
3282 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3283 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
3284 xe = &xh->xh_entries[i];
3285 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3286 if (ocfs2_xattr_is_local(xe))
3287 xe_len +=
3288 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3289 else
3290 xe_len += OCFS2_XATTR_ROOT_SIZE;
3291 if (le16_to_cpu(xe->xe_name_offset) <
3292 le16_to_cpu(xh->xh_free_start))
3293 xh->xh_free_start = xe->xe_name_offset;
3294 }
3295
3296 /* set xh->xh_num_buckets for the new xh. */
3297 if (new_bucket_head)
3298 xh->xh_num_buckets = cpu_to_le16(1);
3299 else
3300 xh->xh_num_buckets = 0;
3301
3302 for (i = 0; i < blk_per_bucket; i++) {
3303 ret = ocfs2_journal_dirty(handle, t_bhs[i]);
3304 if (ret)
3305 mlog_errno(ret);
3306 }
3307
3308 /* store the first_hash of the new bucket. */
3309 if (first_hash)
3310 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3311
3312 /*
3313 * Now only update the 1st block of the old bucket.
3314 * Note that the entries have already been sorted above.
3315 */
3316 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
3317 memset(&xh->xh_entries[start], 0,
3318 sizeof(struct ocfs2_xattr_entry) * (count - start));
3319 xh->xh_count = cpu_to_le16(start);
3320 xh->xh_free_start = cpu_to_le16(name_offset);
3321 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3322
3323 ret = ocfs2_journal_dirty(handle, s_bhs[0]);
3324 if (ret)
3325 mlog_errno(ret);
3326
3327out:
3328 if (s_bhs) {
3329 for (i = 0; i < blk_per_bucket; i++)
3330 brelse(s_bhs[i]);
3331 }
3332 kfree(s_bhs);
3333
3334 if (t_bhs) {
3335 for (i = 0; i < blk_per_bucket; i++)
3336 brelse(t_bhs[i]);
3337 }
3338 kfree(t_bhs);
3339
3340 return ret;
3341}
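
/*
 * Editor's note: a minimal sketch of the xh_free_start recomputation in
 * ocfs2_half_xattr_bucket above. Free space in a bucket starts at the lowest
 * name_offset among the remaining entries. The toy types are hypothetical,
 * not the on-disk ocfs2 structures.
 */
#include <stdint.h>

struct toy_xattr_entry {
	uint16_t name_offset;
};

static uint16_t recompute_free_start(const struct toy_xattr_entry *e,
				     int count, uint16_t bucket_size)
{
	uint16_t free_start = bucket_size;
	int i;

	for (i = 0; i < count; i++)
		if (e[i].name_offset < free_start)
			free_start = e[i].name_offset;
	return free_start;
}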
3342
3343/*
3344 * Copy xattr from one bucket to another bucket.
3345 *
3346 * The caller must make sure that the journal transaction
3347 * has enough space for journaling.
3348 */
3349static int ocfs2_cp_xattr_bucket(struct inode *inode,
3350 handle_t *handle,
3351 u64 s_blkno,
3352 u64 t_blkno,
3353 int t_is_new)
3354{
3355 int ret, i;
3356 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3357 int blocksize = inode->i_sb->s_blocksize;
3358 struct buffer_head **s_bhs, **t_bhs = NULL;
3359
3360 BUG_ON(s_blkno == t_blkno);
3361
3362 mlog(0, "cp bucket %llu to %llu, target is %d\n",
3363 s_blkno, t_blkno, t_is_new);
3364
3365 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3366 GFP_NOFS);
3367 if (!s_bhs)
3368 return -ENOMEM;
3369
3370 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
3371 if (ret)
3372 goto out;
3373
3374 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3375 GFP_NOFS);
3376 if (!t_bhs) {
3377 ret = -ENOMEM;
3378 goto out;
3379 }
3380
3381 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
3382 if (ret)
3383 goto out;
3384
3385 for (i = 0; i < blk_per_bucket; i++) {
3386 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3387 OCFS2_JOURNAL_ACCESS_WRITE);
3388 if (ret)
3389 goto out;
3390 }
3391
3392 for (i = 0; i < blk_per_bucket; i++) {
3393 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3394 ocfs2_journal_dirty(handle, t_bhs[i]);
3395 }
3396
3397out:
3398 if (s_bhs) {
3399 for (i = 0; i < blk_per_bucket; i++)
3400 brelse(s_bhs[i]);
3401 }
3402 kfree(s_bhs);
3403
3404 if (t_bhs) {
3405 for (i = 0; i < blk_per_bucket; i++)
3406 brelse(t_bhs[i]);
3407 }
3408 kfree(t_bhs);
3409
3410 return ret;
3411}
3412
3413/*
3414 * Copy one xattr cluster from src_blk to to_blk.
3415 * The bucket at to_blk will become the first bucket header of the cluster, so
3416 * its xh_num_buckets will be initialized to the number of buckets in the cluster.
3417 */
3418static int ocfs2_cp_xattr_cluster(struct inode *inode,
3419 handle_t *handle,
3420 struct buffer_head *first_bh,
3421 u64 src_blk,
3422 u64 to_blk,
3423 u32 *first_hash)
3424{
3425 int i, ret, credits;
3426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3427 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3428 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3429 struct buffer_head *bh = NULL;
3430 struct ocfs2_xattr_header *xh;
3431 u64 to_blk_start = to_blk;
3432
3433 mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
3434
3435 /*
3436 * We need to update the new cluster and 1 more for the update of
3437 * the 1st bucket of the previous extent rec.
3438 */
3439 credits = bpc + 1;
3440 ret = ocfs2_extend_trans(handle, credits);
3441 if (ret) {
3442 mlog_errno(ret);
3443 goto out;
3444 }
3445
3446 ret = ocfs2_journal_access(handle, inode, first_bh,
3447 OCFS2_JOURNAL_ACCESS_WRITE);
3448 if (ret) {
3449 mlog_errno(ret);
3450 goto out;
3451 }
3452
3453 for (i = 0; i < num_buckets; i++) {
3454 ret = ocfs2_cp_xattr_bucket(inode, handle,
3455 src_blk, to_blk, 1);
3456 if (ret) {
3457 mlog_errno(ret);
3458 goto out;
3459 }
3460
3461 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3462 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3463 }
3464
3465 /* update the old bucket header. */
3466 xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3467 le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
3468
3469 ocfs2_journal_dirty(handle, first_bh);
3470
3471 /* update the new bucket header. */
3472 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3473 if (ret < 0) {
3474 mlog_errno(ret);
3475 goto out;
3476 }
3477
3478 ret = ocfs2_journal_access(handle, inode, bh,
3479 OCFS2_JOURNAL_ACCESS_WRITE);
3480 if (ret) {
3481 mlog_errno(ret);
3482 goto out;
3483 }
3484
3485 xh = (struct ocfs2_xattr_header *)bh->b_data;
3486 xh->xh_num_buckets = cpu_to_le16(num_buckets);
3487
3488 ocfs2_journal_dirty(handle, bh);
3489
3490 if (first_hash)
3491 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3492out:
3493 brelse(bh);
3494 return ret;
3495}
3496
3497/*
3498 * Move half of the xattrs in this cluster to the new cluster.
3499 * This function should only be called when bucket size == cluster size.
3500 * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
3501 */
3502static int ocfs2_half_xattr_cluster(struct inode *inode,
3503 handle_t *handle,
3504 u64 prev_blk,
3505 u64 new_blk,
3506 u32 *first_hash)
3507{
3508 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3509 int ret, credits = 2 * blk_per_bucket;
3510
3511 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3512
3513 ret = ocfs2_extend_trans(handle, credits);
3514 if (ret) {
3515 mlog_errno(ret);
3516 return ret;
3517 }
3518
3519 /* Move half of the xattrs in prev_blk to the new bucket. */
3520 return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
3521 new_blk, first_hash, 1);
3522}
3523
3524/*
3525 * Move some xattrs from the old cluster to the new one since they are not
3526 * contiguous in the ocfs2 xattr tree.
3527 *
3528 * new_blk starts a new separate cluster, and we will move some xattrs from
3529 * prev_blk to it. v_start will be set to the first name hash value in this
3530 * new cluster so that it can be used as e_cpos during tree insertion and
3531 * doesn't collide with our original b-tree operations. first_bh and header_bh
3532 * will also be updated since they will be used in ocfs2_extend_xattr_bucket
3533 * to extend the insert bucket.
3534 *
3535 * The questions are how many xattrs we should move to the new cluster and
3536 * when we should update first_bh and header_bh:
3537 * 1. If cluster size > bucket size, the previous cluster has more than one
3538 *    bucket, so just move half of the buckets into the new cluster and
3539 *    update first_bh and header_bh if the insert bucket has been moved
3540 *    to the new cluster.
3541 * 2. If cluster_size == bucket_size:
3542 *    a) If the previous extent rec has more than one cluster and the insert
3543 *       place isn't in the last cluster, copy the entire last cluster to the
3544 *       new one. In this case, we don't need to update first_bh and header_bh
3545 *       since they will not be moved into the new cluster.
3546 *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
3547 *       the new one. We also set the extend flag to zero if the insert place
3548 *       is moved into the newly allocated cluster since no extend is needed.
3549 */
3550static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3551 handle_t *handle,
3552 struct buffer_head **first_bh,
3553 struct buffer_head **header_bh,
3554 u64 new_blk,
3555 u64 prev_blk,
3556 u32 prev_clusters,
3557 u32 *v_start,
3558 int *extend)
3559{
3560 int ret = 0;
3561 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3562
3563 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3564 prev_blk, prev_clusters, new_blk);
3565
3566 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
3567 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3568 handle,
3569 first_bh,
3570 header_bh,
3571 new_blk,
3572 prev_blk,
3573 prev_clusters,
3574 v_start);
3575 else {
3576 u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
3577
3578 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
3579 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
3580 last_blk, new_blk,
3581 v_start);
3582 else {
3583 ret = ocfs2_half_xattr_cluster(inode, handle,
3584 last_blk, new_blk,
3585 v_start);
3586
3587 if ((*header_bh)->b_blocknr == last_blk && extend)
3588 *extend = 0;
3589 }
3590 }
3591
3592 return ret;
3593}
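
/*
 * Editor's note: the case analysis from the comment before
 * ocfs2_adjust_xattr_cross_cluster, distilled into a side-effect-free
 * sketch. The enum and names are hypothetical; the real function operates
 * on buffer_heads and the journal.
 */
enum xattr_split_strategy {
	MV_HALF_BUCKETS,	/* case 1: cluster holds several buckets */
	CP_LAST_CLUSTER,	/* case 2a: copy the whole last cluster */
	HALF_LAST_CLUSTER,	/* case 2b: split the last cluster */
};

static enum xattr_split_strategy pick_split_strategy(int buckets_per_cluster,
						     unsigned int prev_clusters,
						     int insert_in_last_cluster)
{
	if (buckets_per_cluster > 1)
		return MV_HALF_BUCKETS;
	if (prev_clusters > 1 && !insert_in_last_cluster)
		return CP_LAST_CLUSTER;
	return HALF_LAST_CLUSTER;
}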
3594
3595/*
3596 * Add a new cluster for xattr storage.
3597 *
3598 * If the new cluster is contiguous with the previous one, it will be
3599 * appended to the same extent record, and num_clusters will be updated.
3600 * If not, we will insert a new extent for it and move some xattrs in
3601 * the last cluster into the new allocated one.
3602 * We also need to limit the maximum size of a btree leaf, otherwise we'll
3603 * lose the benefits of hashing because we'll have to search large leaves.
3604 * So the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE (or the cluster
3605 * size, if that is bigger).
3606 *
3607 * first_bh is the first block of the previous extent rec and header_bh
3608 * indicates the bucket into which we will insert the new xattrs. Both are
3609 * updated when header_bh is moved into the new cluster.
3610 */
3611static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3612 struct buffer_head *root_bh,
3613 struct buffer_head **first_bh,
3614 struct buffer_head **header_bh,
3615 u32 *num_clusters,
3616 u32 prev_cpos,
3617 u64 prev_blkno,
3618 int *extend)
3619{
3620 int ret, credits;
3621 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3622 u32 prev_clusters = *num_clusters;
3623 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3624 u64 block;
3625 handle_t *handle = NULL;
3626 struct ocfs2_alloc_context *data_ac = NULL;
3627 struct ocfs2_alloc_context *meta_ac = NULL;
3628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3629 struct ocfs2_extent_tree et;
3630
3631 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3632 "previous xattr blkno = %llu\n",
3633 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3634 prev_cpos, prev_blkno);
3635
3636 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3637
3638 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
3639 &data_ac, &meta_ac);
3640 if (ret) {
3641 mlog_errno(ret);
3642 goto leave;
3643 }
3644
3645 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3646 clusters_to_add);
3647 handle = ocfs2_start_trans(osb, credits);
3648 if (IS_ERR(handle)) {
3649 ret = PTR_ERR(handle);
3650 handle = NULL;
3651 mlog_errno(ret);
3652 goto leave;
3653 }
3654
3655 ret = ocfs2_journal_access(handle, inode, root_bh,
3656 OCFS2_JOURNAL_ACCESS_WRITE);
3657 if (ret < 0) {
3658 mlog_errno(ret);
3659 goto leave;
3660 }
3661
3662 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
3663 clusters_to_add, &bit_off, &num_bits);
3664 if (ret < 0) {
3665 if (ret != -ENOSPC)
3666 mlog_errno(ret);
3667 goto leave;
3668 }
3669
3670 BUG_ON(num_bits > clusters_to_add);
3671
3672 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
3673 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3674 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3675
3676 if (prev_blkno + prev_clusters * bpc == block &&
3677 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3678 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3679 /*
3680 * If this cluster is contiguous with the old one and
3681 * adding it doesn't push us past the limit of
3682 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, it can be
3683 * initialized and used like the other buckets in the
3684 * previous cluster.
3685 * So add it as a contiguous one. The caller will handle
3686 * its init process.
3687 */
3688 v_start = prev_cpos + prev_clusters;
3689 *num_clusters = prev_clusters + num_bits;
3690 mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
3691 num_bits);
3692 } else {
3693 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3694 handle,
3695 first_bh,
3696 header_bh,
3697 block,
3698 prev_blkno,
3699 prev_clusters,
3700 &v_start,
3701 extend);
3702 if (ret) {
3703 mlog_errno(ret);
3704 goto leave;
3705 }
3706 }
3707
3708 if (handle->h_buffer_credits < credits) {
3709 /*
3710 * The journal has been restarted before and doesn't
3711 * have enough space for the insertion, so extend it
3712 * here.
3713 */
3714 ret = ocfs2_extend_trans(handle, credits);
3715 if (ret) {
3716 mlog_errno(ret);
3717 goto leave;
3718 }
3719 }
3720 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3721 num_bits, block, v_start);
3722 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3723 num_bits, 0, meta_ac);
3724 if (ret < 0) {
3725 mlog_errno(ret);
3726 goto leave;
3727 }
3728
3729 ret = ocfs2_journal_dirty(handle, root_bh);
3730 if (ret < 0) {
3731 mlog_errno(ret);
3732 goto leave;
3733 }
3734
3735leave:
3736 if (handle)
3737 ocfs2_commit_trans(osb, handle);
3738 if (data_ac)
3739 ocfs2_free_alloc_context(data_ac);
3740 if (meta_ac)
3741 ocfs2_free_alloc_context(meta_ac);
3742
3743 return ret;
3744}
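
/*
 * Editor's note: a hedged sketch of the contiguity test above. A freshly
 * allocated range is appended to the previous extent record only when it is
 * physically adjacent to it and the grown leaf would still fit within
 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE. Hypothetical helper, not kernel code.
 */
static int xattr_extent_can_grow(unsigned long long prev_blkno,
				 unsigned int prev_clusters,
				 unsigned int bpc,
				 unsigned long long new_block,
				 unsigned int new_bits,
				 unsigned int clustersize_bits,
				 unsigned long long max_leaf_size)
{
	return prev_blkno + (unsigned long long)prev_clusters * bpc == new_block &&
	       ((unsigned long long)(prev_clusters + new_bits)
			<< clustersize_bits) <= max_leaf_size;
}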
3745
3746/*
3747 * Extend a new xattr bucket and move xattrs to the end one by one until
3748 * we reach start_bh. Only half of its xattrs are moved to the bucket after it.
3749 */
3750static int ocfs2_extend_xattr_bucket(struct inode *inode,
3751 struct buffer_head *first_bh,
3752 struct buffer_head *start_bh,
3753 u32 num_clusters)
3754{
3755 int ret, credits;
3756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3757 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3758 u64 start_blk = start_bh->b_blocknr, end_blk;
3759 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
3760 handle_t *handle;
3761 struct ocfs2_xattr_header *first_xh =
3762 (struct ocfs2_xattr_header *)first_bh->b_data;
3763 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3764
3765 mlog(0, "extend xattr bucket in %llu, xattr extent rec starting "
3766 "from %llu, len = %u\n", start_blk,
3767 (unsigned long long)first_bh->b_blocknr, num_clusters);
3768
3769 BUG_ON(bucket >= num_buckets);
3770
3771 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
3772
3773 /*
3774 * We will touch all the buckets after start_bh (including it).
3775 * Add one more bucket and modify first_bh.
3776 */
3777 credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
3778 handle = ocfs2_start_trans(osb, credits);
3779 if (IS_ERR(handle)) {
3780 ret = PTR_ERR(handle);
3781 handle = NULL;
3782 mlog_errno(ret);
3783 goto out;
3784 }
3785
3786 ret = ocfs2_journal_access(handle, inode, first_bh,
3787 OCFS2_JOURNAL_ACCESS_WRITE);
3788 if (ret) {
3789 mlog_errno(ret);
3790 goto commit;
3791 }
3792
3793 while (end_blk != start_blk) {
3794 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3795 end_blk + blk_per_bucket, 0);
3796 if (ret)
3797 goto commit;
3798 end_blk -= blk_per_bucket;
3799 }
3800
3801 /* Move half of the xattr in start_blk to the next bucket. */
3802 ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
3803 start_blk + blk_per_bucket, NULL, 0);
3804
3805 le16_add_cpu(&first_xh->xh_num_buckets, 1);
3806 ocfs2_journal_dirty(handle, first_bh);
3807
3808commit:
3809 ocfs2_commit_trans(osb, handle);
3810out:
3811 return ret;
3812}
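
/*
 * Editor's note: the bucket-shift loop above, restated over a flat array as
 * a sketch. Buckets start..last each move one slot toward the end, working
 * backwards so no source bucket is overwritten before it is copied; the
 * real code performs the same walk with journaled block copies.
 */
#include <string.h>

static void shift_buckets_right(char *area, size_t bucket_size,
				int start, int last)
{
	int i;

	for (i = last; i >= start; i--)
		memcpy(area + (size_t)(i + 1) * bucket_size,
		       area + (size_t)i * bucket_size, bucket_size);
}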
3813
3814/*
3815 * Add new xattr bucket in an extent record and adjust the buckets accordingly.
3816 * xb_bh is the ocfs2_xattr_block.
3817 * We will move all the buckets starting from header_bh one place forward. As
3818 * for header_bh itself, half of its xattrs will be moved to the next bucket.
3819 *
3820 * We will allocate a new cluster if current cluster is full and adjust
3821 * header_bh and first_bh if the insert place is moved to the new cluster.
3822 */
3823static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3824 struct buffer_head *xb_bh,
3825 struct buffer_head *header_bh)
3826{
3827 struct ocfs2_xattr_header *first_xh = NULL;
3828 struct buffer_head *first_bh = NULL;
3829 struct ocfs2_xattr_block *xb =
3830 (struct ocfs2_xattr_block *)xb_bh->b_data;
3831 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3832 struct ocfs2_extent_list *el = &xb_root->xt_list;
3833 struct ocfs2_xattr_header *xh =
3834 (struct ocfs2_xattr_header *)header_bh->b_data;
3835 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3836 struct super_block *sb = inode->i_sb;
3837 struct ocfs2_super *osb = OCFS2_SB(sb);
3838 int ret, num_buckets, extend = 1;
3839 u64 p_blkno;
3840 u32 e_cpos, num_clusters;
3841
3842 mlog(0, "Add new xattr bucket starting from %llu\n",
3843 (unsigned long long)header_bh->b_blocknr);
3844
3845 /*
3846 * Add a reference for header_bh here because it may be
3847 * changed in ocfs2_add_new_xattr_cluster and we need
3848 * to free it in the end.
3849 */
3850 get_bh(header_bh);
3851
3852 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3853 &num_clusters, el);
3854 if (ret) {
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 ret = ocfs2_read_block(inode, p_blkno, &first_bh);
3860 if (ret) {
3861 mlog_errno(ret);
3862 goto out;
3863 }
3864
3865 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3866 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3867
3868 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
3869 ret = ocfs2_add_new_xattr_cluster(inode,
3870 xb_bh,
3871 &first_bh,
3872 &header_bh,
3873 &num_clusters,
3874 e_cpos,
3875 p_blkno,
3876 &extend);
3877 if (ret) {
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881 }
3882
3883 if (extend)
3884 ret = ocfs2_extend_xattr_bucket(inode,
3885 first_bh,
3886 header_bh,
3887 num_clusters);
3888 if (ret)
3889 mlog_errno(ret);
3890out:
3891 brelse(first_bh);
3892 brelse(header_bh);
3893 return ret;
3894}
3895
3896static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3897 struct ocfs2_xattr_bucket *bucket,
3898 int offs)
3899{
3900 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3901
3902 offs = offs % inode->i_sb->s_blocksize;
3903 return bucket->bhs[block_off]->b_data + offs;
3904}
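
/*
 * Editor's note: a worked example of the offset split above, assuming a 4K
 * block size (s_blocksize_bits == 12). An in-bucket offset of 5000 lands in
 * block 1 of the bucket, at byte 904 within that block.
 */
static void bucket_offset_example(void)
{
	int offs = 5000, bits = 12, blocksize = 1 << bits;
	int block_off = offs >> bits;		/* 5000 / 4096 = 1 */
	int in_block = offs % blocksize;	/* 5000 - 4096 = 904 */

	(void)block_off;
	(void)in_block;
}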
3905
3906/*
3907 * Handle the normal xattr set, including replace, delete and new.
3908 *
3909 * Note: "local" indicates where the real data lives. So we can't
3910 * judge whether it is local to the bucket just by its length.
3911 */
3912static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3913 struct ocfs2_xattr_info *xi,
3914 struct ocfs2_xattr_search *xs,
3915 u32 name_hash,
3916 int local)
3917{
3918 struct ocfs2_xattr_entry *last, *xe;
3919 int name_len = strlen(xi->name);
3920 struct ocfs2_xattr_header *xh = xs->header;
3921 u16 count = le16_to_cpu(xh->xh_count), start;
3922 size_t blocksize = inode->i_sb->s_blocksize;
3923 char *val;
3924 size_t offs, size, new_size;
3925
3926 last = &xh->xh_entries[count];
3927 if (!xs->not_found) {
3928 xe = xs->here;
3929 offs = le16_to_cpu(xe->xe_name_offset);
3930 if (ocfs2_xattr_is_local(xe))
3931 size = OCFS2_XATTR_SIZE(name_len) +
3932 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3933 else
3934 size = OCFS2_XATTR_SIZE(name_len) +
3935 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
3936
3937 /*
3938 * If the new value will be stored outside, xi->value has been
3939 * initialized as an empty ocfs2_xattr_value_root, and the same
3940 * goes with xi->value_len, so we can set new_size safely here.
3941 * See ocfs2_xattr_set_in_bucket.
3942 */
3943 new_size = OCFS2_XATTR_SIZE(name_len) +
3944 OCFS2_XATTR_SIZE(xi->value_len);
3945
3946 le16_add_cpu(&xh->xh_name_value_len, -size);
3947 if (xi->value) {
3948 if (new_size > size)
3949 goto set_new_name_value;
3950
3951 /* Now replace the old value with new one. */
3952 if (local)
3953 xe->xe_value_size = cpu_to_le64(xi->value_len);
3954 else
3955 xe->xe_value_size = 0;
3956
3957 val = ocfs2_xattr_bucket_get_val(inode,
3958 &xs->bucket, offs);
3959 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3960 size - OCFS2_XATTR_SIZE(name_len));
3961 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
3962 memcpy(val + OCFS2_XATTR_SIZE(name_len),
3963 xi->value, xi->value_len);
3964
3965 le16_add_cpu(&xh->xh_name_value_len, new_size);
3966 ocfs2_xattr_set_local(xe, local);
3967 return;
3968 } else {
3969 /*
3970 * Remove the old entry if there is more than one.
3971 * We don't remove the last entry so that we can
3972 * use it to indicate the hash value of the empty
3973 * bucket.
3974 */
3975 last -= 1;
3976 le16_add_cpu(&xh->xh_count, -1);
3977 if (xh->xh_count) {
3978 memmove(xe, xe + 1,
3979 (void *)last - (void *)xe);
3980 memset(last, 0,
3981 sizeof(struct ocfs2_xattr_entry));
3982 } else
3983 xh->xh_free_start =
3984 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3985
3986 return;
3987 }
3988 } else {
3989 /* find a new entry for insert. */
3990 int low = 0, high = count - 1, tmp;
3991 struct ocfs2_xattr_entry *tmp_xe;
3992
3993 while (low <= high && count) {
3994 tmp = (low + high) / 2;
3995 tmp_xe = &xh->xh_entries[tmp];
3996
3997 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
3998 low = tmp + 1;
3999 else if (name_hash <
4000 le32_to_cpu(tmp_xe->xe_name_hash))
4001 high = tmp - 1;
4002 else {
4003 low = tmp;
4004 break;
4005 }
4006 }
4007
4008 xe = &xh->xh_entries[low];
4009 if (low != count)
4010 memmove(xe + 1, xe, (void *)last - (void *)xe);
4011
4012 le16_add_cpu(&xh->xh_count, 1);
4013 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4014 xe->xe_name_hash = cpu_to_le32(name_hash);
4015 xe->xe_name_len = name_len;
4016 ocfs2_xattr_set_type(xe, xi->name_index);
4017 }
4018
4019set_new_name_value:
4020 /* Insert the new name+value. */
4021 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4022
4023 /*
4024 * We must make sure that the name/value pair
4025 * is stored within a single block.
4026 */
4027 offs = le16_to_cpu(xh->xh_free_start);
4028 start = offs - size;
4029
4030 if (start >> inode->i_sb->s_blocksize_bits !=
4031 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4032 offs = offs - offs % blocksize;
4033 xh->xh_free_start = cpu_to_le16(offs);
4034 }
4035
4036 val = ocfs2_xattr_bucket_get_val(inode,
4037 &xs->bucket, offs - size);
4038 xe->xe_name_offset = cpu_to_le16(offs - size);
4039
4040 memset(val, 0, size);
4041 memcpy(val, xi->name, name_len);
4042 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4043
4044 xe->xe_value_size = cpu_to_le64(xi->value_len);
4045 ocfs2_xattr_set_local(xe, local);
4046 xs->here = xe;
4047 le16_add_cpu(&xh->xh_free_start, -size);
4048 le16_add_cpu(&xh->xh_name_value_len, size);
4049
4050 return;
4051}
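
/*
 * Editor's note: the insertion-point search in ocfs2_xattr_set_entry_normal
 * above is a binary search on xe_name_hash; here it is restated over a plain
 * sorted u32 array as a sketch.
 */
#include <stdint.h>

static int find_insert_slot(const uint32_t *hashes, int count, uint32_t hash)
{
	int low = 0, high = count - 1, mid;

	while (low <= high && count) {
		mid = (low + high) / 2;
		if (hash > hashes[mid])
			low = mid + 1;
		else if (hash < hashes[mid])
			high = mid - 1;
		else
			return mid;	/* equal hash: insert at this slot */
	}
	return low;			/* first slot with a larger hash */
}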
4052
4053static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4054 handle_t *handle,
4055 struct ocfs2_xattr_search *xs,
4056 struct buffer_head **bhs,
4057 u16 bh_num)
4058{
4059 int ret = 0, off, block_off;
4060 struct ocfs2_xattr_entry *xe = xs->here;
4061
4062 /*
4063 * First calculate all the blocks we should journal_access
4064 * and journal_dirty. The first block should always be touched.
4065 */
4066 ret = ocfs2_journal_dirty(handle, bhs[0]);
4067 if (ret)
4068 mlog_errno(ret);
4069
4070 /* calc the data. */
4071 off = le16_to_cpu(xe->xe_name_offset);
4072 block_off = off >> inode->i_sb->s_blocksize_bits;
4073 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4074 if (ret)
4075 mlog_errno(ret);
4076
4077 return ret;
4078}
4079
4080/*
4081 * Set the xattr entry in the specified bucket.
4082 * The bucket is indicated by xs->bucket and it should have enough
4083 * space for the xattr insertion.
4084 */
4085static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4086 struct ocfs2_xattr_info *xi,
4087 struct ocfs2_xattr_search *xs,
4088 u32 name_hash,
4089 int local)
4090{
4091 int i, ret;
4092 handle_t *handle = NULL;
4093 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4094 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4095
4096 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4097 (unsigned long)xi->value_len, xi->name_index,
4098 (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
4099
4100 if (!xs->bucket.bhs[1]) {
4101 ret = ocfs2_read_blocks(inode,
4102 xs->bucket.bhs[0]->b_blocknr + 1,
4103 blk_per_bucket - 1, &xs->bucket.bhs[1],
4104 OCFS2_BH_CACHED);
4105 if (ret) {
4106 mlog_errno(ret);
4107 goto out;
4108 }
4109 }
4110
4111 handle = ocfs2_start_trans(osb, blk_per_bucket);
4112 if (IS_ERR(handle)) {
4113 ret = PTR_ERR(handle);
4114 handle = NULL;
4115 mlog_errno(ret);
4116 goto out;
4117 }
4118
4119 for (i = 0; i < blk_per_bucket; i++) {
4120 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4121 OCFS2_JOURNAL_ACCESS_WRITE);
4122 if (ret < 0) {
4123 mlog_errno(ret);
4124 goto out;
4125 }
4126 }
4127
4128 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4129
4130 /* Only dirty the blocks we have touched in set xattr. */
4131 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4132 xs->bucket.bhs, blk_per_bucket);
4133 if (ret)
4134 mlog_errno(ret);
4135out:
4136 if (handle)
4137  ocfs2_commit_trans(osb, handle);
4137
4138 return ret;
4139}
4140
4141static int ocfs2_xattr_value_update_size(struct inode *inode,
4142 struct buffer_head *xe_bh,
4143 struct ocfs2_xattr_entry *xe,
4144 u64 new_size)
4145{
4146 int ret;
4147 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4148 handle_t *handle = NULL;
4149
4150 handle = ocfs2_start_trans(osb, 1);
4151 if (IS_ERR(handle)) {
4152  ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 ret = ocfs2_journal_access(handle, inode, xe_bh,
4158 OCFS2_JOURNAL_ACCESS_WRITE);
4159 if (ret < 0) {
4160 mlog_errno(ret);
4161 goto out_commit;
4162 }
4163
4164 xe->xe_value_size = cpu_to_le64(new_size);
4165
4166 ret = ocfs2_journal_dirty(handle, xe_bh);
4167 if (ret < 0)
4168 mlog_errno(ret);
4169
4170out_commit:
4171 ocfs2_commit_trans(osb, handle);
4172out:
4173 return ret;
4174}
4175
4176/*
4177 * Truncate the value of the entry at index xe_off in the xattr bucket
4178 * indicated by header_bh; len is the new length.
4179 * Both the ocfs2_xattr_value_root and the entry will be updated here.
4182 */
4183static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4184 struct buffer_head *header_bh,
4185 int xe_off,
4186 int len)
4187{
4188 int ret, offset;
4189 u64 value_blk;
4190 struct buffer_head *value_bh = NULL;
4191 struct ocfs2_xattr_value_root *xv;
4192 struct ocfs2_xattr_entry *xe;
4193 struct ocfs2_xattr_header *xh =
4194 (struct ocfs2_xattr_header *)header_bh->b_data;
4195 size_t blocksize = inode->i_sb->s_blocksize;
4196
4197 xe = &xh->xh_entries[xe_off];
4198
4199 BUG_ON(!xe || ocfs2_xattr_is_local(xe));
4200
4201 offset = le16_to_cpu(xe->xe_name_offset) +
4202 OCFS2_XATTR_SIZE(xe->xe_name_len);
4203
4204 value_blk = offset / blocksize;
4205
4206 /* We don't allow an ocfs2_xattr_value_root to be split across blocks. */
4207 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4208 value_blk += header_bh->b_blocknr;
4209
4210 ret = ocfs2_read_block(inode, value_blk, &value_bh);
4211 if (ret) {
4212 mlog_errno(ret);
4213 goto out;
4214 }
4215
4216 xv = (struct ocfs2_xattr_value_root *)
4217 (value_bh->b_data + offset % blocksize);
4218
4219 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4220 xe_off, (unsigned long long)header_bh->b_blocknr, len);
4221 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4222 if (ret) {
4223 mlog_errno(ret);
4224 goto out;
4225 }
4226
4227 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
4228 if (ret) {
4229 mlog_errno(ret);
4230 goto out;
4231 }
4232
4233out:
4234 brelse(value_bh);
4235 return ret;
4236}
4237
4238static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4239 struct ocfs2_xattr_search *xs,
4240 int len)
4241{
4242 int ret, offset;
4243 struct ocfs2_xattr_entry *xe = xs->here;
4244 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4245
4246 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4247
4248 offset = xe - xh->xh_entries;
4249 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
4250 offset, len);
4251 if (ret)
4252 mlog_errno(ret);
4253
4254 return ret;
4255}
4256
4257static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4258 struct ocfs2_xattr_search *xs,
4259 char *val,
4260 int value_len)
4261{
4262 int offset;
4263 struct ocfs2_xattr_value_root *xv;
4264 struct ocfs2_xattr_entry *xe = xs->here;
4265
4266 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4267
4268 offset = le16_to_cpu(xe->xe_name_offset) +
4269 OCFS2_XATTR_SIZE(xe->xe_name_len);
4270
4271 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4272
4273 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
4274}
4275
4276static int ocfs2_rm_xattr_cluster(struct inode *inode,
4277 struct buffer_head *root_bh,
4278 u64 blkno,
4279 u32 cpos,
4280 u32 len)
4281{
4282 int ret;
4283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4284 struct inode *tl_inode = osb->osb_tl_inode;
4285 handle_t *handle;
4286 struct ocfs2_xattr_block *xb =
4287 (struct ocfs2_xattr_block *)root_bh->b_data;
4288 struct ocfs2_alloc_context *meta_ac = NULL;
4289 struct ocfs2_cached_dealloc_ctxt dealloc;
4290 struct ocfs2_extent_tree et;
4291
4292 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
4293
4294 ocfs2_init_dealloc_ctxt(&dealloc);
4295
4296 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4297 cpos, len, (unsigned long long)blkno);
4298
4299 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
4300
4301 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4302 if (ret) {
4303 mlog_errno(ret);
4304 return ret;
4305 }
4306
4307 mutex_lock(&tl_inode->i_mutex);
4308
4309 if (ocfs2_truncate_log_needs_flush(osb)) {
4310 ret = __ocfs2_flush_truncate_log(osb);
4311 if (ret < 0) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315 }
4316
4317 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
4318 if (IS_ERR(handle)) {
4319  ret = PTR_ERR(handle);
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 ret = ocfs2_journal_access(handle, inode, root_bh,
4325 OCFS2_JOURNAL_ACCESS_WRITE);
4326 if (ret) {
4327 mlog_errno(ret);
4328 goto out_commit;
4329 }
4330
4331 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
4332 &dealloc);
4333 if (ret) {
4334 mlog_errno(ret);
4335 goto out_commit;
4336 }
4337
4338 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
4339
4340 ret = ocfs2_journal_dirty(handle, root_bh);
4341 if (ret) {
4342 mlog_errno(ret);
4343 goto out_commit;
4344 }
4345
4346 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
4347 if (ret)
4348 mlog_errno(ret);
4349
4350out_commit:
4351 ocfs2_commit_trans(osb, handle);
4352out:
4353 ocfs2_schedule_truncate_log_flush(osb, 1);
4354
4355 mutex_unlock(&tl_inode->i_mutex);
4356
4357 if (meta_ac)
4358 ocfs2_free_alloc_context(meta_ac);
4359
4360 ocfs2_run_deallocs(osb, &dealloc);
4361
4362 return ret;
4363}
4364
4365static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4366 struct ocfs2_xattr_search *xs)
4367{
4368 handle_t *handle = NULL;
4369 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4370 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4371 le16_to_cpu(xh->xh_count) - 1];
4372 int ret = 0;
4373
4374 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
4375 if (IS_ERR(handle)) {
4376 ret = PTR_ERR(handle);
4377 mlog_errno(ret);
4378 return;
4379 }
4380
4381 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4382 OCFS2_JOURNAL_ACCESS_WRITE);
4383 if (ret) {
4384 mlog_errno(ret);
4385 goto out_commit;
4386 }
4387
4388 /* Remove the old entry. */
4389 memmove(xs->here, xs->here + 1,
4390 (void *)last - (void *)xs->here);
4391 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4392 le16_add_cpu(&xh->xh_count, -1);
4393
4394 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
4395 if (ret < 0)
4396 mlog_errno(ret);
4397out_commit:
4398 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4399}
4400
4401/*
4402 * Set the xattr name/value in the bucket specified in xs.
4403 *
4404 * As the new value in xi may be stored in the bucket or in an outside cluster,
4405 * we divide the whole process into 4 steps:
4406 * 1. Insert the name/value in the bucket (ocfs2_xattr_set_entry_in_bucket).
4407 * 2. Truncate the outside cluster (ocfs2_xattr_bucket_value_truncate_xs).
4408 * 3. Set the value in the outside cluster (ocfs2_xattr_bucket_set_value_outside).
4409 * 4. If the clusters for the new outside value can't be allocated, free the
4410 *    xattr we inserted in step 1.
4411 */
4412static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4413 struct ocfs2_xattr_info *xi,
4414 struct ocfs2_xattr_search *xs)
4415{
4416 int ret, local = 1;
4417 size_t value_len;
4418 char *val = (char *)xi->value;
4419 struct ocfs2_xattr_entry *xe = xs->here;
4420 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
4421 strlen(xi->name));
4422
4423 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
4424 /*
4425 * We need to truncate the xattr storage first.
4426 *
4427 * If both the old and new value are stored
4428 * outside the block, we only need to truncate
4429 * the storage and then set the value outside.
4430 *
4431 * If the new value will be stored within the block,
4432 * we free all the outside blocks first; the
4433 * modification to the xattr block is done by the
4434 * following steps.
4435 */
4436 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4437 value_len = xi->value_len;
4438 else
4439 value_len = 0;
4440
4441 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4442 value_len);
4443 if (ret)
4444 goto out;
4445
4446 if (value_len)
4447 goto set_value_outside;
4448 }
4449
4450 value_len = xi->value_len;
4451 /* So we have to handle the inside block change now. */
4452 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
4453 /*
4454 * If the new value will be stored outside of block,
4455 * initialize a new empty value root and insert it first.
4456 */
4457 local = 0;
4458 xi->value = &def_xv;
4459 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4460 }
4461
4462 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
4463 if (ret) {
4464 mlog_errno(ret);
4465 goto out;
4466 }
4467
4468 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
4469 goto out;
4470
4471 /* allocate the space now for the outside block storage. */
4472 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4473 value_len);
4474 if (ret) {
4475 mlog_errno(ret);
4476
4477 if (xs->not_found) {
4478 /*
4479 * We can't allocate enough clusters for outside
4480 * storage and we have allocated xattr already,
4481 * so need to remove it.
4482 */
4483 ocfs2_xattr_bucket_remove_xs(inode, xs);
4484 }
4485 goto out;
4486 }
4487
4488set_value_outside:
4489 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
4490out:
4491 return ret;
4492}
4493
4494/* Check whether the xattr bucket is filled entirely with entries of the same hash value. */
4495static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4496 struct ocfs2_xattr_bucket *bucket)
4497{
4498 struct ocfs2_xattr_header *xh = bucket->xh;
4499
4500 if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
4501 xh->xh_entries[0].xe_name_hash) {
4502 mlog(ML_ERROR, "Too many hash collisions in xattr bucket %llu, "
4503 "hash = %u\n",
4504 (unsigned long long)bucket->bhs[0]->b_blocknr,
4505 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4506 return -ENOSPC;
4507 }
4508
4509 return 0;
4510}
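
/*
 * Editor's note: the collision test above in miniature. Entries in a bucket
 * are sorted by hash, so the bucket cannot usefully be split once its first
 * and last entries share the same hash value.
 */
#include <stdint.h>

static int bucket_is_all_one_hash(const uint32_t *hashes, int count)
{
	return count > 0 && hashes[0] == hashes[count - 1];
}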
4511
4512static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4513 struct ocfs2_xattr_info *xi,
4514 struct ocfs2_xattr_search *xs)
4515{
4516 struct ocfs2_xattr_header *xh;
4517 struct ocfs2_xattr_entry *xe;
4518 u16 count, header_size, xh_free_start;
4519 int i, free, max_free, need, old;
4520 size_t value_size = 0, name_len = strlen(xi->name);
4521 size_t blocksize = inode->i_sb->s_blocksize;
4522 int ret, allocation = 0;
4523 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4524
4525 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4526
4527try_again:
4528 xh = xs->header;
4529 count = le16_to_cpu(xh->xh_count);
4530 xh_free_start = le16_to_cpu(xh->xh_free_start);
4531 header_size = sizeof(struct ocfs2_xattr_header) +
4532 count * sizeof(struct ocfs2_xattr_entry);
4533 max_free = OCFS2_XATTR_BUCKET_SIZE -
4534 le16_to_cpu(xh->xh_name_value_len) - header_size;
4535
4536 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4537 "of %u which exceed block size\n",
4538 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4539 header_size);
4540
4541 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4542 value_size = OCFS2_XATTR_ROOT_SIZE;
4543 else if (xi->value)
4544 value_size = OCFS2_XATTR_SIZE(xi->value_len);
4545
4546 if (xs->not_found)
4547 need = sizeof(struct ocfs2_xattr_entry) +
4548 OCFS2_XATTR_SIZE(name_len) + value_size;
4549 else {
4550 need = value_size + OCFS2_XATTR_SIZE(name_len);
4551
4552 /*
4553 * We only replace the old value if the new length is smaller
4554 * than the old one. Otherwise we will allocate new space in the
4555 * bucket to store it.
4556 */
4557 xe = xs->here;
4558 if (ocfs2_xattr_is_local(xe))
4559 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4560 else
4561 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4562
4563 if (old >= value_size)
4564 need = 0;
4565 }
4566
4567 free = xh_free_start - header_size;
4568 /*
4569 * We need to make sure the new name/value pair
4570 * fits within a single block.
4571 */
4572 if (xh_free_start % blocksize < need)
4573 free -= xh_free_start % blocksize;
4574
4575 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4576 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4577 " %u\n", xs->not_found,
4578 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4579 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4580 le16_to_cpu(xh->xh_name_value_len));
4581
4582 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4583 if (need <= max_free &&
4584 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4585 /*
4586 * We can create the space by defragmenting. Since only
4587 * the name/value pairs will be moved, the xe in xs
4588 * shouldn't be changed.
4589 */
4590 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
4591 if (ret) {
4592 mlog_errno(ret);
4593 goto out;
4594 }
4595
4596 xh_free_start = le16_to_cpu(xh->xh_free_start);
4597 free = xh_free_start - header_size;
4598 if (xh_free_start % blocksize < need)
4599 free -= xh_free_start % blocksize;
4600
4601 if (free >= need)
4602 goto xattr_set;
4603
4604 mlog(0, "Can't get enough space for xattr insert by "
4605      "defragmentation. Need %u bytes, but we have %d, so "
4606 "allocate new bucket for it.\n", need, free);
4607 }
4608
4609 /*
4610 * We have to add new buckets or clusters and one
4611 * allocation should leave us enough space for insert.
4612 */
4613 BUG_ON(allocation);
4614
4615 /*
4616 * We do not allow hash ranges to overlap between buckets, so
4617 * the maximum number of collisions we can absorb is one
4618 * bucket's worth. Check here whether we need to add a new
4619 * bucket for the insert.
4620 */
4621 ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
4622 if (ret) {
4623 mlog_errno(ret);
4624 goto out;
4625 }
4626
4627 ret = ocfs2_add_new_xattr_bucket(inode,
4628 xs->xattr_bh,
4629 xs->bucket.bhs[0]);
4630 if (ret) {
4631 mlog_errno(ret);
4632 goto out;
4633 }
4634
4635 for (i = 0; i < blk_per_bucket; i++)
4636 brelse(xs->bucket.bhs[i]);
4637
4638 memset(&xs->bucket, 0, sizeof(xs->bucket));
4639
4640 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4641 xi->name_index,
4642 xi->name, xs);
4643 if (ret && ret != -ENODATA)
4644 goto out;
4645 xs->not_found = ret;
4646 allocation = 1;
4647 goto try_again;
4648 }
4649
4650xattr_set:
4651 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
4652out:
4653 mlog_exit(ret);
4654 return ret;
4655}
4656
4657static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4658 struct ocfs2_xattr_bucket *bucket,
4659 void *para)
4660{
4661 int ret = 0;
4662 struct ocfs2_xattr_header *xh = bucket->xh;
4663 u16 i;
4664 struct ocfs2_xattr_entry *xe;
4665
4666 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4667 xe = &xh->xh_entries[i];
4668 if (ocfs2_xattr_is_local(xe))
4669 continue;
4670
4671 ret = ocfs2_xattr_bucket_value_truncate(inode,
4672 bucket->bhs[0],
4673 i, 0);
4674 if (ret) {
4675 mlog_errno(ret);
4676 break;
4677 }
4678 }
4679
4680 return ret;
4681}
4682
4683static int ocfs2_delete_xattr_index_block(struct inode *inode,
4684 struct buffer_head *xb_bh)
4685{
4686 struct ocfs2_xattr_block *xb =
4687 (struct ocfs2_xattr_block *)xb_bh->b_data;
4688 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
4689 int ret = 0;
4690 u32 name_hash = UINT_MAX, e_cpos, num_clusters;
4691 u64 p_blkno;
4692
4693 if (le16_to_cpu(el->l_next_free_rec) == 0)
4694 return 0;
4695
4696 while (name_hash > 0) {
4697 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
4698 &e_cpos, &num_clusters, el);
4699 if (ret) {
4700 mlog_errno(ret);
4701 goto out;
4702 }
4703
4704 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
4705 ocfs2_delete_xattr_in_bucket,
4706 NULL);
4707 if (ret) {
4708 mlog_errno(ret);
4709 goto out;
4710 }
4711
4712 ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
4713 p_blkno, e_cpos, num_clusters);
4714 if (ret) {
4715 mlog_errno(ret);
4716 break;
4717 }
4718
4719 if (e_cpos == 0)
4720 break;
4721
4722 name_hash = e_cpos - 1;
4723 }
4724
4725out:
4726 return ret;
4727}
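
/*
 * Editor's note: a sketch of the reverse walk above. Extent records are
 * visited from the highest possible hash downwards; after removing the
 * record that starts at e_cpos, the next lookup key is e_cpos - 1, and the
 * walk stops once the record at cpos 0 is gone. lookup_start is a
 * hypothetical stand-in for ocfs2_xattr_get_rec.
 */
static void reverse_walk(unsigned int (*lookup_start)(unsigned int key))
{
	unsigned int name_hash = 0xffffffffu;	/* UINT_MAX */

	while (name_hash > 0) {
		unsigned int e_cpos = lookup_start(name_hash);

		/* ... remove the record that starts at e_cpos ... */
		if (e_cpos == 0)
			break;
		name_hash = e_cpos - 1;
	}
}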
4728
4729/*
4730 * 'trusted' attributes support
4731 */
4732
4733#define XATTR_TRUSTED_PREFIX "trusted."
4734
4735static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
4736 size_t list_size, const char *name,
4737 size_t name_len)
4738{
4739 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
4740 const size_t total_len = prefix_len + name_len + 1;
4741
4742 if (list && total_len <= list_size) {
4743 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
4744 memcpy(list + prefix_len, name, name_len);
4745 list[prefix_len + name_len] = '\0';
4746 }
4747 return total_len;
4748}
4749
4750static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
4751 void *buffer, size_t size)
4752{
4753 if (strcmp(name, "") == 0)
4754 return -EINVAL;
4755 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
4756 buffer, size);
4757}
4758
4759static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
4760 const void *value, size_t size, int flags)
4761{
4762 if (strcmp(name, "") == 0)
4763 return -EINVAL;
4764
4765 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
4766 size, flags);
4767}
4768
4769struct xattr_handler ocfs2_xattr_trusted_handler = {
4770 .prefix = XATTR_TRUSTED_PREFIX,
4771 .list = ocfs2_xattr_trusted_list,
4772 .get = ocfs2_xattr_trusted_get,
4773 .set = ocfs2_xattr_trusted_set,
4774};
4775
4776
4777/*
4778 * 'user' attributes support
4779 */
4780
4781#define XATTR_USER_PREFIX "user."
4782
4783static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
4784 size_t list_size, const char *name,
4785 size_t name_len)
4786{
4787 const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
4788 const size_t total_len = prefix_len + name_len + 1;
4789 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4790
4791 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4792 return 0;
4793
4794 if (list && total_len <= list_size) {
4795 memcpy(list, XATTR_USER_PREFIX, prefix_len);
4796 memcpy(list + prefix_len, name, name_len);
4797 list[prefix_len + name_len] = '\0';
4798 }
4799 return total_len;
4800}
4801
4802static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
4803 void *buffer, size_t size)
4804{
4805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4806
4807 if (strcmp(name, "") == 0)
4808 return -EINVAL;
4809 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4810 return -EOPNOTSUPP;
4811 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
4812 buffer, size);
4813}
4814
4815static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
4816 const void *value, size_t size, int flags)
4817{
4818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4819
4820 if (strcmp(name, "") == 0)
4821 return -EINVAL;
4822 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4823 return -EOPNOTSUPP;
4824
4825 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
4826 size, flags);
4827}
4828
4829struct xattr_handler ocfs2_xattr_user_handler = {
4830 .prefix = XATTR_USER_PREFIX,
4831 .list = ocfs2_xattr_user_list,
4832 .get = ocfs2_xattr_user_get,
4833 .set = ocfs2_xattr_user_set,
4834};
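
/*
 * Editor's note: the two ->list callbacks above share one pattern; a generic
 * user-space sketch, with hypothetical names:
 */
#include <string.h>

static size_t emit_prefixed_name(char *list, size_t list_size,
				 const char *prefix, size_t prefix_len,
				 const char *name, size_t name_len)
{
	size_t total_len = prefix_len + name_len + 1;

	/* Always report the length; copy only when the buffer fits. */
	if (list && total_len <= list_size) {
		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len);
		list[prefix_len + name_len] = '\0';
	}
	return total_len;
}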
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 000000000000..c25c7c62a059
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_XATTR_H
27#define OCFS2_XATTR_H
28
29#include <linux/init.h>
30#include <linux/xattr.h>
31
32enum ocfs2_xattr_type {
33 OCFS2_XATTR_INDEX_USER = 1,
34 OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
35 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
36 OCFS2_XATTR_INDEX_TRUSTED,
37 OCFS2_XATTR_INDEX_SECURITY,
38 OCFS2_XATTR_MAX
39};
40
41extern struct xattr_handler ocfs2_xattr_user_handler;
42extern struct xattr_handler ocfs2_xattr_trusted_handler;
43
44extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
45extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
46extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
47 size_t, int);
48extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
49extern struct xattr_handler *ocfs2_xattr_handlers[];
50
51static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
52{
53 return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
54}
55
56static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
57{
58 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
59}
60
61static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
62{
63 u16 len = sb->s_blocksize -
64 offsetof(struct ocfs2_xattr_header, xh_entries);
65
66 return len / sizeof(struct ocfs2_xattr_entry);
67}
68#endif /* OCFS2_XATTR_H */
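
/*
 * Editor's note: worked numbers for the two inline helpers above, as a
 * user-space sketch under assumed sizes (4K blocks, 32K clusters, 4K
 * buckets): 32K / 4K = 8 buckets per cluster, 4K / 4K = 1 block per bucket.
 */
#include <stdio.h>

static void bucket_geometry_example(void)
{
	unsigned int clustersize = 32768, blocksize = 4096, bucket = 4096;

	printf("buckets per cluster = %u\n", clustersize / bucket); /* 8 */
	printf("blocks per bucket   = %u\n", bucket / blocksize);   /* 1 */
}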
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index 697663b01bae..e1c0ec0ae989 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -92,7 +92,7 @@ int omfs_allocate_block(struct super_block *sb, u64 block)
 	struct buffer_head *bh;
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	int bits_per_entry = 8 * sb->s_blocksize;
-	int map, bit;
+	unsigned int map, bit;
 	int ret = 0;
 	u64 tmp;
 
@@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count)
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	int bits_per_entry = 8 * sb->s_blocksize;
 	u64 tmp;
-	int map, bit, ret;
+	unsigned int map, bit;
+	int ret;
 
 	tmp = block;
 	bit = do_div(tmp, bits_per_entry);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 7e2499053e4d..834b2331f6b3 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry,
 	return err ? -EIO : 0;
 }
 
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+	return (sbi->s_sys_blocksize - offset -
+		sizeof(struct omfs_extent)) /
+		sizeof(struct omfs_extent_entry) + 1;
+}
+
 void omfs_make_empty_table(struct buffer_head *bh, int offset)
 {
 	struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
@@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode)
 	struct buffer_head *bh;
 	u64 next, last;
 	u32 extent_count;
+	u32 max_extents;
 	int ret;
 
 	/* traverse extent table, freeing each entry that is greater
@@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode)
 		goto out;
 
 	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
 
 	for (;;) {
 
-		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) {
-			brelse(bh);
-			goto out;
-		}
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
 
 		extent_count = be32_to_cpu(oe->e_extent_count);
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
 		last = next;
 		next = be64_to_cpu(oe->e_next);
 		entry = &oe->e_entry;
@@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode)
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
 	}
 	ret = 0;
 out:
 	return ret;
+out_brelse:
+	brelse(bh);
+	return ret;
 }
 
 static void omfs_truncate(struct inode *inode)
@@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
 			goto out;
 		}
 	}
-	max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START -
-		sizeof(struct omfs_extent)) /
-		sizeof(struct omfs_extent_entry) + 1;
+	max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
 
 	/* TODO: add a continuation block here */
 	if (be32_to_cpu(oe->e_extent_count) > max_count-1)
@@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
225 sector_t next, offset; 238 sector_t next, offset;
226 int ret; 239 int ret;
227 u64 new_block; 240 u64 new_block;
241 u32 max_extents;
228 int extent_count; 242 int extent_count;
229 struct omfs_extent *oe; 243 struct omfs_extent *oe;
230 struct omfs_extent_entry *entry; 244 struct omfs_extent_entry *entry;
@@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
238 goto out; 252 goto out;
239 253
240 oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); 254 oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
255 max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
241 next = inode->i_ino; 256 next = inode->i_ino;
242 257
243 for (;;) { 258 for (;;) {
@@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block,
249 next = be64_to_cpu(oe->e_next); 264 next = be64_to_cpu(oe->e_next);
250 entry = &oe->e_entry; 265 entry = &oe->e_entry;
251 266
267 if (extent_count > max_extents)
268 goto out_brelse;
269
252 offset = find_block(inode, entry, block, extent_count, &remain); 270 offset = find_block(inode, entry, block, extent_count, &remain);
253 if (offset > 0) { 271 if (offset > 0) {
254 ret = 0; 272 ret = 0;
@@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
266 if (!bh) 284 if (!bh)
267 goto out; 285 goto out;
268 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); 286 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
287 max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
269 } 288 }
270 if (create) { 289 if (create) {
271 ret = omfs_grow_extent(inode, oe, &new_block); 290 ret = omfs_grow_extent(inode, oe, &new_block);
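The new omfs_max_extents() helper caps how many extent entries a system block can legally hold: everything after the header offset, minus the extent header itself, divided by the entry size, plus one for the terminator. Checking the on-disk e_extent_count against this bound keeps the entry walks in omfs_shrink_inode() and omfs_get_block() from running past the end of the buffer when the count is corrupt. A user-space sketch of the arithmetic, with stand-in struct layouts, a made-up header offset, and a 2048-byte system block (all assumptions; the kernel uses the real struct omfs_extent definitions, OMFS_EXTENT_START, and sbi->s_sys_blocksize):

#include <stdio.h>

/* Stand-ins for the on-disk structures; field layout is illustrative only. */
struct omfs_extent_entry { unsigned long long e_cluster, e_blocks; };
struct omfs_extent {
	unsigned long long e_next;
	unsigned int e_extent_count;
	struct omfs_extent_entry e_entry;	/* first entry, inline */
};

static unsigned int max_extents(unsigned int sys_blocksize, unsigned int offset)
{
	return (sys_blocksize - offset - sizeof(struct omfs_extent)) /
		sizeof(struct omfs_extent_entry) + 1;
}

int main(void)
{
	/* an on-disk extent count above this bound indicates corruption */
	printf("at most %u extent entries per block\n", max_extents(2048, 128));
	return 0;
}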
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index a95fe5984f4b..cbf047a847c5 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
232 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); 232 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
233 inode->i_op = &omfs_dir_inops; 233 inode->i_op = &omfs_dir_inops;
234 inode->i_fop = &omfs_dir_operations; 234 inode->i_fop = &omfs_dir_operations;
235 inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + 235 inode->i_size = sbi->s_sys_blocksize;
236 sizeof(struct omfs_header);
237 inc_nlink(inode); 236 inc_nlink(inode);
238 break; 237 break;
239 case OMFS_FILE: 238 case OMFS_FILE:
@@ -347,7 +346,7 @@ enum {
347 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask 346 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
348}; 347};
349 348
350static match_table_t tokens = { 349static const match_table_t tokens = {
351 {Opt_uid, "uid=%u"}, 350 {Opt_uid, "uid=%u"},
352 {Opt_gid, "gid=%u"}, 351 {Opt_gid, "gid=%u"},
353 {Opt_umask, "umask=%o"}, 352 {Opt_umask, "umask=%o"},
diff --git a/fs/open.c b/fs/open.c
index 07da9359481c..5596049863bf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1141,8 +1141,7 @@ EXPORT_SYMBOL(sys_close);
1141asmlinkage long sys_vhangup(void) 1141asmlinkage long sys_vhangup(void)
1142{ 1142{
1143 if (capable(CAP_SYS_TTY_CONFIG)) { 1143 if (capable(CAP_SYS_TTY_CONFIG)) {
1144 /* XXX: this needs locking */ 1144 tty_vhangup_self();
1145 tty_vhangup(current->signal->tty);
1146 return 0; 1145 return 0;
1147 } 1146 }
1148 return -EPERM; 1147 return -EPERM;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201db..7408227c49c9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
120 * a pointer to that same buffer (for convenience). 120 * a pointer to that same buffer (for convenience).
121 */ 121 */
122 122
123char *disk_name(struct gendisk *hd, int part, char *buf) 123char *disk_name(struct gendisk *hd, int partno, char *buf)
124{ 124{
125 if (!part) 125 if (!partno)
126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); 126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) 127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); 128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
129 else 129 else
130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); 130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
131 131
132 return buf; 132 return buf;
133} 133}
134 134
135const char *bdevname(struct block_device *bdev, char *buf) 135const char *bdevname(struct block_device *bdev, char *buf)
136{ 136{
137 int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; 137 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
138 return disk_name(bdev->bd_disk, part, buf);
139} 138}
140 139
141EXPORT_SYMBOL(bdevname); 140EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
169 if (isdigit(state->name[strlen(state->name)-1])) 168 if (isdigit(state->name[strlen(state->name)-1]))
170 sprintf(state->name, "p"); 169 sprintf(state->name, "p");
171 170
172 state->limit = hd->minors; 171 state->limit = disk_max_parts(hd);
173 i = res = err = 0; 172 i = res = err = 0;
174 while (!res && check_part[i]) { 173 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 174 memset(&state->parts, 0, sizeof(state->parts));
@@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev,
204 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); 203 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
205} 204}
206 205
207static ssize_t part_size_show(struct device *dev, 206ssize_t part_size_show(struct device *dev,
208 struct device_attribute *attr, char *buf) 207 struct device_attribute *attr, char *buf)
209{ 208{
210 struct hd_struct *p = dev_to_part(dev); 209 struct hd_struct *p = dev_to_part(dev);
211 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 210 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
212} 211}
213 212
214static ssize_t part_stat_show(struct device *dev, 213ssize_t part_stat_show(struct device *dev,
215 struct device_attribute *attr, char *buf) 214 struct device_attribute *attr, char *buf)
216{ 215{
217 struct hd_struct *p = dev_to_part(dev); 216 struct hd_struct *p = dev_to_part(dev);
217 int cpu;
218 218
219 preempt_disable(); 219 cpu = part_stat_lock();
220 part_round_stats(p); 220 part_round_stats(cpu, p);
221 preempt_enable(); 221 part_stat_unlock();
222 return sprintf(buf, 222 return sprintf(buf,
223 "%8lu %8lu %8llu %8u " 223 "%8lu %8lu %8llu %8u "
224 "%8lu %8lu %8llu %8u " 224 "%8lu %8lu %8llu %8u "
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
238} 238}
239 239
240#ifdef CONFIG_FAIL_MAKE_REQUEST 240#ifdef CONFIG_FAIL_MAKE_REQUEST
241static ssize_t part_fail_show(struct device *dev, 241ssize_t part_fail_show(struct device *dev,
242 struct device_attribute *attr, char *buf) 242 struct device_attribute *attr, char *buf)
243{ 243{
244 struct hd_struct *p = dev_to_part(dev); 244 struct hd_struct *p = dev_to_part(dev);
245 245
246 return sprintf(buf, "%d\n", p->make_it_fail); 246 return sprintf(buf, "%d\n", p->make_it_fail);
247} 247}
248 248
249static ssize_t part_fail_store(struct device *dev, 249ssize_t part_fail_store(struct device *dev,
250 struct device_attribute *attr, 250 struct device_attribute *attr,
251 const char *buf, size_t count) 251 const char *buf, size_t count)
252{ 252{
253 struct hd_struct *p = dev_to_part(dev); 253 struct hd_struct *p = dev_to_part(dev);
254 int i; 254 int i;
@@ -300,40 +300,34 @@ struct device_type part_type = {
300 .release = part_release, 300 .release = part_release,
301}; 301};
302 302
303static inline void partition_sysfs_add_subdir(struct hd_struct *p) 303static void delete_partition_rcu_cb(struct rcu_head *head)
304{
305 struct kobject *k;
306
307 k = kobject_get(&p->dev.kobj);
308 p->holder_dir = kobject_create_and_add("holders", k);
309 kobject_put(k);
310}
311
312static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
313{ 304{
314 struct kobject *k; 305 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
315 306
316 k = kobject_get(&disk->dev.kobj); 307 part->start_sect = 0;
317 disk->holder_dir = kobject_create_and_add("holders", k); 308 part->nr_sects = 0;
318 disk->slave_dir = kobject_create_and_add("slaves", k); 309 part_stat_set_all(part, 0);
319 kobject_put(k); 310 put_device(part_to_dev(part));
320} 311}
321 312
322void delete_partition(struct gendisk *disk, int part) 313void delete_partition(struct gendisk *disk, int partno)
323{ 314{
324 struct hd_struct *p = disk->part[part-1]; 315 struct disk_part_tbl *ptbl = disk->part_tbl;
316 struct hd_struct *part;
325 317
326 if (!p) 318 if (partno >= ptbl->len)
327 return; 319 return;
328 if (!p->nr_sects) 320
321 part = ptbl->part[partno];
322 if (!part)
329 return; 323 return;
330 disk->part[part-1] = NULL; 324
331 p->start_sect = 0; 325 blk_free_devt(part_devt(part));
332 p->nr_sects = 0; 326 rcu_assign_pointer(ptbl->part[partno], NULL);
333 part_stat_set_all(p, 0); 327 kobject_put(part->holder_dir);
334 kobject_put(p->holder_dir); 328 device_del(part_to_dev(part));
335 device_del(&p->dev); 329
336 put_device(&p->dev); 330 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
337} 331}
338 332
339static ssize_t whole_disk_show(struct device *dev, 333static ssize_t whole_disk_show(struct device *dev,
@@ -344,102 +338,132 @@ static ssize_t whole_disk_show(struct device *dev,
344static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, 338static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
345 whole_disk_show, NULL); 339 whole_disk_show, NULL);
346 340
347int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) 341int add_partition(struct gendisk *disk, int partno,
342 sector_t start, sector_t len, int flags)
348{ 343{
349 struct hd_struct *p; 344 struct hd_struct *p;
345 dev_t devt = MKDEV(0, 0);
346 struct device *ddev = disk_to_dev(disk);
347 struct device *pdev;
348 struct disk_part_tbl *ptbl;
349 const char *dname;
350 int err; 350 int err;
351 351
352 err = disk_expand_part_tbl(disk, partno);
353 if (err)
354 return err;
355 ptbl = disk->part_tbl;
356
357 if (ptbl->part[partno])
358 return -EBUSY;
359
352 p = kzalloc(sizeof(*p), GFP_KERNEL); 360 p = kzalloc(sizeof(*p), GFP_KERNEL);
353 if (!p) 361 if (!p)
354 return -ENOMEM; 362 return -ENOMEM;
355 363
356 if (!init_part_stats(p)) { 364 if (!init_part_stats(p)) {
357 err = -ENOMEM; 365 err = -ENOMEM;
358 goto out0; 366 goto out_free;
359 } 367 }
368 pdev = part_to_dev(p);
369
360 p->start_sect = start; 370 p->start_sect = start;
361 p->nr_sects = len; 371 p->nr_sects = len;
362 p->partno = part; 372 p->partno = partno;
363 p->policy = disk->policy; 373 p->policy = get_disk_ro(disk);
364 374
365 if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) 375 dname = dev_name(ddev);
366 snprintf(p->dev.bus_id, BUS_ID_SIZE, 376 if (isdigit(dname[strlen(dname) - 1]))
367 "%sp%d", disk->dev.bus_id, part); 377 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
368 else 378 else
369 snprintf(p->dev.bus_id, BUS_ID_SIZE, 379 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
370 "%s%d", disk->dev.bus_id, part); 380
381 device_initialize(pdev);
382 pdev->class = &block_class;
383 pdev->type = &part_type;
384 pdev->parent = ddev;
371 385
372 device_initialize(&p->dev); 386 err = blk_alloc_devt(p, &devt);
373 p->dev.devt = MKDEV(disk->major, disk->first_minor + part); 387 if (err)
374 p->dev.class = &block_class; 388 goto out_free;
375 p->dev.type = &part_type; 389 pdev->devt = devt;
376 p->dev.parent = &disk->dev;
377 disk->part[part-1] = p;
378 390
379 /* delay uevent until 'holders' subdir is created */ 391 /* delay uevent until 'holders' subdir is created */
380 p->dev.uevent_suppress = 1; 392 pdev->uevent_suppress = 1;
381 err = device_add(&p->dev); 393 err = device_add(pdev);
382 if (err) 394 if (err)
383 goto out1; 395 goto out_put;
384 partition_sysfs_add_subdir(p); 396
385 p->dev.uevent_suppress = 0; 397 err = -ENOMEM;
398 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
399 if (!p->holder_dir)
400 goto out_del;
401
402 pdev->uevent_suppress = 0;
386 if (flags & ADDPART_FLAG_WHOLEDISK) { 403 if (flags & ADDPART_FLAG_WHOLEDISK) {
387 err = device_create_file(&p->dev, &dev_attr_whole_disk); 404 err = device_create_file(pdev, &dev_attr_whole_disk);
388 if (err) 405 if (err)
389 goto out2; 406 goto out_del;
390 } 407 }
391 408
409 /* everything is up and running, commence */
410 INIT_RCU_HEAD(&p->rcu_head);
411 rcu_assign_pointer(ptbl->part[partno], p);
412
392 /* suppress uevent if the disk suppresses it */ 413 /* suppress uevent if the disk suppresses it */
393 if (!disk->dev.uevent_suppress) 414 if (!ddev->uevent_suppress)
394 kobject_uevent(&p->dev.kobj, KOBJ_ADD); 415 kobject_uevent(&pdev->kobj, KOBJ_ADD);
395 416
396 return 0; 417 return 0;
397 418
398out2: 419out_free:
399 device_del(&p->dev);
400out1:
401 put_device(&p->dev);
402 free_part_stats(p);
403out0:
404 kfree(p); 420 kfree(p);
405 return err; 421 return err;
422out_del:
423 kobject_put(p->holder_dir);
424 device_del(pdev);
425out_put:
426 put_device(pdev);
427 blk_free_devt(devt);
428 return err;
406} 429}
407 430
408/* Not exported, helper to add_disk(). */ 431/* Not exported, helper to add_disk(). */
409void register_disk(struct gendisk *disk) 432void register_disk(struct gendisk *disk)
410{ 433{
434 struct device *ddev = disk_to_dev(disk);
411 struct block_device *bdev; 435 struct block_device *bdev;
436 struct disk_part_iter piter;
437 struct hd_struct *part;
412 char *s; 438 char *s;
413 int i;
414 struct hd_struct *p;
415 int err; 439 int err;
416 440
417 disk->dev.parent = disk->driverfs_dev; 441 ddev->parent = disk->driverfs_dev;
418 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
419 442
420 strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); 443 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
421 /* ewww... some of these buggers have / in the name... */ 444 /* ewww... some of these buggers have / in the name... */
422 s = strchr(disk->dev.bus_id, '/'); 445 s = strchr(ddev->bus_id, '/');
423 if (s) 446 if (s)
424 *s = '!'; 447 *s = '!';
425 448
426 /* delay uevents until the partition table has been scanned */ 449 /* delay uevents until the partition table has been scanned */
427 disk->dev.uevent_suppress = 1; 450 ddev->uevent_suppress = 1;
428 451
429 if (device_add(&disk->dev)) 452 if (device_add(ddev))
430 return; 453 return;
431#ifndef CONFIG_SYSFS_DEPRECATED 454#ifndef CONFIG_SYSFS_DEPRECATED
432 err = sysfs_create_link(block_depr, &disk->dev.kobj, 455 err = sysfs_create_link(block_depr, &ddev->kobj,
433 kobject_name(&disk->dev.kobj)); 456 kobject_name(&ddev->kobj));
434 if (err) { 457 if (err) {
435 device_del(&disk->dev); 458 device_del(ddev);
436 return; 459 return;
437 } 460 }
438#endif 461#endif
439 disk_sysfs_add_subdirs(disk); 462 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
463 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
440 464
441 /* No minors to use for partitions */ 465 /* No minors to use for partitions */
442 if (disk->minors == 1) 466 if (!disk_partitionable(disk))
443 goto exit; 467 goto exit;
444 468
445 /* No such device (e.g., media were just removed) */ 469 /* No such device (e.g., media were just removed) */
@@ -458,50 +482,66 @@ void register_disk(struct gendisk *disk)
458 482
459exit: 483exit:
460 /* announce disk after possible partitions are created */ 484 /* announce disk after possible partitions are created */
461 disk->dev.uevent_suppress = 0; 485 ddev->uevent_suppress = 0;
462 kobject_uevent(&disk->dev.kobj, KOBJ_ADD); 486 kobject_uevent(&ddev->kobj, KOBJ_ADD);
463 487
464 /* announce possible partitions */ 488 /* announce possible partitions */
465 for (i = 1; i < disk->minors; i++) { 489 disk_part_iter_init(&piter, disk, 0);
466 p = disk->part[i-1]; 490 while ((part = disk_part_iter_next(&piter)))
467 if (!p || !p->nr_sects) 491 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
468 continue; 492 disk_part_iter_exit(&piter);
469 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
470 }
471} 493}
472 494
473int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 495int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
474{ 496{
497 struct disk_part_iter piter;
498 struct hd_struct *part;
475 struct parsed_partitions *state; 499 struct parsed_partitions *state;
476 int p, res; 500 int p, highest, res;
477 501
478 if (bdev->bd_part_count) 502 if (bdev->bd_part_count)
479 return -EBUSY; 503 return -EBUSY;
480 res = invalidate_partition(disk, 0); 504 res = invalidate_partition(disk, 0);
481 if (res) 505 if (res)
482 return res; 506 return res;
483 bdev->bd_invalidated = 0; 507
484 for (p = 1; p < disk->minors; p++) 508 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
485 delete_partition(disk, p); 509 while ((part = disk_part_iter_next(&piter)))
510 delete_partition(disk, part->partno);
511 disk_part_iter_exit(&piter);
512
486 if (disk->fops->revalidate_disk) 513 if (disk->fops->revalidate_disk)
487 disk->fops->revalidate_disk(disk); 514 disk->fops->revalidate_disk(disk);
515 check_disk_size_change(disk, bdev);
516 bdev->bd_invalidated = 0;
488 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 517 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
489 return 0; 518 return 0;
490 if (IS_ERR(state)) /* I/O error reading the partition table */ 519 if (IS_ERR(state)) /* I/O error reading the partition table */
491 return -EIO; 520 return -EIO;
492 521
493 /* tell userspace that the media / partition table may have changed */ 522 /* tell userspace that the media / partition table may have changed */
494 kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); 523 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
495 524
525 /* Detect the highest partition number and preallocate
526 * disk->part_tbl. This is an optimization and not strictly
527 * necessary.
528 */
529 for (p = 1, highest = 0; p < state->limit; p++)
530 if (state->parts[p].size)
531 highest = p;
532
533 disk_expand_part_tbl(disk, highest);
534
535 /* add partitions */
496 for (p = 1; p < state->limit; p++) { 536 for (p = 1; p < state->limit; p++) {
497 sector_t size = state->parts[p].size; 537 sector_t size = state->parts[p].size;
498 sector_t from = state->parts[p].from; 538 sector_t from = state->parts[p].from;
499 if (!size) 539 if (!size)
500 continue; 540 continue;
501 if (from + size > get_capacity(disk)) { 541 if (from + size > get_capacity(disk)) {
502 printk(KERN_ERR " %s: p%d exceeds device capacity\n", 542 printk(KERN_WARNING
543 "%s: p%d exceeds device capacity\n",
503 disk->disk_name, p); 544 disk->disk_name, p);
504 continue;
505 } 545 }
506 res = add_partition(disk, p, from, size, state->parts[p].flags); 546 res = add_partition(disk, p, from, size, state->parts[p].flags);
507 if (res) { 547 if (res) {
@@ -541,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector);
541 581
542void del_gendisk(struct gendisk *disk) 582void del_gendisk(struct gendisk *disk)
543{ 583{
544 int p; 584 struct disk_part_iter piter;
585 struct hd_struct *part;
545 586
546 /* invalidate stuff */ 587 /* invalidate stuff */
547 for (p = disk->minors - 1; p > 0; p--) { 588 disk_part_iter_init(&piter, disk,
548 invalidate_partition(disk, p); 589 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
549 delete_partition(disk, p); 590 while ((part = disk_part_iter_next(&piter))) {
591 invalidate_partition(disk, part->partno);
592 delete_partition(disk, part->partno);
550 } 593 }
594 disk_part_iter_exit(&piter);
595
551 invalidate_partition(disk, 0); 596 invalidate_partition(disk, 0);
552 disk->capacity = 0; 597 blk_free_devt(disk_to_dev(disk)->devt);
598 set_capacity(disk, 0);
553 disk->flags &= ~GENHD_FL_UP; 599 disk->flags &= ~GENHD_FL_UP;
554 unlink_gendisk(disk); 600 unlink_gendisk(disk);
555 disk_stat_set_all(disk, 0); 601 part_stat_set_all(&disk->part0, 0);
556 disk->stamp = 0; 602 disk->part0.stamp = 0;
557 603
558 kobject_put(disk->holder_dir); 604 kobject_put(disk->part0.holder_dir);
559 kobject_put(disk->slave_dir); 605 kobject_put(disk->slave_dir);
560 disk->driverfs_dev = NULL; 606 disk->driverfs_dev = NULL;
561#ifndef CONFIG_SYSFS_DEPRECATED 607#ifndef CONFIG_SYSFS_DEPRECATED
562 sysfs_remove_link(block_depr, disk->dev.bus_id); 608 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
563#endif 609#endif
564 device_del(&disk->dev); 610 device_del(disk_to_dev(disk));
565} 611}
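register_disk(), rescan_partitions() and del_gendisk() all switch from looping over minor numbers to the new disk_part_iter interface, which walks the RCU-protected partition table directly. The same idiom sketched in isolation, assuming the genhd helpers this patch series introduces:

#include <linux/genhd.h>
#include <linux/kernel.h>

static void print_partitions(struct gendisk *disk)
{
	struct disk_part_iter piter;
	struct hd_struct *part;

	/* DISK_PITER_INCL_EMPTY also visits zero-length partitions;
	 * DISK_PITER_REVERSE would walk the table backwards. */
	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
	while ((part = disk_part_iter_next(&piter)))
		pr_info("p%d: %llu sectors\n", part->partno,
			(unsigned long long)part->nr_sects);
	disk_part_iter_exit(&piter);
}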
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
5 * add_gd_partition adds a partitions details to the devices partition 5 * add_gd_partition adds a partitions details to the devices partition
6 * description. 6 * description.
7 */ 7 */
8enum { MAX_PART = 256 };
9
10struct parsed_partitions { 8struct parsed_partitions {
11 char name[BDEVNAME_SIZE]; 9 char name[BDEVNAME_SIZE];
12 struct { 10 struct {
13 sector_t from; 11 sector_t from;
14 sector_t size; 12 sector_t size;
15 int flags; 13 int flags;
16 } parts[MAX_PART]; 14 } parts[DISK_MAX_PARTS];
17 int next; 15 int next;
18 int limit; 16 int limit;
19}; 17};
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 73cd7a418f06..50f8f0600f06 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -57,3 +57,13 @@ config PROC_SYSCTL
57 As it is generally a good thing, you should say Y here unless 57 As it is generally a good thing, you should say Y here unless
58 building a kernel for install/rescue disks or your system is very 58 building a kernel for install/rescue disks or your system is very
59 limited in memory. 59 limited in memory.
60
61config PROC_PAGE_MONITOR
62 default y
63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED
65 help
66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
68 /proc/kpagecount, and /proc/kpageflags. Disabling these
69 interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..f4bc0e789539 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -86,11 +86,6 @@
86#include <asm/processor.h> 86#include <asm/processor.h>
87#include "internal.h" 87#include "internal.h"
88 88
89/* Gcc optimizes away "strlen(x)" for constant x */
90#define ADDBUF(buffer, string) \
91do { memcpy(buffer, string, strlen(string)); \
92 buffer += strlen(string); } while (0)
93
94static inline void task_name(struct seq_file *m, struct task_struct *p) 89static inline void task_name(struct seq_file *m, struct task_struct *p)
95{ 90{
96 int i; 91 int i;
@@ -261,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
261 sigemptyset(&ignored); 256 sigemptyset(&ignored);
262 sigemptyset(&caught); 257 sigemptyset(&caught);
263 258
264 rcu_read_lock();
265 if (lock_task_sighand(p, &flags)) { 259 if (lock_task_sighand(p, &flags)) {
266 pending = p->pending.signal; 260 pending = p->pending.signal;
267 shpending = p->signal->shared_pending.signal; 261 shpending = p->signal->shared_pending.signal;
@@ -272,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
272 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 266 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
273 unlock_task_sighand(p, &flags); 267 unlock_task_sighand(p, &flags);
274 } 268 }
275 rcu_read_unlock();
276 269
277 seq_printf(m, "Threads:\t%d\n", num_threads); 270 seq_printf(m, "Threads:\t%d\n", num_threads);
278 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); 271 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -337,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
337 return 0; 330 return 0;
338} 331}
339 332
340/*
341 * Use precise platform statistics if available:
342 */
343#ifdef CONFIG_VIRT_CPU_ACCOUNTING
344static cputime_t task_utime(struct task_struct *p)
345{
346 return p->utime;
347}
348
349static cputime_t task_stime(struct task_struct *p)
350{
351 return p->stime;
352}
353#else
354static cputime_t task_utime(struct task_struct *p)
355{
356 clock_t utime = cputime_to_clock_t(p->utime),
357 total = utime + cputime_to_clock_t(p->stime);
358 u64 temp;
359
360 /*
361 * Use CFS's precise accounting:
362 */
363 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
364
365 if (total) {
366 temp *= utime;
367 do_div(temp, total);
368 }
369 utime = (clock_t)temp;
370
371 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
372 return p->prev_utime;
373}
374
375static cputime_t task_stime(struct task_struct *p)
376{
377 clock_t stime;
378
379 /*
380 * Use CFS's precise accounting. (we subtract utime from
381 * the total, to make sure the total observed by userspace
382 * grows monotonically - apps rely on that):
383 */
384 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
385 cputime_to_clock_t(task_utime(p));
386
387 if (stime >= 0)
388 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
389
390 return p->prev_stime;
391}
392#endif
393
394static cputime_t task_gtime(struct task_struct *p)
395{
396 return p->gtime;
397}
398
399static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 333static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
400 struct pid *pid, struct task_struct *task, int whole) 334 struct pid *pid, struct task_struct *task, int whole)
401{ 335{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a28840b11b89..b5918ae8ca79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -148,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
148 return count; 148 return count;
149} 149}
150 150
151int maps_protect;
152EXPORT_SYMBOL(maps_protect);
153
154static struct fs_struct *get_fs_struct(struct task_struct *task) 151static struct fs_struct *get_fs_struct(struct task_struct *task)
155{ 152{
156 struct fs_struct *fs; 153 struct fs_struct *fs;
@@ -164,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
164 161
165static int get_nr_threads(struct task_struct *tsk) 162static int get_nr_threads(struct task_struct *tsk)
166{ 163{
167 /* Must be called with the rcu_read_lock held */
168 unsigned long flags; 164 unsigned long flags;
169 int count = 0; 165 int count = 0;
170 166
@@ -471,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
471 467
472 struct rlimit rlim[RLIM_NLIMITS]; 468 struct rlimit rlim[RLIM_NLIMITS];
473 469
474 rcu_read_lock(); 470 if (!lock_task_sighand(task, &flags))
475 if (!lock_task_sighand(task,&flags)) {
476 rcu_read_unlock();
477 return 0; 471 return 0;
478 }
479 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); 472 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
480 unlock_task_sighand(task, &flags); 473 unlock_task_sighand(task, &flags);
481 rcu_read_unlock();
482 474
483 /* 475 /*
484 * print the file header 476 * print the file header
@@ -2443,6 +2435,13 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2443} 2435}
2444#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2436#endif /* CONFIG_TASK_IO_ACCOUNTING */
2445 2437
2438static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2439 struct pid *pid, struct task_struct *task)
2440{
2441 seq_printf(m, "%08x\n", task->personality);
2442 return 0;
2443}
2444
2446/* 2445/*
2447 * Thread groups 2446 * Thread groups
2448 */ 2447 */
@@ -2459,6 +2458,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2459 REG("environ", S_IRUSR, environ), 2458 REG("environ", S_IRUSR, environ),
2460 INF("auxv", S_IRUSR, pid_auxv), 2459 INF("auxv", S_IRUSR, pid_auxv),
2461 ONE("status", S_IRUGO, pid_status), 2460 ONE("status", S_IRUGO, pid_status),
2461 ONE("personality", S_IRUSR, pid_personality),
2462 INF("limits", S_IRUSR, pid_limits), 2462 INF("limits", S_IRUSR, pid_limits),
2463#ifdef CONFIG_SCHED_DEBUG 2463#ifdef CONFIG_SCHED_DEBUG
2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -2794,6 +2794,7 @@ static const struct pid_entry tid_base_stuff[] = {
2794 REG("environ", S_IRUSR, environ), 2794 REG("environ", S_IRUSR, environ),
2795 INF("auxv", S_IRUSR, pid_auxv), 2795 INF("auxv", S_IRUSR, pid_auxv),
2796 ONE("status", S_IRUGO, pid_status), 2796 ONE("status", S_IRUGO, pid_status),
2797 ONE("personality", S_IRUSR, pid_personality),
2797 INF("limits", S_IRUSR, pid_limits), 2798 INF("limits", S_IRUSR, pid_limits),
2798#ifdef CONFIG_SCHED_DEBUG 2799#ifdef CONFIG_SCHED_DEBUG
2799 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2800 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -3088,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
3088 generic_fillattr(inode, stat); 3089 generic_fillattr(inode, stat);
3089 3090
3090 if (p) { 3091 if (p) {
3091 rcu_read_lock();
3092 stat->nlink += get_nr_threads(p); 3092 stat->nlink += get_nr_threads(p);
3093 rcu_read_unlock();
3094 put_task_struct(p); 3093 put_task_struct(p);
3095 } 3094 }
3096 3095
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 4fb81e9c94e3..7821589a17d5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -330,6 +330,7 @@ retry:
330 spin_lock(&proc_inum_lock); 330 spin_lock(&proc_inum_lock);
331 ida_remove(&proc_inum_ida, i); 331 ida_remove(&proc_inum_ida, i);
332 spin_unlock(&proc_inum_lock); 332 spin_unlock(&proc_inum_lock);
333 return 0;
333 } 334 }
334 return PROC_DYNAMIC_FIRST + i; 335 return PROC_DYNAMIC_FIRST + i;
335} 336}
@@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
546 547
547 for (tmp = dir->subdir; tmp; tmp = tmp->next) 548 for (tmp = dir->subdir; tmp; tmp = tmp->next)
548 if (strcmp(tmp->name, dp->name) == 0) { 549 if (strcmp(tmp->name, dp->name) == 0) {
549 printk(KERN_WARNING "proc_dir_entry '%s' already " 550 printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
550 "registered\n", dp->name); 551 dir->name, dp->name);
551 dump_stack(); 552 dump_stack();
552 break; 553 break;
553 } 554 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8bb03f056c28..c6b4fa7e3b49 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -342,7 +342,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
342 if (!pde->proc_fops) { 342 if (!pde->proc_fops) {
343 spin_unlock(&pde->pde_unload_lock); 343 spin_unlock(&pde->pde_unload_lock);
344 kfree(pdeo); 344 kfree(pdeo);
345 return rv; 345 return -EINVAL;
346 } 346 }
347 pde->pde_users++; 347 pde->pde_users++;
348 open = pde->proc_fops->open; 348 open = pde->proc_fops->open;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 442202314d53..3bfb7b8747b3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -45,8 +45,6 @@ do { \
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); 45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 46#endif
47 47
48extern int maps_protect;
49
50extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
51 struct pid *pid, struct task_struct *task); 49 struct pid *pid, struct task_struct *task);
52extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, 50extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 79ecd281d2cb..3f87d2632947 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 vma->vm_start,
57 vma->vm_end, 57 vma->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 vma->vm_pgoff << PAGE_SHIFT, 62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
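The %08llx / (loff_t) change here is a 32-bit overflow fix: vm_pgoff is an unsigned long, so on 32-bit kernels shifting it left by PAGE_SHIFT wraps for file offsets at or beyond 4 GiB; widening to 64 bits before the shift preserves the value. A user-space demonstration (assuming 4 KiB pages, so PAGE_SHIFT is 12):

#include <stdio.h>

int main(void)
{
	unsigned int pgoff = 0x00200000;	/* page offset => byte offset 8 GiB */

	unsigned int wrong = pgoff << 12;	/* wraps to 0 in 32 bits */
	unsigned long long right = (unsigned long long)pgoff << 12;

	printf("wrong %u, right %llu\n", wrong, right);
	return 0;
}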
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded969862960..b675a49c1823 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
24#include <linux/tty.h> 24#include <linux/tty.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/quicklist.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/ioport.h> 29#include <linux/ioport.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
@@ -67,7 +68,6 @@
67extern int get_hardware_list(char *); 68extern int get_hardware_list(char *);
68extern int get_stram_list(char *); 69extern int get_stram_list(char *);
69extern int get_exec_domain_list(char *); 70extern int get_exec_domain_list(char *);
70extern int get_dma_list(char *);
71 71
72static int proc_calc_metrics(char *page, char **start, off_t off, 72static int proc_calc_metrics(char *page, char **start, off_t off,
73 int count, int *eof, int len) 73 int count, int *eof, int len)
@@ -182,6 +182,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
182 "SReclaimable: %8lu kB\n" 182 "SReclaimable: %8lu kB\n"
183 "SUnreclaim: %8lu kB\n" 183 "SUnreclaim: %8lu kB\n"
184 "PageTables: %8lu kB\n" 184 "PageTables: %8lu kB\n"
185#ifdef CONFIG_QUICKLIST
186 "Quicklists: %8lu kB\n"
187#endif
185 "NFS_Unstable: %8lu kB\n" 188 "NFS_Unstable: %8lu kB\n"
186 "Bounce: %8lu kB\n" 189 "Bounce: %8lu kB\n"
187 "WritebackTmp: %8lu kB\n" 190 "WritebackTmp: %8lu kB\n"
@@ -214,6 +217,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
214 K(global_page_state(NR_SLAB_RECLAIMABLE)), 217 K(global_page_state(NR_SLAB_RECLAIMABLE)),
215 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 218 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
216 K(global_page_state(NR_PAGETABLE)), 219 K(global_page_state(NR_PAGETABLE)),
220#ifdef CONFIG_QUICKLIST
221 K(quicklist_total_size()),
222#endif
217 K(global_page_state(NR_UNSTABLE_NFS)), 223 K(global_page_state(NR_UNSTABLE_NFS)),
218 K(global_page_state(NR_BOUNCE)), 224 K(global_page_state(NR_BOUNCE)),
219 K(global_page_state(NR_WRITEBACK_TEMP)), 225 K(global_page_state(NR_WRITEBACK_TEMP)),
@@ -677,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
677 return proc_calc_metrics(page, start, off, count, eof, len); 683 return proc_calc_metrics(page, start, off, count, eof, len);
678} 684}
679 685
686#ifdef CONFIG_FILE_LOCKING
680static int locks_open(struct inode *inode, struct file *filp) 687static int locks_open(struct inode *inode, struct file *filp)
681{ 688{
682 return seq_open(filp, &locks_seq_operations); 689 return seq_open(filp, &locks_seq_operations);
@@ -688,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
688 .llseek = seq_lseek, 695 .llseek = seq_lseek,
689 .release = seq_release, 696 .release = seq_release,
690}; 697};
698#endif /* CONFIG_FILE_LOCKING */
691 699
692static int execdomains_read_proc(char *page, char **start, off_t off, 700static int execdomains_read_proc(char *page, char **start, off_t off,
693 int count, int *eof, void *data) 701 int count, int *eof, void *data)
@@ -881,7 +889,9 @@ void __init proc_misc_init(void)
881#ifdef CONFIG_PRINTK 889#ifdef CONFIG_PRINTK
882 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); 890 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
883#endif 891#endif
892#ifdef CONFIG_FILE_LOCKING
884 proc_create("locks", 0, NULL, &proc_locks_operations); 893 proc_create("locks", 0, NULL, &proc_locks_operations);
894#endif
885 proc_create("devices", 0, NULL, &proc_devinfo_operations); 895 proc_create("devices", 0, NULL, &proc_devinfo_operations);
886 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); 896 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
887#ifdef CONFIG_BLOCK 897#ifdef CONFIG_BLOCK
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f9a8b892718f..945a81043ba2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -66,7 +66,7 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
66 return NULL; 66 return NULL;
67} 67}
68 68
69struct ctl_table_header *grab_header(struct inode *inode) 69static struct ctl_table_header *grab_header(struct inode *inode)
70{ 70{
71 if (PROC_I(inode)->sysctl) 71 if (PROC_I(inode)->sysctl)
72 return sysctl_head_grab(PROC_I(inode)->sysctl); 72 return sysctl_head_grab(PROC_I(inode)->sysctl);
@@ -395,10 +395,10 @@ static struct dentry_operations proc_sys_dentry_operations = {
395 .d_compare = proc_sys_compare, 395 .d_compare = proc_sys_compare,
396}; 396};
397 397
398static struct proc_dir_entry *proc_sys_root;
399
400int proc_sys_init(void) 398int proc_sys_init(void)
401{ 399{
400 struct proc_dir_entry *proc_sys_root;
401
402 proc_sys_root = proc_mkdir("sys", NULL); 402 proc_sys_root = proc_mkdir("sys", NULL);
403 proc_sys_root->proc_iops = &proc_sys_dir_operations; 403 proc_sys_root->proc_iops = &proc_sys_dir_operations;
404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7546a918f790..4806830ea2a1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,23 +210,20 @@ static int show_map(struct seq_file *m, void *v)
210 dev_t dev = 0; 210 dev_t dev = 0;
211 int len; 211 int len;
212 212
213 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
214 return -EACCES;
215
216 if (file) { 213 if (file) {
217 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 214 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
218 dev = inode->i_sb->s_dev; 215 dev = inode->i_sb->s_dev;
219 ino = inode->i_ino; 216 ino = inode->i_ino;
220 } 217 }
221 218
222 seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 219 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
223 vma->vm_start, 220 vma->vm_start,
224 vma->vm_end, 221 vma->vm_end,
225 flags & VM_READ ? 'r' : '-', 222 flags & VM_READ ? 'r' : '-',
226 flags & VM_WRITE ? 'w' : '-', 223 flags & VM_WRITE ? 'w' : '-',
227 flags & VM_EXEC ? 'x' : '-', 224 flags & VM_EXEC ? 'x' : '-',
228 flags & VM_MAYSHARE ? 's' : 'p', 225 flags & VM_MAYSHARE ? 's' : 'p',
229 vma->vm_pgoff << PAGE_SHIFT, 226 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
230 MAJOR(dev), MINOR(dev), ino, &len); 227 MAJOR(dev), MINOR(dev), ino, &len);
231 228
232 /* 229 /*
@@ -742,22 +739,11 @@ const struct file_operations proc_pagemap_operations = {
742#ifdef CONFIG_NUMA 739#ifdef CONFIG_NUMA
743extern int show_numa_map(struct seq_file *m, void *v); 740extern int show_numa_map(struct seq_file *m, void *v);
744 741
745static int show_numa_map_checked(struct seq_file *m, void *v)
746{
747 struct proc_maps_private *priv = m->private;
748 struct task_struct *task = priv->task;
749
750 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
751 return -EACCES;
752
753 return show_numa_map(m, v);
754}
755
756static const struct seq_operations proc_pid_numa_maps_op = { 742static const struct seq_operations proc_pid_numa_maps_op = {
757 .start = m_start, 743 .start = m_start,
758 .next = m_next, 744 .next = m_next,
759 .stop = m_stop, 745 .stop = m_stop,
760 .show = show_numa_map_checked 746 .show = show_numa_map,
761}; 747};
762 748
763static int numa_maps_open(struct inode *inode, struct file *file) 749static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d84e7121df8..219bd79ea894 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
110static int show_map(struct seq_file *m, void *_vml) 110static int show_map(struct seq_file *m, void *_vml)
111{ 111{
112 struct vm_list_struct *vml = _vml; 112 struct vm_list_struct *vml = _vml;
113 struct proc_maps_private *priv = m->private;
114 struct task_struct *task = priv->task;
115
116 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
117 return -EACCES;
118 113
119 return nommu_vma_show(m, vml->vma); 114 return nommu_vma_show(m, vml->vma);
120} 115}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9ac0f5e064e0..841368b87a29 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
165 return acc; 165 return acc;
166} 166}
167 167
168static int open_vmcore(struct inode *inode, struct file *filp)
169{
170 return 0;
171}
172
173const struct file_operations proc_vmcore_operations = { 168const struct file_operations proc_vmcore_operations = {
174 .read = read_vmcore, 169 .read = read_vmcore,
175 .open = open_vmcore,
176}; 170};
177 171
178static struct vmcore* __init get_new_element(void) 172static struct vmcore* __init get_new_element(void)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..5145cb9125af 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
58 * size 0 on the assumption that it's going to be used for an mmap of shared 58 * size 0 on the assumption that it's going to be used for an mmap of shared
59 * memory 59 * memory
60 */ 60 */
61static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 62{
63 struct pagevec lru_pvec; 63 struct pagevec lru_pvec;
64 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop, limit;
diff --git a/fs/readdir.c b/fs/readdir.c
index 4e026e5407fb..93a7559bbfd8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset
80 if (buf->result) 80 if (buf->result)
81 return -EINVAL; 81 return -EINVAL;
82 d_ino = ino; 82 d_ino = ino;
83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
84 buf->result = -EOVERFLOW;
84 return -EOVERFLOW; 85 return -EOVERFLOW;
86 }
85 buf->result++; 87 buf->result++;
86 dirent = buf->dirent; 88 dirent = buf->dirent;
87 if (!access_ok(VERIFY_WRITE, dirent, 89 if (!access_ok(VERIFY_WRITE, dirent,
@@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
155 if (reclen > buf->count) 157 if (reclen > buf->count)
156 return -EINVAL; 158 return -EINVAL;
157 d_ino = ino; 159 d_ino = ino;
158 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 160 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
161 buf->error = -EOVERFLOW;
159 return -EOVERFLOW; 162 return -EOVERFLOW;
163 }
160 dirent = buf->previous; 164 dirent = buf->previous;
161 if (dirent) { 165 if (dirent) {
162 if (__put_user(offset, &dirent->d_off)) 166 if (__put_user(offset, &dirent->d_off))
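The readdir fix records -EOVERFLOW in the callback's result field so the syscall actually fails when a 64-bit inode number cannot be represented in the legacy dirent's narrower d_ino, instead of the error being silently dropped. The narrowing check itself, sketched in user space (models the case where d_ino is 32-bit, as on old ABIs):

#include <stdio.h>

int main(void)
{
	unsigned long long ino = 0x100000001ULL;	/* 33-bit inode number */
	unsigned int d_ino = ino;			/* stand-in for a 32-bit d_ino */

	/* mirrors: if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) */
	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
		puts("would return -EOVERFLOW");
	return 0;
}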
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 282a13596c70..d318c7e663fa 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,7 +27,6 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/quotaops.h>
31 30
32struct file_system_type reiserfs_fs_type; 31struct file_system_type reiserfs_fs_type;
33 32
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3f54dbd6c49b..bd20f7f5a933 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
108 goto Done; 108 goto Done;
109 } 109 }
110 /* we need at least one record in buffer */ 110 /* we need at least one record in buffer */
111 pos = m->index;
112 p = m->op->start(m, &pos);
111 while (1) { 113 while (1) {
112 pos = m->index;
113 p = m->op->start(m, &pos);
114 err = PTR_ERR(p); 114 err = PTR_ERR(p);
115 if (!p || IS_ERR(p)) 115 if (!p || IS_ERR(p))
116 break; 116 break;
@@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
119 break; 119 break;
120 if (unlikely(err)) 120 if (unlikely(err))
121 m->count = 0; 121 m->count = 0;
122 if (unlikely(!m->count)) {
123 p = m->op->next(m, p, &pos);
124 m->index = pos;
125 continue;
126 }
122 if (m->count < m->size) 127 if (m->count < m->size)
123 goto Fill; 128 goto Fill;
124 m->op->stop(m, p); 129 m->op->stop(m, p);
@@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
128 goto Enomem; 133 goto Enomem;
129 m->count = 0; 134 m->count = 0;
130 m->version = 0; 135 m->version = 0;
136 pos = m->index;
137 p = m->op->start(m, &pos);
131 } 138 }
132 m->op->stop(m, p); 139 m->op->stop(m, p);
133 m->count = 0; 140 m->count = 0;
@@ -443,6 +450,20 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
443 return -1; 450 return -1;
444} 451}
445 452
453int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
454{
455 size_t len = bitmap_scnprintf_len(nr_bits);
456
457 if (m->count + len < m->size) {
458 bitmap_scnprintf(m->buf + m->count, m->size - m->count,
459 bits, nr_bits);
460 m->count += len;
461 return 0;
462 }
463 m->count = m->size;
464 return -1;
465}
466
446static void *single_start(struct seq_file *p, loff_t *pos) 467static void *single_start(struct seq_file *p, loff_t *pos)
447{ 468{
448 return NULL + (*pos == 0); 469 return NULL + (*pos == 0);
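The new seq_bitmap() helper follows the standard seq_file contract: print into the buffer when the formatted length fits, otherwise mark the buffer full and return -1 so seq_read() retries with a larger allocation. A sketch of a show() callback using it (the 64-bit mask is illustrative):

#include <linux/seq_file.h>
#include <linux/bitmap.h>

static DECLARE_BITMAP(demo_mask, 64);

static int demo_show(struct seq_file *m, void *v)
{
	if (seq_bitmap(m, demo_mask, 64) < 0)
		return -1;	/* buffer too small; seq_read() will retry */
	seq_putc(m, '\n');
	return 0;
}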
diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09c..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
898 if (unlikely(!(out->f_mode & FMODE_WRITE))) 898 if (unlikely(!(out->f_mode & FMODE_WRITE)))
899 return -EBADF; 899 return -EBADF;
900 900
901 if (unlikely(out->f_flags & O_APPEND))
902 return -EINVAL;
903
901 ret = rw_verify_area(WRITE, out, ppos, len); 904 ret = rw_verify_area(WRITE, out, ppos, len);
902 if (unlikely(ret < 0)) 905 if (unlikely(ret < 0))
903 return ret; 906 return ret;
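The new check makes the pipe-to-file direction of splice() fail fast on O_APPEND descriptors, since splice writes at an explicit (or the current) offset and cannot honor atomic append semantics. Seen from user space (a sketch with error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	int pipefd[2];
	int fd = open("out.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	write(pipefd[1], "hi\n", 3);
	if (splice(pipefd[0], NULL, fd, NULL, 3, 0) < 0)
		printf("splice: %s\n", strerror(errno));	/* EINVAL after this patch */
	return 0;
}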
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index d81fb9ed2b8e..73db464cd08b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
263 263
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 265
266 /* And make sure we have twice the index size of space reserved */ 266 /* And make sure we have thrice the index size of space reserved */
267 idx_size <<= 1; 267 idx_size = idx_size + (idx_size << 1);
268 268
269 /* 269 /*
270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
302 int subtract_lebs; 302 int subtract_lebs;
303 long long available; 303 long long available;
304 304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used; 305 available = c->main_bytes - c->lst.total_used;
318 306
319 /* 307 /*
@@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c)
388 * This function makes sure UBIFS has enough free eraseblocks for index growth 376 * This function makes sure UBIFS has enough free eraseblocks for index growth
389 * and data. 377 * and data.
390 * 378 *
391 * When budgeting index space, UBIFS reserves twice as more LEBs as the index 379 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
392 * would take if it was consolidated and written to the flash. This guarantees 380 * would take if it was consolidated and written to the flash. This guarantees
393 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 381 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
394 * be able to commit the dirty index. So this function basically adds the amount of 382 * be able to commit the dirty index. So this function basically adds the amount of
395 * budgeted index space to the size of the current index, multiplies this by 2, 383 * budgeted index space to the size of the current index, multiplies this by 3,
396 * and makes sure this does not exceed the amount of free eraseblocks. 384 * and makes sure this does not exceed the amount of free eraseblocks.
397 * 385 *
398 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 386 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
@@ -543,8 +531,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
543 int err, idx_growth, data_growth, dd_growth; 531 int err, idx_growth, data_growth, dd_growth;
544 struct retries_info ri; 532 struct retries_info ri;
545 533
534 ubifs_assert(req->new_page <= 1);
535 ubifs_assert(req->dirtied_page <= 1);
536 ubifs_assert(req->new_dent <= 1);
537 ubifs_assert(req->mod_dent <= 1);
538 ubifs_assert(req->new_ino <= 1);
539 ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
546 ubifs_assert(req->dirtied_ino <= 4); 540 ubifs_assert(req->dirtied_ino <= 4);
547 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); 541 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
542 ubifs_assert(!(req->new_ino_d & 7));
543 ubifs_assert(!(req->dirtied_ino_d & 7));
548 544
549 data_growth = calc_data_growth(c, req); 545 data_growth = calc_data_growth(c, req);
550 dd_growth = calc_dd_growth(c, req); 546 dd_growth = calc_dd_growth(c, req);
@@ -618,8 +614,16 @@ again:
618 */ 614 */
619void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) 615void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
620{ 616{
617 ubifs_assert(req->new_page <= 1);
618 ubifs_assert(req->dirtied_page <= 1);
619 ubifs_assert(req->new_dent <= 1);
620 ubifs_assert(req->mod_dent <= 1);
621 ubifs_assert(req->new_ino <= 1);
622 ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
621 ubifs_assert(req->dirtied_ino <= 4); 623 ubifs_assert(req->dirtied_ino <= 4);
622 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); 624 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
625 ubifs_assert(!(req->new_ino_d & 7));
626 ubifs_assert(!(req->dirtied_ino_d & 7));
623 if (!req->recalculate) { 627 if (!req->recalculate) {
624 ubifs_assert(req->idx_growth >= 0); 628 ubifs_assert(req->idx_growth >= 0);
625 ubifs_assert(req->data_growth >= 0); 629 ubifs_assert(req->data_growth >= 0);
@@ -647,7 +651,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
647 651
648 ubifs_assert(c->budg_idx_growth >= 0); 652 ubifs_assert(c->budg_idx_growth >= 0);
649 ubifs_assert(c->budg_data_growth >= 0); 653 ubifs_assert(c->budg_data_growth >= 0);
654 ubifs_assert(c->budg_dd_growth >= 0);
650 ubifs_assert(c->min_idx_lebs < c->main_lebs); 655 ubifs_assert(c->min_idx_lebs < c->main_lebs);
656 ubifs_assert(!(c->budg_idx_growth & 7));
657 ubifs_assert(!(c->budg_data_growth & 7));
658 ubifs_assert(!(c->budg_dd_growth & 7));
651 spin_unlock(&c->space_lock); 659 spin_unlock(&c->space_lock);
652} 660}
653 661
@@ -686,41 +694,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
686void ubifs_release_dirty_inode_budget(struct ubifs_info *c, 694void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
687 struct ubifs_inode *ui) 695 struct ubifs_inode *ui)
688{ 696{
689 struct ubifs_budget_req req = {.dd_growth = c->inode_budget, 697 struct ubifs_budget_req req;
690 .dirtied_ino_d = ui->data_len};
691 698
699 memset(&req, 0, sizeof(struct ubifs_budget_req));
700 req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
692 ubifs_release_budget(c, &req); 701 ubifs_release_budget(c, &req);
693} 702}
694 703
695/** 704/**
696 * ubifs_budg_get_free_space - return amount of free space. 705 * ubifs_reported_space - calculate reported free space.
706 * @c: the UBIFS file-system description object
707 * @free: amount of free space
708 *
709 * This function calculates the amount of free space that will be reported to
710 * user-space. User-space applications tend to expect that if the file-system
711 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
712 * are able to write a file of size N. UBIFS attaches node headers to each data
713 * node and it has to write indexing nodes as well. This introduces additional
714 * overhead, and UBIFS has to report slightly less free space to meet the
715 * above expectation.
716 *
717 * This function assumes free space is made up of uncompressed data nodes and
718 * full index nodes (one per data node, tripled because we always allow enough
719 * space to write the index thrice).
720 *
721 * Note, the calculation is pessimistic, which means that most of the time
722 * UBIFS reports less space than it actually has.
723 */
724long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
725{
726 int divisor, factor, f;
727
728 /*
729 * Reported space size is @free * X, where X is UBIFS block size
730 * divided by UBIFS block size + all overhead one data block
731 * introduces. The overhead is the node header + indexing overhead.
732 *
733 * Indexing overhead calculations are based on the following formula:
734 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
735 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
736 * as less than maximum fanout, we assume that each data node
737 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
738 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
739 * for the index.
740 */
741 f = c->fanout > 3 ? c->fanout >> 1 : 2;
742 factor = UBIFS_BLOCK_SIZE;
743 divisor = UBIFS_MAX_DATA_NODE_SZ;
744 divisor += (c->max_idx_node_sz * 3) / (f - 1);
745 free *= factor;
746 do_div(free, divisor);
747 return free;
748}
749
750/**
751 * ubifs_get_free_space - return amount of free space.
697 * @c: UBIFS file-system description object 752 * @c: UBIFS file-system description object
698 * 753 *
699 * This function returns amount of free space on the file-system. 754 * This function calculates the amount of free space to report to user-space.
755 *
756 * Because UBIFS may introduce substantial overhead (the index, node headers,
757 * alignment, wastage at the end of eraseblocks, etc), it cannot report the real
758 * amount of free flash space it has (well, because not all dirty space is
759 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
760 * it would break user expectations about what free space is. Users seem
761 * accustomed to assuming that if the file-system reports N bytes of free space,
762 * they would be able to fit a file of N bytes on the FS. This almost works for
763 * traditional file-systems, because they have way less overhead than UBIFS.
764 * So, to keep users happy, UBIFS tries to take the overhead into account.
700 */ 765 */
701long long ubifs_budg_get_free_space(struct ubifs_info *c) 766long long ubifs_get_free_space(struct ubifs_info *c)
702{ 767{
703 int min_idx_lebs, rsvd_idx_lebs; 768 int min_idx_lebs, rsvd_idx_lebs, lebs;
704 long long available, outstanding, free; 769 long long available, outstanding, free;
705 770
706 /* Do exactly the same calculations as in 'do_budget_space()' */
707 spin_lock(&c->space_lock); 771 spin_lock(&c->space_lock);
708 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 772 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
773 outstanding = c->budg_data_growth + c->budg_dd_growth;
709 774
710 if (min_idx_lebs > c->lst.idx_lebs) 775 /*
711 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 776 * Force the amount available to the total size reported if the used
712 else 777 * space is zero.
713 rsvd_idx_lebs = 0; 778 */
714 779 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
715 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
716 - c->lst.taken_empty_lebs) {
717 spin_unlock(&c->space_lock); 780 spin_unlock(&c->space_lock);
718 return 0; 781 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
719 } 782 }
720 783
721 available = ubifs_calc_available(c, min_idx_lebs); 784 available = ubifs_calc_available(c, min_idx_lebs);
722 outstanding = c->budg_data_growth + c->budg_dd_growth; 785
723 c->min_idx_lebs = min_idx_lebs; 786 /*
787 * When reporting free space to user-space, UBIFS guarantees that it is
788 * possible to write a file of free space size. This means that for
789 * empty LEBs we may use more precise calculations than
790 * 'ubifs_calc_available()' is using. Namely, we know that in empty
791 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
792 * Thus, amend the available space.
793 *
794 * Note, the calculations below are similar to what we have in
795 * 'do_budget_space()', so refer there for comments.
796 */
797 if (min_idx_lebs > c->lst.idx_lebs)
798 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
799 else
800 rsvd_idx_lebs = 0;
801 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
802 c->lst.taken_empty_lebs;
803 lebs -= rsvd_idx_lebs;
804 available += lebs * (c->dark_wm - c->leb_overhead);
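	/*
	 * Illustrative arithmetic (assumed numbers, not from this patch):
	 * 'ubifs_calc_available()' charged each of these LEBs @c->dark_wm
	 * bytes of waste, while an empty LEB really wastes only
	 * @c->leb_overhead bytes; e.g. with dark_wm = 8192 and
	 * leb_overhead = 2608, each LEB gets 5584 bytes credited back here.
	 */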
724 spin_unlock(&c->space_lock); 805 spin_unlock(&c->space_lock);
725 806
726 if (available > outstanding) 807 if (available > outstanding)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 3b516316c9b3..0a6aa2cc78f0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c)
74 goto out_up; 74 goto out_up;
75 } 75 }
76 76
77 c->cmt_no += 1;
77 err = ubifs_gc_start_commit(c); 78 err = ubifs_gc_start_commit(c);
78 if (err) 79 if (err)
79 goto out_up; 80 goto out_up;
@@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c)
115 goto out; 116 goto out;
116 117
117 mutex_lock(&c->mst_mutex); 118 mutex_lock(&c->mst_mutex);
118 c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); 119 c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
119 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); 120 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
120 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); 121 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
121 c->mst_node->root_offs = cpu_to_le32(zroot.offs); 122 c->mst_node->root_offs = cpu_to_le32(zroot.offs);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4e3aaeba4eca..d7f7645779f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++) 539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n", 540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i])); 541 (unsigned long long)le64_to_cpu(orph->inos[i]));
542 break; 542 break;
543 } 543 }
544 default: 544 default:
@@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
568void dbg_dump_lstats(const struct ubifs_lp_stats *lst) 568void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
569{ 569{
570 spin_lock(&dbg_lock); 570 spin_lock(&dbg_lock);
571 printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", 571 printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
572 lst->empty_lebs, lst->idx_lebs); 572 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " 573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, 574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
575 lst->total_dirty); 575 lst->total_dirty);
@@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c)
587 struct ubifs_gced_idx_leb *idx_gc; 587 struct ubifs_gced_idx_leb *idx_gc;
588 588
589 spin_lock(&dbg_lock); 589 spin_lock(&dbg_lock);
590 printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " 590 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
591 "budg_dd_growth %lld, budg_idx_growth %lld\n", 591 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); 592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " 593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, 594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
@@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
634 struct ubifs_lprops lp; 634 struct ubifs_lprops lp;
635 struct ubifs_lp_stats lst; 635 struct ubifs_lp_stats lst;
636 636
637 printk(KERN_DEBUG "Dumping LEB properties\n"); 637 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
638 ubifs_get_lp_stats(c, &lst); 638 ubifs_get_lp_stats(c, &lst);
639 dbg_dump_lstats(&lst); 639 dbg_dump_lstats(&lst);
640 640
@@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
655 if (dbg_failure_mode) 655 if (dbg_failure_mode)
656 return; 656 return;
657 657
658 printk(KERN_DEBUG "Dumping LEB %d\n", lnum); 658 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
659 659
660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
661 if (IS_ERR(sleb)) { 661 if (IS_ERR(sleb)) {
@@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
720{ 720{
721 int i; 721 int i;
722 722
723 printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", 723 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
724 cat, heap->cnt); 724 current->pid, cat, heap->cnt);
725 for (i = 0; i < heap->cnt; i++) { 725 for (i = 0; i < heap->cnt; i++) {
726 struct ubifs_lprops *lprops = heap->arr[i]; 726 struct ubifs_lprops *lprops = heap->arr[i];
727 727
@@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
736{ 736{
737 int i; 737 int i;
738 738
739 printk(KERN_DEBUG "Dumping pnode:\n"); 739 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
755 int level; 755 int level;
756 756
757 printk(KERN_DEBUG "\n"); 757 printk(KERN_DEBUG "\n");
758 printk(KERN_DEBUG "Dumping the TNC tree\n"); 758 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
760 level = znode->level; 760 level = znode->level;
761 printk(KERN_DEBUG "== Level %d ==\n", level); 761 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, 2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2209 int offset, int len, int dtype) 2209 int offset, int len, int dtype)
2210{ 2210{
2211 int err; 2211 int err, failing;
2212 2212
2213 if (in_failure_mode(desc)) 2213 if (in_failure_mode(desc))
2214 return -EIO; 2214 return -EIO;
2215 if (do_fail(desc, lnum, 1)) 2215 failing = do_fail(desc, lnum, 1);
2216 if (failing)
2216 cut_data(buf, len); 2217 cut_data(buf, len);
2217 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); 2218 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
2218 if (err) 2219 if (err)
2219 return err; 2220 return err;
2220 if (in_failure_mode(desc)) 2221 if (failing)
2221 return -EIO; 2222 return -EIO;
2222 return 0; 2223 return 0;
2223} 2224}
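/*
 * A note on 'dbg_leb_write()' above: @failing is sampled before the write,
 * so -EIO is returned exactly when the buffer was deliberately corrupted by
 * 'cut_data()', not merely because failure mode was entered some time after
 * a clean write.
 */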
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 3c4f1e93c9e0..50315fc57185 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,7 @@
27 27
28#define UBIFS_DBG(op) op 28#define UBIFS_DBG(op) op
29 29
30#define ubifs_assert(expr) do { \ 30#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 31 if (unlikely(!(expr))) { \
32 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ 32 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
33 __func__, __LINE__, current->pid); \ 33 __func__, __LINE__, current->pid); \
@@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c,
73 const union ubifs_key *key); 73 const union ubifs_key *key);
74 74
75/* 75/*
76 * DBGKEY macros require dbg_lock to be held, which it is in the dbg message 76 * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
77 * macros. 77 * macros.
78 */ 78 */
79#define DBGKEY(key) dbg_key_str0(c, (key)) 79#define DBGKEY(key) dbg_key_str0(c, (key))
80#define DBGKEY1(key) dbg_key_str1(c, (key)) 80#define DBGKEY1(key) dbg_key_str1(c, (key))
81 81
82/* General messages */ 82/* General messages */
83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) 83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
84 84
85/* Additional journal messages */ 85/* Additional journal messages */
86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) 86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
87 87
88/* Additional TNC messages */ 88/* Additional TNC messages */
89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) 89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
90 90
91/* Additional lprops messages */ 91/* Additional lprops messages */
92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) 92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
93 93
94/* Additional LEB find messages */ 94/* Additional LEB find messages */
95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) 95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
96 96
97/* Additional mount messages */ 97/* Additional mount messages */
98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) 98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
99 99
100/* Additional I/O messages */ 100/* Additional I/O messages */
101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) 101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
102 102
103/* Additional commit messages */ 103/* Additional commit messages */
104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) 104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
105 105
106/* Additional budgeting messages */ 106/* Additional budgeting messages */
107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) 107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
108 108
109/* Additional log messages */ 109/* Additional log messages */
110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) 110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
111 111
112/* Additional gc messages */ 112/* Additional gc messages */
113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) 113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
114 114
115/* Additional scan messages */ 115/* Additional scan messages */
116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) 116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
117 117
118/* Additional recovery messages */ 118/* Additional recovery messages */
119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
120 120
121/* 121/*
122 * Debugging message type flags (must match msg_type_names in debug.c). 122 * Debugging message type flags (must match msg_type_names in debug.c).
@@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
239 struct ubifs_zbranch *zbr, void *priv); 239 struct ubifs_zbranch *zbr, void *priv);
240typedef int (*dbg_znode_callback)(struct ubifs_info *c, 240typedef int (*dbg_znode_callback)(struct ubifs_info *c,
241 struct ubifs_znode *znode, void *priv); 241 struct ubifs_znode *znode, void *priv);
242
243int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, 242int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
244 dbg_znode_callback znode_cb, void *priv); 243 dbg_znode_callback znode_cb, void *priv);
245 244
246/* Checking functions */ 245/* Checking functions */
247 246
248int dbg_check_lprops(struct ubifs_info *c); 247int dbg_check_lprops(struct ubifs_info *c);
249
250int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 248int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
251int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 249int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
252
253int dbg_check_cats(struct ubifs_info *c); 250int dbg_check_cats(struct ubifs_info *c);
254
255int dbg_check_ltab(struct ubifs_info *c); 251int dbg_check_ltab(struct ubifs_info *c);
256
257int dbg_check_synced_i_size(struct inode *inode); 252int dbg_check_synced_i_size(struct inode *inode);
258
259int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); 253int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
260
261int dbg_check_tnc(struct ubifs_info *c, int extra); 254int dbg_check_tnc(struct ubifs_info *c, int extra);
262
263int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); 255int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
264
265int dbg_check_filesystem(struct ubifs_info *c); 256int dbg_check_filesystem(struct ubifs_info *c);
266
267void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, 257void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
268 int add_pos); 258 int add_pos);
269
270int dbg_check_lprops(struct ubifs_info *c); 259int dbg_check_lprops(struct ubifs_info *c);
271int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 260int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
272 int row, int col); 261 int row, int col);
@@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
329#else /* !CONFIG_UBIFS_FS_DEBUG */ 318#else /* !CONFIG_UBIFS_FS_DEBUG */
330 319
331#define UBIFS_DBG(op) 320#define UBIFS_DBG(op)
332#define ubifs_assert(expr) ({}) 321
333#define ubifs_assert_cmt_locked(c) 322/* Use "if (0)" to make compiler check arguments even if debugging is off */
323#define ubifs_assert(expr) do { \
324 if (0 && (expr)) \
325 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
326 __func__, __LINE__, current->pid); \
327} while (0)
328
329#define dbg_err(fmt, ...) do { \
330 if (0) \
331 ubifs_err(fmt, ##__VA_ARGS__); \
332} while (0)
333
334#define dbg_msg(fmt, ...) do { \
335 if (0) \
336 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
337 current->pid, __func__, ##__VA_ARGS__); \
338} while (0)
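/*
 * A note on the "if (0)" stubs above: the format string and arguments stay
 * visible to the compiler, so e.g. dbg_msg("%d", some_pointer) still triggers
 * a printf-format warning in non-debug builds, which the old empty '({})'
 * stubs could not catch.
 */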
339
334#define dbg_dump_stack() 340#define dbg_dump_stack()
335#define dbg_err(fmt, ...) ({}) 341#define ubifs_assert_cmt_locked(c)
336#define dbg_msg(fmt, ...) ({})
337#define dbg_key(c, key, fmt, ...) ({})
338
339#define dbg_gen(fmt, ...) ({})
340#define dbg_jnl(fmt, ...) ({})
341#define dbg_tnc(fmt, ...) ({})
342#define dbg_lp(fmt, ...) ({})
343#define dbg_find(fmt, ...) ({})
344#define dbg_mnt(fmt, ...) ({})
345#define dbg_io(fmt, ...) ({})
346#define dbg_cmt(fmt, ...) ({})
347#define dbg_budg(fmt, ...) ({})
348#define dbg_log(fmt, ...) ({})
349#define dbg_gc(fmt, ...) ({})
350#define dbg_scan(fmt, ...) ({})
351#define dbg_rcvry(fmt, ...) ({})
352
353#define dbg_ntype(type) ""
354#define dbg_cstate(cmt_state) ""
355#define dbg_get_key_dump(c, key) ({})
356#define dbg_dump_inode(c, inode) ({})
357#define dbg_dump_node(c, node) ({})
358#define dbg_dump_budget_req(req) ({})
359#define dbg_dump_lstats(lst) ({})
360#define dbg_dump_budg(c) ({})
361#define dbg_dump_lprop(c, lp) ({})
362#define dbg_dump_lprops(c) ({})
363#define dbg_dump_leb(c, lnum) ({})
364#define dbg_dump_znode(c, znode) ({})
365#define dbg_dump_heap(c, heap, cat) ({})
366#define dbg_dump_pnode(c, pnode, parent, iip) ({})
367#define dbg_dump_tnc(c) ({})
368#define dbg_dump_index(c) ({})
369 342
370#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 343#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
344#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
345#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
346#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
347#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
348#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
349#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
350#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
351#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
352#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
353#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
354#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
355#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
356
357#define DBGKEY(key) ((char *)(key))
358#define DBGKEY1(key) ((char *)(key))
359
360#define dbg_ntype(type) ""
361#define dbg_cstate(cmt_state) ""
362#define dbg_get_key_dump(c, key) ({})
363#define dbg_dump_inode(c, inode) ({})
364#define dbg_dump_node(c, node) ({})
365#define dbg_dump_budget_req(req) ({})
366#define dbg_dump_lstats(lst) ({})
367#define dbg_dump_budg(c) ({})
368#define dbg_dump_lprop(c, lp) ({})
369#define dbg_dump_lprops(c) ({})
370#define dbg_dump_leb(c, lnum) ({})
371#define dbg_dump_znode(c, znode) ({})
372#define dbg_dump_heap(c, heap, cat) ({})
373#define dbg_dump_pnode(c, pnode, parent, iip) ({})
374#define dbg_dump_tnc(c) ({})
375#define dbg_dump_index(c) ({})
371 376
377#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
372#define dbg_old_index_check_init(c, zroot) 0 378#define dbg_old_index_check_init(c, zroot) 0
373#define dbg_check_old_index(c, zroot) 0 379#define dbg_check_old_index(c, zroot) 0
374
375#define dbg_check_cats(c) 0 380#define dbg_check_cats(c) 0
376
377#define dbg_check_ltab(c) 0 381#define dbg_check_ltab(c) 0
378
379#define dbg_check_synced_i_size(inode) 0 382#define dbg_check_synced_i_size(inode) 0
380
381#define dbg_check_dir_size(c, dir) 0 383#define dbg_check_dir_size(c, dir) 0
382
383#define dbg_check_tnc(c, x) 0 384#define dbg_check_tnc(c, x) 0
384
385#define dbg_check_idx_size(c, idx_size) 0 385#define dbg_check_idx_size(c, idx_size) 0
386
387#define dbg_check_filesystem(c) 0 386#define dbg_check_filesystem(c) 0
388
389#define dbg_check_heap(c, heap, cat, add_pos) ({}) 387#define dbg_check_heap(c, heap, cat, add_pos) ({})
390
391#define dbg_check_lprops(c) 0 388#define dbg_check_lprops(c) 0
392#define dbg_check_lpt_nodes(c, cnode, row, col) 0 389#define dbg_check_lpt_nodes(c, cnode, row, col) 0
393
394#define dbg_force_in_the_gaps_enabled 0 390#define dbg_force_in_the_gaps_enabled 0
395#define dbg_force_in_the_gaps() 0 391#define dbg_force_in_the_gaps() 0
396
397#define dbg_failure_mode 0 392#define dbg_failure_mode 0
398#define dbg_failure_mode_registration(c) ({}) 393#define dbg_failure_mode_registration(c) ({})
399#define dbg_failure_mode_deregistration(c) ({}) 394#define dbg_failure_mode_deregistration(c) ({})
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e90374be7d3b..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
165 } 165 }
166 166
167 inode->i_ino = ++c->highest_inum; 167 inode->i_ino = ++c->highest_inum;
168 inode->i_generation = ++c->vfs_gen;
169 /* 168 /*
170 * The creation sequence number remains with this inode for its 169 * The creation sequence number remains with this inode for its
171 * lifetime. All nodes for this inode have a greater sequence number, 170 * lifetime. All nodes for this inode have a greater sequence number,
@@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
220 219
221 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); 220 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
222 if (err) { 221 if (err) {
223 /* 222 if (err == -ENOENT) {
224 * Do not hash the direntry if parent 'i_nlink' is zero, because
225 * this has side-effects - '->delete_inode()' call will not be
226 * called for the parent orphan inode, because 'd_count' of its
227 * direntry will stay 1 (it'll be negative direntry I guess)
228 * and prevent 'iput_final()' until the dentry is destroyed due
229 * to unmount or memory pressure.
230 */
231 if (err == -ENOENT && dir->i_nlink != 0) {
232 dbg_gen("not found"); 223 dbg_gen("not found");
233 goto done; 224 goto done;
234 } 225 }
@@ -435,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
435 426
436 while (1) { 427 while (1) {
437 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 428 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
438 dent->name, le64_to_cpu(dent->inum), 429 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
439 key_hash_flash(c, &dent->key)); 430 key_hash_flash(c, &dent->key));
440 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); 431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
441 432
@@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
525 struct ubifs_inode *dir_ui = ubifs_inode(dir); 516 struct ubifs_inode *dir_ui = ubifs_inode(dir);
526 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 517 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
527 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, 518 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
528 .dirtied_ino_d = ui->data_len }; 519 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
529 520
530 /* 521 /*
531 * Budget request settings: new direntry, changing the target inode, 522 * Budget request settings: new direntry, changing the target inode,
@@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
596 if (err) { 587 if (err) {
597 if (err != -ENOSPC) 588 if (err != -ENOSPC)
598 return err; 589 return err;
599 err = 0;
600 budgeted = 0; 590 budgeted = 0;
601 } 591 }
602 592
@@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
727 struct ubifs_inode *dir_ui = ubifs_inode(dir); 717 struct ubifs_inode *dir_ui = ubifs_inode(dir);
728 struct ubifs_info *c = dir->i_sb->s_fs_info; 718 struct ubifs_info *c = dir->i_sb->s_fs_info;
729 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 719 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
730 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 720 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
731 .dirtied_ino_d = 1 };
732 721
733 /* 722 /*
734 * Budget request settings: new inode, new direntry and changing parent 723 * Budget request settings: new inode, new direntry and changing parent
@@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
789 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 778 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
790 int err, devlen = 0; 779 int err, devlen = 0;
791 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 780 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
792 .new_ino_d = devlen, .dirtied_ino = 1 }; 781 .new_ino_d = ALIGN(devlen, 8),
782 .dirtied_ino = 1 };
793 783
794 /* 784 /*
795 * Budget request settings: new inode, new direntry and changing parent 785 * Budget request settings: new inode, new direntry and changing parent
@@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
863 int err, len = strlen(symname); 853 int err, len = strlen(symname);
864 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 854 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
865 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 855 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
866 .new_ino_d = len, .dirtied_ino = 1 }; 856 .new_ino_d = ALIGN(len, 8),
857 .dirtied_ino = 1 };
867 858
868 /* 859 /*
869 * Budget request settings: new inode, new direntry and changing parent 860 * Budget request settings: new inode, new direntry and changing parent
@@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1012 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, 1003 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
1013 .dirtied_ino = 3 }; 1004 .dirtied_ino = 3 };
1014 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 1005 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
1015 .dirtied_ino_d = old_inode_ui->data_len }; 1006 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
1016 struct timespec time; 1007 struct timespec time;
1017 1008
1018 /* 1009 /*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8565e586e533..3d698e2022b1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
793 int err; 793 int err;
794 struct ubifs_budget_req req; 794 struct ubifs_budget_req req;
795 loff_t old_size = inode->i_size, new_size = attr->ia_size; 795 loff_t old_size = inode->i_size, new_size = attr->ia_size;
796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1); 796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
797 struct ubifs_inode *ui = ubifs_inode(inode); 797 struct ubifs_inode *ui = ubifs_inode(inode);
798 798
799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); 799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
811 /* A funny way to budget for truncation node */ 811 /* A funny way to budget for truncation node */
812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; 812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
813 err = ubifs_budget_space(c, &req); 813 err = ubifs_budget_space(c, &req);
814 if (err) 814 if (err) {
815 return err; 815 /*
816 * Treat truncations to zero as deletion and always allow them,
817 * just like we do for '->unlink()'.
818 */
819 if (new_size || err != -ENOSPC)
820 return err;
821 budgeted = 0;
822 }
816 823
817 err = vmtruncate(inode, new_size); 824 err = vmtruncate(inode, new_size);
818 if (err) 825 if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
869 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 876 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
870 mutex_unlock(&ui->ui_mutex); 877 mutex_unlock(&ui->ui_mutex);
871out_budg: 878out_budg:
872 ubifs_release_budget(c, &req); 879 if (budgeted)
880 ubifs_release_budget(c, &req);
881 else {
882 c->nospace = c->nospace_rp = 0;
883 smp_wmb();
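		/*
		 * The two assignments above clear the cached "out of space"
		 * flags: the unbudgeted truncation has freed space, so
		 * budgeting should retry rather than fail fast (assumed
		 * rationale, mirroring the unbudgeted '->unlink()' path).
		 */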
884 }
873 return err; 885 return err;
874} 886}
875 887
@@ -890,7 +902,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
890 loff_t new_size = attr->ia_size; 902 loff_t new_size = attr->ia_size;
891 struct ubifs_inode *ui = ubifs_inode(inode); 903 struct ubifs_inode *ui = ubifs_inode(inode);
892 struct ubifs_budget_req req = { .dirtied_ino = 1, 904 struct ubifs_budget_req req = { .dirtied_ino = 1,
893 .dirtied_ino_d = ui->data_len }; 905 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
894 906
895 err = ubifs_budget_space(c, &req); 907 err = ubifs_budget_space(c, &req);
896 if (err) 908 if (err)
@@ -941,7 +953,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
941 struct inode *inode = dentry->d_inode; 953 struct inode *inode = dentry->d_inode;
942 struct ubifs_info *c = inode->i_sb->s_fs_info; 954 struct ubifs_info *c = inode->i_sb->s_fs_info;
943 955
944 dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); 956 dbg_gen("ino %lu, mode %#x, ia_valid %#x",
957 inode->i_ino, inode->i_mode, attr->ia_valid);
945 err = inode_change_ok(inode, attr); 958 err = inode_change_ok(inode, attr);
946 if (err) 959 if (err)
947 return err; 960 return err;
@@ -1051,7 +1064,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
1051 if (mctime_update_needed(inode, &now)) { 1064 if (mctime_update_needed(inode, &now)) {
1052 int err, release; 1065 int err, release;
1053 struct ubifs_budget_req req = { .dirtied_ino = 1, 1066 struct ubifs_budget_req req = { .dirtied_ino = 1,
1054 .dirtied_ino_d = ui->data_len }; 1067 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
1055 1068
1056 err = ubifs_budget_space(c, &req); 1069 err = ubifs_budget_space(c, &req);
1057 if (err) 1070 if (err)
@@ -1270,6 +1283,7 @@ struct file_operations ubifs_file_operations = {
1270 .fsync = ubifs_fsync, 1283 .fsync = ubifs_fsync,
1271 .unlocked_ioctl = ubifs_ioctl, 1284 .unlocked_ioctl = ubifs_ioctl,
1272 .splice_read = generic_file_splice_read, 1285 .splice_read = generic_file_splice_read,
1286 .splice_write = generic_file_splice_write,
1273#ifdef CONFIG_COMPAT 1287#ifdef CONFIG_COMPAT
1274 .compat_ioctl = ubifs_compat_ioctl, 1288 .compat_ioctl = ubifs_compat_ioctl,
1275#endif 1289#endif
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 10394c548367..47814cde2407 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty 211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria. 212 * or do not have an LEB which satisfies the @min_space criteria.
213 * 213 *
214 * Note: 214 * Note, LEBs which have less than the dead watermark of free + dirty space are
215 * o LEBs which have less than the dead watermark of dirty space are never picked 215 * never picked by this function.
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 * 216 *
223 * The additional @pick_free argument controls if this function has to return a 217 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must set it to %1, 218 * free or freeable LEB if one is present. For example, GC must set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
231 * 225 *
232 * In addition @pick_free is set to %2 by the recovery process in order to 226 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned. 227 * recover gc_lnum in which case an index LEB must not be returned.
228 *
229 * This function returns zero and the LEB properties of found dirty LEB in case
230 * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
231 * case of other failures. The returned LEB is marked as "taken".
234 */ 232 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 233int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free) 234 int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
245 int lebs, rsvd_idx_lebs = 0; 243 int lebs, rsvd_idx_lebs = 0;
246 244
247 spin_lock(&c->space_lock); 245 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs; 246 lebs = c->lst.empty_lebs + c->idx_gc_cnt;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs; 247 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250 248
251 /* 249 /*
@@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
290 idx_lp = idx_heap->arr[0]; 288 idx_lp = idx_heap->arr[0];
291 sum = idx_lp->free + idx_lp->dirty; 289 sum = idx_lp->free + idx_lp->dirty;
292 /* 290 /*
293 * Since we reserve twice as much space for the index as it 291 * Since we reserve thrice as much space for the index as it
294 * actually takes, it does not make sense to pick indexing LEBs 292 * actually takes, it does not make sense to pick indexing LEBs
295 * with less than half LEB of dirty space. 293 * with less than, say, half LEB of dirty space. Maybe half is
294 * not the optimal boundary - this should be tested and
295 * checked. This boundary should determine how much we use
296 * in-the-gaps to consolidate the index compared to how much
297 * we use the garbage collector to consolidate it. The "half"
298 * criterion just feels fine.
296 */ 299 */
297 if (sum < min_space || sum < c->half_leb_size) 300 if (sum < min_space || sum < c->half_leb_size)
298 idx_lp = NULL; 301 idx_lp = NULL;
@@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
312 lp = idx_lp; 315 lp = idx_lp;
313 316
314 if (lp) { 317 if (lp) {
315 ubifs_assert(lp->dirty >= c->dead_wm); 318 ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
316 goto found; 319 goto found;
317 } 320 }
318 321
@@ -504,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
504 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
505 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
506 c->lst.taken_empty_lebs; 509 c->lst.taken_empty_lebs;
507 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
508 if (rsvd_idx_lebs < lebs) 510 if (rsvd_idx_lebs < lebs)
509 /* 511 /*
510 * OK to allocate an empty LEB, but we still don't want to go 512 * OK to allocate an empty LEB, but we still don't want to go
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..02aba36fe3d4 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
334 334
335 err = move_nodes(c, sleb); 335 err = move_nodes(c, sleb);
336 if (err) 336 if (err)
337 goto out; 337 goto out_inc_seq;
338 338
339 err = gc_sync_wbufs(c); 339 err = gc_sync_wbufs(c);
340 if (err) 340 if (err)
341 goto out; 341 goto out_inc_seq;
342 342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); 343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err) 344 if (err)
345 goto out; 345 goto out_inc_seq;
346
347 /* Allow for races with TNC */
348 c->gced_lnum = lnum;
349 smp_wmb();
350 c->gc_seq += 1;
351 smp_wmb();
346 352
347 if (c->gc_lnum == -1) { 353 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum; 354 c->gc_lnum = lnum;
@@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
363out: 369out:
364 ubifs_scan_destroy(sleb); 370 ubifs_scan_destroy(sleb);
365 return err; 371 return err;
372
373out_inc_seq:
374 /* We may have moved at least some nodes so allow for races with TNC */
375 c->gced_lnum = lnum;
376 smp_wmb();
377 c->gc_seq += 1;
378 smp_wmb();
379 goto out;
366} 380}
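/*
 * A sketch of the intended consumer of the @gced_lnum/@gc_seq publication
 * above (assumed shape; the real reader lives in the TNC lookup path, which
 * is not part of this hunk). The writer orders gced_lnum before gc_seq with
 * smp_wmb(), so a reader that sees an unchanged gc_seq may trust the node it
 * read from @lnum:
 */
static int maybe_moved_by_gc(const struct ubifs_info *c, int lnum,
			     int start_seq)
{
	int gced_lnum = c->gced_lnum;
	int seq;

	smp_rmb();
	seq = c->gc_seq;
	if (seq == start_seq)
		return 0;		/* no GC completed meanwhile */
	if (seq == start_seq + 1 && gced_lnum != lnum)
		return 0;		/* exactly one GC, of another LEB */
	return 1;			/* be conservative: retry the lookup */
}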
367 381
368/** 382/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3374f91b6709..054363f2b207 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -54,6 +54,20 @@
54#include "ubifs.h" 54#include "ubifs.h"
55 55
56/** 56/**
 57 * ubifs_ro_mode - switch UBIFS to read-only mode.
 58 * @c: UBIFS file-system description object
 59 * @err: error code which is the reason for switching to R/O mode
60 */
61void ubifs_ro_mode(struct ubifs_info *c, int err)
62{
63 if (!c->ro_media) {
64 c->ro_media = 1;
65 ubifs_warn("switched to read-only mode, error %d", err);
66 dbg_dump_stack();
67 }
68}
69
70/**
57 * ubifs_check_node - check node. 71 * ubifs_check_node - check node.
58 * @c: UBIFS file-system description object 72 * @c: UBIFS file-system description object
59 * @buf: node to check 73 * @buf: node to check
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 283155abe5f5..22993f867d19 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -447,13 +447,11 @@ static int get_dent_type(int mode)
447 * @ino: buffer in which to pack inode node 447 * @ino: buffer in which to pack inode node
448 * @inode: inode to pack 448 * @inode: inode to pack
449 * @last: indicates the last node of the group 449 * @last: indicates the last node of the group
450 * @last_reference: non-zero if this is a deletion inode
451 */ 450 */
452static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, 451static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
453 const struct inode *inode, int last, 452 const struct inode *inode, int last)
454 int last_reference)
455{ 453{
456 int data_len = 0; 454 int data_len = 0, last_reference = !inode->i_nlink;
457 struct ubifs_inode *ui = ubifs_inode(inode); 455 struct ubifs_inode *ui = ubifs_inode(inode);
458 456
459 ino->ch.node_type = UBIFS_INO_NODE; 457 ino->ch.node_type = UBIFS_INO_NODE;
@@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
596 ubifs_prep_grp_node(c, dent, dlen, 0); 594 ubifs_prep_grp_node(c, dent, dlen, 0);
597 595
598 ino = (void *)dent + aligned_dlen; 596 ino = (void *)dent + aligned_dlen;
599 pack_inode(c, ino, inode, 0, last_reference); 597 pack_inode(c, ino, inode, 0);
600 ino = (void *)ino + aligned_ilen; 598 ino = (void *)ino + aligned_ilen;
601 pack_inode(c, ino, dir, 1, 0); 599 pack_inode(c, ino, dir, 1);
602 600
603 if (last_reference) { 601 if (last_reference) {
604 err = ubifs_add_orphan(c, inode->i_ino); 602 err = ubifs_add_orphan(c, inode->i_ino);
@@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
606 release_head(c, BASEHD); 604 release_head(c, BASEHD);
607 goto out_finish; 605 goto out_finish;
608 } 606 }
607 ui->del_cmtno = c->cmt_no;
609 } 608 }
610 609
611 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); 610 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -750,30 +749,25 @@ out_free:
750 * ubifs_jnl_write_inode - flush inode to the journal. 749 * ubifs_jnl_write_inode - flush inode to the journal.
751 * @c: UBIFS file-system description object 750 * @c: UBIFS file-system description object
752 * @inode: inode to flush 751 * @inode: inode to flush
753 * @deletion: inode has been deleted
754 * 752 *
755 * This function writes inode @inode to the journal. If the inode is 753 * This function writes inode @inode to the journal. If the inode is
756 * synchronous, it also synchronizes the write-buffer. Returns zero in case of 754 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
757 * success and a negative error code in case of failure. 755 * success and a negative error code in case of failure.
758 */ 756 */
759int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, 757int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
760 int deletion)
761{ 758{
762 int err, len, lnum, offs, sync = 0; 759 int err, lnum, offs;
763 struct ubifs_ino_node *ino; 760 struct ubifs_ino_node *ino;
764 struct ubifs_inode *ui = ubifs_inode(inode); 761 struct ubifs_inode *ui = ubifs_inode(inode);
762 int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
765 763
766 dbg_jnl("ino %lu%s", inode->i_ino, 764 dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
767 deletion ? " (last reference)" : "");
768 if (deletion)
769 ubifs_assert(inode->i_nlink == 0);
770 765
771 len = UBIFS_INO_NODE_SZ;
772 /* 766 /*
773 * If the inode is being deleted, do not write the attached data. No 767 * If the inode is being deleted, do not write the attached data. No
774 * need to synchronize the write-buffer either. 768 * need to synchronize the write-buffer either.
775 */ 769 */
776 if (!deletion) { 770 if (!last_reference) {
777 len += ui->data_len; 771 len += ui->data_len;
778 sync = IS_SYNC(inode); 772 sync = IS_SYNC(inode);
779 } 773 }
@@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
786 if (err) 780 if (err)
787 goto out_free; 781 goto out_free;
788 782
789 pack_inode(c, ino, inode, 1, deletion); 783 pack_inode(c, ino, inode, 1);
790 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); 784 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
791 if (err) 785 if (err)
792 goto out_release; 786 goto out_release;
@@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
795 inode->i_ino); 789 inode->i_ino);
796 release_head(c, BASEHD); 790 release_head(c, BASEHD);
797 791
798 if (deletion) { 792 if (last_reference) {
799 err = ubifs_tnc_remove_ino(c, inode->i_ino); 793 err = ubifs_tnc_remove_ino(c, inode->i_ino);
800 if (err) 794 if (err)
801 goto out_ro; 795 goto out_ro;
@@ -828,6 +822,65 @@ out_free:
828} 822}
829 823
830/** 824/**
825 * ubifs_jnl_delete_inode - delete an inode.
826 * @c: UBIFS file-system description object
827 * @inode: inode to delete
828 *
829 * This function deletes inode @inode which includes removing it from orphans,
830 * deleting it from TNC and, in some cases, writing a deletion inode to the
831 * journal.
832 *
833 * When regular file inodes are unlinked or a directory inode is removed, the
834 * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
835 * direntry to the media, and adds the inode to orphans. After this, when the
836 * last reference to this inode has been dropped, this function is called. In
837 * general, it has to write one more deletion inode to the media, because if
838 * a commit happened between 'ubifs_jnl_update()' and
839 * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
840 * anymore, and in fact it might not be on the flash anymore, because it might
841 * have been garbage-collected already. And for optimization reasons UBIFS does
842 * not read the orphan area if it has been unmounted cleanly, so it would have
843 * no indication in the journal that there is a deleted inode which has to be
844 * removed from TNC.
845 *
846 * However, if there was no commit between 'ubifs_jnl_update()' and
847 * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
848 * inode to the media for the second time. And this is quite a typical case.
849 *
850 * This function returns zero in case of success and a negative error code in
851 * case of failure.
852 */
853int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
854{
855 int err;
856 struct ubifs_inode *ui = ubifs_inode(inode);
857
858 ubifs_assert(inode->i_nlink == 0);
859
860 if (ui->del_cmtno != c->cmt_no)
861 /* A commit happened for sure */
862 return ubifs_jnl_write_inode(c, inode);
863
864 down_read(&c->commit_sem);
865 /*
866 * Check commit number again, because the first test has been done
867 * without @c->commit_sem, so a commit might have happened.
868 */
869 if (ui->del_cmtno != c->cmt_no) {
870 up_read(&c->commit_sem);
871 return ubifs_jnl_write_inode(c, inode);
872 }
873
874 err = ubifs_tnc_remove_ino(c, inode->i_ino);
875 if (err)
876 ubifs_ro_mode(c, err);
877 else
878 ubifs_delete_orphan(c, inode->i_ino);
879 up_read(&c->commit_sem);
880 return err;
881}
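/*
 * A minimal stand-alone illustration of the "check, lock, re-check" pattern
 * used by 'ubifs_jnl_delete_inode()' above, with assumed names and pthreads
 * standing in for @c->commit_sem (the commit path would take the write lock
 * while advancing the commit number):
 */
#include <pthread.h>

static pthread_rwlock_t commit_sem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long long cmt_no;	/* advanced by the "commit" path */

static int write_deletion_inode(void) { return 0; }	/* slow path stub */
static int remove_from_index(void) { return 0; }	/* TNC removal stub */

int delete_inode_like(unsigned long long del_cmtno)
{
	int err;

	if (del_cmtno != cmt_no)		/* unlocked test */
		return write_deletion_inode();	/* a commit happened for sure */

	pthread_rwlock_rdlock(&commit_sem);
	if (del_cmtno != cmt_no) {		/* re-test under the lock */
		pthread_rwlock_unlock(&commit_sem);
		return write_deletion_inode();
	}
	err = remove_from_index();	/* no commit can run while we hold it */
	pthread_rwlock_unlock(&commit_sem);
	return err;
}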
882
883/**
831 * ubifs_jnl_rename - rename a directory entry. 884 * ubifs_jnl_rename - rename a directory entry.
832 * @c: UBIFS file-system description object 885 * @c: UBIFS file-system description object
833 * @old_dir: parent inode of directory entry to rename 886 * @old_dir: parent inode of directory entry to rename
@@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
917 970
918 p = (void *)dent2 + aligned_dlen2; 971 p = (void *)dent2 + aligned_dlen2;
919 if (new_inode) { 972 if (new_inode) {
920 pack_inode(c, p, new_inode, 0, last_reference); 973 pack_inode(c, p, new_inode, 0);
921 p += ALIGN(ilen, 8); 974 p += ALIGN(ilen, 8);
922 } 975 }
923 976
924 if (!move) 977 if (!move)
925 pack_inode(c, p, old_dir, 1, 0); 978 pack_inode(c, p, old_dir, 1);
926 else { 979 else {
927 pack_inode(c, p, old_dir, 0, 0); 980 pack_inode(c, p, old_dir, 0);
928 p += ALIGN(plen, 8); 981 p += ALIGN(plen, 8);
929 pack_inode(c, p, new_dir, 1, 0); 982 pack_inode(c, p, new_dir, 1);
930 } 983 }
931 984
932 if (last_reference) { 985 if (last_reference) {
@@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
935 release_head(c, BASEHD); 988 release_head(c, BASEHD);
936 goto out_finish; 989 goto out_finish;
937 } 990 }
991 new_ui->del_cmtno = c->cmt_no;
938 } 992 }
939 993
940 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); 994 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1131 if (err) 1185 if (err)
1132 goto out_free; 1186 goto out_free;
1133 1187
1134 pack_inode(c, ino, inode, 0, 0); 1188 pack_inode(c, ino, inode, 0);
1135 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); 1189 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
1136 if (dlen) 1190 if (dlen)
1137 ubifs_prep_grp_node(c, dn, dlen, 1); 1191 ubifs_prep_grp_node(c, dn, dlen, 1);
@@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1251 ubifs_prep_grp_node(c, xent, xlen, 0); 1305 ubifs_prep_grp_node(c, xent, xlen, 0);
1252 1306
1253 ino = (void *)xent + aligned_xlen; 1307 ino = (void *)xent + aligned_xlen;
1254 pack_inode(c, ino, inode, 0, 1); 1308 pack_inode(c, ino, inode, 0);
1255 ino = (void *)ino + UBIFS_INO_NODE_SZ; 1309 ino = (void *)ino + UBIFS_INO_NODE_SZ;
1256 pack_inode(c, ino, host, 1, 0); 1310 pack_inode(c, ino, host, 1);
1257 1311
1258 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); 1312 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
1259 if (!sync && !err) 1313 if (!sync && !err)
@@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1320 const struct inode *host) 1374 const struct inode *host)
1321{ 1375{
1322 int err, len1, len2, aligned_len, aligned_len1, lnum, offs; 1376 int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
1323 struct ubifs_inode *host_ui = ubifs_inode(inode); 1377 struct ubifs_inode *host_ui = ubifs_inode(host);
1324 struct ubifs_ino_node *ino; 1378 struct ubifs_ino_node *ino;
1325 union ubifs_key key; 1379 union ubifs_key key;
1326 int sync = IS_DIRSYNC(host); 1380 int sync = IS_DIRSYNC(host);
@@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1344 if (err) 1398 if (err)
1345 goto out_free; 1399 goto out_free;
1346 1400
1347 pack_inode(c, ino, host, 0, 0); 1401 pack_inode(c, ino, host, 0);
1348 pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); 1402 pack_inode(c, (void *)ino + aligned_len1, inode, 1);
1349 1403
1350 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); 1404 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
1351 if (!sync && !err) { 1405 if (!sync && !err) {
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36857b9ed59e..3e0aa7367556 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
317 return 0; 317 return 0;
318 318
319out_unlock: 319out_unlock:
320 if (err != -EAGAIN)
321 ubifs_ro_mode(c, err);
320 mutex_unlock(&c->log_mutex); 322 mutex_unlock(&c->log_mutex);
321 kfree(ref); 323 kfree(ref);
322 kfree(bud); 324 kfree(bud);
@@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
410 return -ENOMEM; 412 return -ENOMEM;
411 413
412 cs->ch.node_type = UBIFS_CS_NODE; 414 cs->ch.node_type = UBIFS_CS_NODE;
413 cs->cmt_no = cpu_to_le64(c->cmt_no + 1); 415 cs->cmt_no = cpu_to_le64(c->cmt_no);
414 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); 416 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
415 417
416 /* 418 /*
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4beccfc256d2..4c12a9215d7f 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
80} 80}
81 81
82/** 82/**
 83 * ubifs_ro_mode - switch UBIFS to read-only mode.
 84 * @c: UBIFS file-system description object
 85 * @err: error code which is the reason for switching to R/O mode
86 */
87static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
88{
89 if (!c->ro_media) {
90 c->ro_media = 1;
91 ubifs_warn("switched to read-only mode, error %d", err);
92 dbg_dump_stack();
93 }
94}
95
96/**
97 * ubifs_compr_present - check if compressor was compiled in. 83 * ubifs_compr_present - check if compressor was compiled in.
98 * @compr_type: compressor type to check 84 * @compr_type: compressor type to check
99 * 85 *
@@ -298,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
298} 284}
299 285
300/** 286/**
301 * ubifs_reported_space - calculate reported free space.
302 * @c: the UBIFS file-system description object
303 * @free: amount of free space
304 *
 305 * This function calculates the amount of free space which will be reported to
 306 * user-space. User-space applications tend to expect that if the file-system
 307 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
 308 * are able to write a file of size N. UBIFS attaches node headers to each data
 309 * node and it has to write indexing nodes as well. This introduces additional
 310 * overhead, so UBIFS has to report slightly less free space to meet the
 311 * above expectation.
312 *
313 * This function assumes free space is made up of uncompressed data nodes and
314 * full index nodes (one per data node, doubled because we always allow enough
315 * space to write the index twice).
316 *
317 * Note, the calculation is pessimistic, which means that most of the time
318 * UBIFS reports less space than it actually has.
319 */
320static inline long long ubifs_reported_space(const struct ubifs_info *c,
321 uint64_t free)
322{
323 int divisor, factor;
324
325 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
326 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
327 do_div(free, divisor);
328
329 return free * factor;
330}
331
332/**
333 * ubifs_current_time - round current time to time granularity. 287 * ubifs_current_time - round current time to time granularity.
334 * @inode: inode 288 * @inode: inode
335 */ 289 */
@@ -339,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
339 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 293 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
340} 294}
341 295
296/**
297 * ubifs_tnc_lookup - look up a file-system node.
298 * @c: UBIFS file-system description object
299 * @key: node key to lookup
300 * @node: the node is returned here
301 *
 302 * This function looks up and reads the node with key @key. The caller has to make
303 * sure the @node buffer is large enough to fit the node. Returns zero in case
304 * of success, %-ENOENT if the node was not found, and a negative error code in
305 * case of failure.
306 */
307static inline int ubifs_tnc_lookup(struct ubifs_info *c,
308 const union ubifs_key *key, void *node)
309{
310 return ubifs_tnc_locate(c, key, node, NULL, NULL);
311}
312
342#endif /* __UBIFS_MISC_H__ */ 313#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 3afeb9242c6a..02d3462f4d3e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
310 c->cmt_orphans -= cnt; 310 c->cmt_orphans -= cnt;
311 spin_unlock(&c->orphan_lock); 311 spin_unlock(&c->orphan_lock);
312 if (c->cmt_orphans) 312 if (c->cmt_orphans)
313 orph->cmt_no = cpu_to_le64(c->cmt_no + 1); 313 orph->cmt_no = cpu_to_le64(c->cmt_no);
314 else 314 else
315 /* Mark the last node of the commit */ 315 /* Mark the last node of the commit */
316 orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); 316 orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
317 ubifs_assert(c->ohead_offs + len <= c->leb_size); 317 ubifs_assert(c->ohead_offs + len <= c->leb_size);
318 ubifs_assert(c->ohead_lnum >= c->orph_first); 318 ubifs_assert(c->ohead_lnum >= c->orph_first);
319 ubifs_assert(c->ohead_lnum <= c->orph_last); 319 ubifs_assert(c->ohead_lnum <= c->orph_last);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ca1e2d4e03cc..9a9220333b3b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -30,7 +30,6 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#include <linux/random.h>
34#include <linux/kthread.h> 33#include <linux/kthread.h>
35#include <linux/parser.h> 34#include <linux/parser.h>
36#include <linux/seq_file.h> 35#include <linux/seq_file.h>
@@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
149 if (err) 148 if (err)
150 goto out_invalid; 149 goto out_invalid;
151 150
152 /* Disable readahead */ 151 /* Disable read-ahead */
153 inode->i_mapping->backing_dev_info = &c->bdi; 152 inode->i_mapping->backing_dev_info = &c->bdi;
154 153
155 switch (inode->i_mode & S_IFMT) { 154 switch (inode->i_mode & S_IFMT) {
@@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode)
278 */ 277 */
279static int ubifs_write_inode(struct inode *inode, int wait) 278static int ubifs_write_inode(struct inode *inode, int wait)
280{ 279{
281 int err; 280 int err = 0;
282 struct ubifs_info *c = inode->i_sb->s_fs_info; 281 struct ubifs_info *c = inode->i_sb->s_fs_info;
283 struct ubifs_inode *ui = ubifs_inode(inode); 282 struct ubifs_inode *ui = ubifs_inode(inode);
284 283
@@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait)
299 return 0; 298 return 0;
300 } 299 }
301 300
302 dbg_gen("inode %lu", inode->i_ino); 301 /*
303 err = ubifs_jnl_write_inode(c, inode, 0); 302 * As an optimization, do not write orphan inodes to the media,
304 if (err) 303 * because this is not needed.
305 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); 304 */
305 dbg_gen("inode %lu, mode %#x, nlink %u",
306 inode->i_ino, (int)inode->i_mode, inode->i_nlink);
307 if (inode->i_nlink) {
308 err = ubifs_jnl_write_inode(c, inode);
309 if (err)
310 ubifs_err("can't write inode %lu, error %d",
311 inode->i_ino, err);
312 }
306 313
307 ui->dirty = 0; 314 ui->dirty = 0;
308 mutex_unlock(&ui->ui_mutex); 315 mutex_unlock(&ui->ui_mutex);
@@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode)
314{ 321{
315 int err; 322 int err;
316 struct ubifs_info *c = inode->i_sb->s_fs_info; 323 struct ubifs_info *c = inode->i_sb->s_fs_info;
324 struct ubifs_inode *ui = ubifs_inode(inode);
317 325
318 if (ubifs_inode(inode)->xattr) 326 if (ui->xattr)
319 /* 327 /*
320 * Extended attribute inode deletions are fully handled in 328 * Extended attribute inode deletions are fully handled in
321 * 'ubifs_removexattr()'. These inodes are special and have 329 * 'ubifs_removexattr()'. These inodes are special and have
@@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode)
323 */ 331 */
324 goto out; 332 goto out;
325 333
326 dbg_gen("inode %lu", inode->i_ino); 334 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
327 ubifs_assert(!atomic_read(&inode->i_count)); 335 ubifs_assert(!atomic_read(&inode->i_count));
328 ubifs_assert(inode->i_nlink == 0); 336 ubifs_assert(inode->i_nlink == 0);
329 337
@@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode)
331 if (is_bad_inode(inode)) 339 if (is_bad_inode(inode))
332 goto out; 340 goto out;
333 341
334 ubifs_inode(inode)->ui_size = inode->i_size = 0; 342 ui->ui_size = inode->i_size = 0;
335 err = ubifs_jnl_write_inode(c, inode, 1); 343 err = ubifs_jnl_delete_inode(c, inode);
336 if (err) 344 if (err)
337 /* 345 /*
338 * Worst case we have a lost orphan inode wasting space, so a 346 * Worst case we have a lost orphan inode wasting space, so a
339 * simple error message is ok here. 347 * simple error message is OK here.
340 */ 348 */
341 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); 349 ubifs_err("can't delete inode %lu, error %d",
350 inode->i_ino, err);
351
342out: 352out:
353 if (ui->dirty)
354 ubifs_release_dirty_inode_budget(c, ui);
343 clear_inode(inode); 355 clear_inode(inode);
344} 356}
345 357
@@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
358{ 370{
359 struct ubifs_info *c = dentry->d_sb->s_fs_info; 371 struct ubifs_info *c = dentry->d_sb->s_fs_info;
360 unsigned long long free; 372 unsigned long long free;
373 __le32 *uuid = (__le32 *)c->uuid;
361 374
362 free = ubifs_budg_get_free_space(c); 375 free = ubifs_get_free_space(c);
363 dbg_gen("free space %lld bytes (%lld blocks)", 376 dbg_gen("free space %lld bytes (%lld blocks)",
364 free, free >> UBIFS_BLOCK_SHIFT); 377 free, free >> UBIFS_BLOCK_SHIFT);
365 378
@@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
374 buf->f_files = 0; 387 buf->f_files = 0;
375 buf->f_ffree = 0; 388 buf->f_ffree = 0;
376 buf->f_namelen = UBIFS_MAX_NLEN; 389 buf->f_namelen = UBIFS_MAX_NLEN;
377 390 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
391 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
378 return 0; 392 return 0;
379} 393}
380 394
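
The two added lines derive the 64-bit statfs f_fsid by folding the 16-byte volume UUID: little-endian word 0 is XOR-ed with word 2 and word 1 with word 3. A minimal user-space sketch of the same folding (the UUID bytes are made up, and le32_to_cpu() is dropped by assuming a little-endian host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* arbitrary example UUID, 16 bytes */
            uint8_t uuid[16] = {
                    0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
                    0x0f, 0xed, 0xcb, 0xa9, 0x87, 0x65, 0x43, 0x21,
            };
            uint32_t w[4];

            memcpy(w, uuid, sizeof(w));
            /* same folding as the hunk above: 128 bits down to 64 */
            printf("f_fsid = %08x:%08x\n",
                   (unsigned)(w[0] ^ w[2]), (unsigned)(w[1] ^ w[3]));
            return 0;
    }
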
@@ -518,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)
518 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 532 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
519 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 533 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
520 534
535 /*
536 * Calculate how many bytes would be wasted at the end of LEB if it was
537 * fully filled with data nodes of maximum size. This is used in
538 * calculations when reporting free space.
539 */
540 c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
521 return 0; 541 return 0;
522} 542}
523 543
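
For a sense of scale (all figures assumed, not taken from this diff): on common large-page NAND a LEB holds 126976 bytes, and a maximum-size data node is 4096 bytes of data plus a 48-byte header. The modulo above then leaves a tail that no data node can fill:

    #include <stdio.h>

    int main(void)
    {
            int leb_size = 126976;            /* assumed LEB size */
            int max_data_node_sz = 4096 + 48; /* assumed UBIFS_MAX_DATA_NODE_SZ */

            /* mirrors: c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ */
            printf("leb_overhead = %d bytes\n", leb_size % max_data_node_sz);
            return 0;
    }

which prints "leb_overhead = 2656 bytes" for these inputs.
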
@@ -635,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)
635 * internally because it does not make much sense for UBIFS, but it is 655 * internally because it does not make much sense for UBIFS, but it is
636 * necessary to report something for the 'statfs()' call. 656 * necessary to report something for the 'statfs()' call.
637 * 657 *
638 * Subtract the LEB reserved for GC and the LEB which is reserved for 658 * Subtract the LEB reserved for GC, the LEB which is reserved for
639 * deletions. 659 * deletions, and assume only one journal head is available.
640 *
641 * Review 'ubifs_calc_available()' if changing this calculation.
642 */ 660 */
643 tmp64 = c->main_lebs - 2; 661 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
644 tmp64 *= (uint64_t)c->leb_size - c->dark_wm; 662 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
645 tmp64 = ubifs_reported_space(c, tmp64); 663 tmp64 = ubifs_reported_space(c, tmp64);
646 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 664 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
647 665
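
Continuing with the same assumed figures, plus a guessed journal-head count of 3, the raw byte count fed into 'ubifs_reported_space()' (which trims it further for per-node overhead) would come out as:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int main_lebs = 100, jhead_cnt = 3;          /* assumed */
            int leb_size = 126976, leb_overhead = 2656;  /* assumed, see above */
            uint64_t tmp64 = (uint64_t)(main_lebs - 2 - jhead_cnt + 1)
                             * (uint64_t)(leb_size - leb_overhead);

            printf("%llu bytes, %llu 4KiB blocks\n",
                   (unsigned long long)tmp64,
                   (unsigned long long)(tmp64 >> 12)); /* UBIFS_BLOCK_SHIFT */
            return 0;
    }
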
@@ -830,7 +848,7 @@ enum {
830 Opt_err, 848 Opt_err,
831}; 849};
832 850
833static match_table_t tokens = { 851static const match_table_t tokens = {
834 {Opt_fast_unmount, "fast_unmount"}, 852 {Opt_fast_unmount, "fast_unmount"},
835 {Opt_norm_unmount, "norm_unmount"}, 853 {Opt_norm_unmount, "norm_unmount"},
836 {Opt_err, NULL}, 854 {Opt_err, NULL},
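
Const-ifying the table works because lib/parser.c only ever reads it. For context, the consuming side of such a table looks roughly like this (a sketch: 'options' and 'c' come from the surrounding mount code, and the unmount_mode encodings are assumptions):

    char *p;
    substring_t args[MAX_OPT_ARGS];

    while ((p = strsep(&options, ",")) != NULL) {
            if (!*p)
                    continue;
            switch (match_token(p, tokens, args)) {
            case Opt_fast_unmount:
                    c->mount_opts.unmount_mode = 2; /* assumed encoding */
                    break;
            case Opt_norm_unmount:
                    c->mount_opts.unmount_mode = 1; /* assumed encoding */
                    break;
            default:
                    return -EINVAL;
            }
    }
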
@@ -1006,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
1006 goto out_dereg; 1024 goto out_dereg;
1007 } 1025 }
1008 1026
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1009 if (!mounted_read_only) { 1028 if (!mounted_read_only) {
1010 err = alloc_wbufs(c); 1029 err = alloc_wbufs(c);
1011 if (err) 1030 if (err)
1012 goto out_cbuf; 1031 goto out_cbuf;
1013 1032
1014 /* Create background thread */ 1033 /* Create background thread */
1015 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
1016 c->vi.vol_id);
1017 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1034 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1018 if (!c->bgt) 1035 if (!c->bgt)
1019 c->bgt = ERR_PTR(-EINVAL); 1036 c->bgt = ERR_PTR(-EINVAL);
@@ -1122,8 +1139,8 @@ static int mount_ubifs(struct ubifs_info *c)
1122 if (err) 1139 if (err)
1123 goto out_infos; 1140 goto out_infos;
1124 1141
1125 ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, 1142 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1126 c->vi.vol_id); 1143 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1127 if (mounted_read_only) 1144 if (mounted_read_only)
1128 ubifs_msg("mounted read-only"); 1145 ubifs_msg("mounted read-only");
1129 x = (long long)c->main_lebs * c->leb_size; 1146 x = (long long)c->main_lebs * c->leb_size;
@@ -1469,6 +1486,7 @@ static void ubifs_put_super(struct super_block *sb)
1469 */ 1486 */
1470 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); 1487 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
1471 ubifs_assert(c->budg_idx_growth == 0); 1488 ubifs_assert(c->budg_idx_growth == 0);
1489 ubifs_assert(c->budg_dd_growth == 0);
1472 ubifs_assert(c->budg_data_growth == 0); 1490 ubifs_assert(c->budg_data_growth == 0);
1473 1491
1474 /* 1492 /*
@@ -1657,7 +1675,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1657 INIT_LIST_HEAD(&c->orph_new); 1675 INIT_LIST_HEAD(&c->orph_new);
1658 1676
1659 c->highest_inum = UBIFS_FIRST_INO; 1677 c->highest_inum = UBIFS_FIRST_INO;
1660 get_random_bytes(&c->vfs_gen, sizeof(int));
1661 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; 1678 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1662 1679
1663 ubi_get_volume_info(ubi, &c->vi); 1680 ubi_get_volume_info(ubi, &c->vi);
@@ -1671,10 +1688,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1671 } 1688 }
1672 1689
1673 /* 1690 /*
1674 * UBIFS provids 'backing_dev_info' in order to disable readahead. For 1691 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
1675 * UBIFS, I/O is not deferred, it is done immediately in readpage, 1692 * UBIFS, I/O is not deferred, it is done immediately in readpage,
1676 * which means the user would have to wait not just for their own I/O 1693 * which means the user would have to wait not just for their own I/O
1677 * but the readahead I/O as well i.e. completely pointless. 1694 * but the read-ahead I/O as well i.e. completely pointless.
1678 * 1695 *
1679 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1696 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1680 */ 1697 */
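
In kernels of this era the code following that comment plausibly continues along these lines (a sketch, not part of the hunk; the capability flag, unplug hook, and error label are assumptions):

    c->bdi.capabilities = BDI_CAP_MAP_COPY;     /* RAM-backed, no writeback */
    c->bdi.unplug_io_fn = default_unplug_io_fn;
    err = bdi_init(&c->bdi);  /* leaves ra_pages at 0: read-ahead disabled */
    if (err)
            goto out_close;   /* assumed unwind label */
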
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..7634c5970887 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506 if (keys_cmp(c, key, &node_key) != 0) 506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0; 507 ret = 0;
508 } 508 }
509 if (ret == 0) 509 if (ret == 0 && c->replaying)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret; 512 return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1382} 1382}
1383 1383
1384/** 1384/**
1385 * ubifs_tnc_lookup - look up a file-system node. 1385 * maybe_leb_gced - determine if a LEB may have been garbage collected.
1386 * @c: UBIFS file-system description object 1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup 1387 * @lnum: LEB number
1388 * @node: the node is returned here 1388 * @gc_seq1: garbage collection sequence number
1389 * 1389 *
1390 * This function look up and reads node with key @key. The caller has to make 1390 * This function determines if @lnum may have been garbage collected since
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case 1391 * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
1392 * of success, %-ENOENT if the node was not found, and a negative error code in 1392 * %0 is returned.
1393 * case of failure.
1394 */ 1393 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, 1394static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1396 void *node)
1397{ 1395{
1398 int found, n, err; 1396 int gc_seq2, gced_lnum;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401 1397
1402 mutex_lock(&c->tnc_mutex); 1398 gced_lnum = c->gced_lnum;
1403 found = ubifs_lookup_level0(c, key, &znode, &n); 1399 smp_rmb();
1404 if (!found) { 1400 gc_seq2 = c->gc_seq;
1405 err = -ENOENT; 1401 /* Same seq means no GC */
1406 goto out; 1402 if (gc_seq1 == gc_seq2)
1407 } else if (found < 0) { 1403 return 0;
1408 err = found; 1404 /* Different by more than 1 means we don't know */
1409 goto out; 1405 if (gc_seq1 + 1 != gc_seq2)
1410 } 1406 return 1;
1411 zt = &znode->zbranch[n]; 1407 /*
1412 if (is_hash_key(c, key)) { 1408 * We have seen the sequence number has increased by 1. Now we need to
1413 /* 1409 * be sure we read the right LEB number, so read it again.
1414 * In this case the leaf node cache gets used, so we pass the 1410 */
1415 * address of the zbranch and keep the mutex locked 1411 smp_rmb();
1416 */ 1412 if (gced_lnum != c->gced_lnum)
1417 err = tnc_read_node_nm(c, zt, node); 1413 return 1;
1418 goto out; 1414 /* Finally we can check lnum */
1419 } 1415 if (gced_lnum == lnum)
1420 zbr = znode->zbranch[n]; 1416 return 1;
1421 mutex_unlock(&c->tnc_mutex); 1417 return 0;
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429} 1418}
1430 1419
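
The double read of @c->gced_lnum around the sequence check only works if the GC side publishes in the opposite order: LEB number first, write barrier, then the sequence increment. A runnable user-space analogue of that pairing, with C11 fences standing in for smp_wmb()/smp_rmb() (all names illustrative, not the real UBIFS GC code):

    #include <stdatomic.h>
    #include <stdio.h>

    static int gced_lnum;      /* last LEB moved by "GC" */
    static atomic_int gc_seq;  /* bumped after every move */

    static void gc_moved_leb(int lnum)
    {
            gced_lnum = lnum;
            atomic_thread_fence(memory_order_release); /* ~ smp_wmb() */
            atomic_fetch_add(&gc_seq, 1);
    }

    static int maybe_gced(int lnum, int seq1)
    {
            int lnum1 = gced_lnum;
            atomic_thread_fence(memory_order_acquire); /* ~ smp_rmb() */
            int seq2 = atomic_load(&gc_seq);

            if (seq1 == seq2)
                    return 0;  /* nothing was collected meanwhile */
            if (seq1 + 1 != seq2)
                    return 1;  /* several moves: assume the worst */
            atomic_thread_fence(memory_order_acquire);
            if (lnum1 != gced_lnum)
                    return 1;  /* moved again while we were looking */
            return lnum1 == lnum;
    }

    int main(void)
    {
            int seq1 = atomic_load(&gc_seq);

            gc_moved_leb(7);
            printf("%d %d\n", maybe_gced(7, seq1), maybe_gced(8, seq1));
            return 0;
    }
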
1431/** 1420/**
@@ -1436,16 +1425,19 @@ out:
1436 * @lnum: LEB number is returned here 1425 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here 1426 * @offs: offset is returned here
1438 * 1427 *
 1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node 1428 * This function looks up and reads the node with key @key. The caller has to make
1440 * location also. See 'ubifs_tnc_lookup()'. 1429 * sure the @node buffer is large enough to fit the node. Returns zero in case
1430 * of success, %-ENOENT if the node was not found, and a negative error code in
1431 * case of failure. The node location can be returned in @lnum and @offs.
1441 */ 1432 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1433int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs) 1434 void *node, int *lnum, int *offs)
1444{ 1435{
1445 int found, n, err; 1436 int found, n, err, safely = 0, gc_seq1;
1446 struct ubifs_znode *znode; 1437 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt; 1438 struct ubifs_zbranch zbr, *zt;
1448 1439
1440again:
1449 mutex_lock(&c->tnc_mutex); 1441 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n); 1442 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) { 1443 if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1456 goto out; 1448 goto out;
1457 } 1449 }
1458 zt = &znode->zbranch[n]; 1450 zt = &znode->zbranch[n];
1451 if (lnum) {
1452 *lnum = zt->lnum;
1453 *offs = zt->offs;
1454 }
1459 if (is_hash_key(c, key)) { 1455 if (is_hash_key(c, key)) {
1460 /* 1456 /*
1461 * In this case the leaf node cache gets used, so we pass the 1457 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked 1458 * address of the zbranch and keep the mutex locked
1463 */ 1459 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node); 1460 err = tnc_read_node_nm(c, zt, node);
1467 goto out; 1461 goto out;
1468 } 1462 }
1463 if (safely) {
1464 err = ubifs_tnc_read_node(c, zt, node);
1465 goto out;
1466 }
1467 /* Drop the TNC mutex prematurely and race with garbage collection */
1469 zbr = znode->zbranch[n]; 1468 zbr = znode->zbranch[n];
1469 gc_seq1 = c->gc_seq;
1470 mutex_unlock(&c->tnc_mutex); 1470 mutex_unlock(&c->tnc_mutex);
1471 1471
1472 *lnum = zbr.lnum; 1472 if (ubifs_get_wbuf(c, zbr.lnum)) {
1473 *offs = zbr.offs; 1473 /* We do not GC journal heads */
1474 err = ubifs_tnc_read_node(c, &zbr, node);
1475 return err;
1476 }
1474 1477
1475 err = ubifs_tnc_read_node(c, &zbr, node); 1478 err = fallible_read_node(c, key, &zbr, node);
1476 return err; 1479 if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
1480 /*
1481 * The node may have been GC'ed out from under us so try again
1482 * while keeping the TNC mutex locked.
1483 */
1484 safely = 1;
1485 goto again;
1486 }
1487 return 0;
1477 1488
1478out: 1489out:
1479 mutex_unlock(&c->tnc_mutex); 1490 mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1498{ 1509{
1499 int found, n, err; 1510 int found, n, err;
1500 struct ubifs_znode *znode; 1511 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502 1512
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1513 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex); 1514 mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1522 goto out_unlock; 1532 goto out_unlock;
1523 } 1533 }
1524 1534
1525 zbr = znode->zbranch[n]; 1535 err = tnc_read_node_nm(c, &znode->zbranch[n], node);
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530 1536
1531out_unlock: 1537out_unlock:
1532 mutex_unlock(&c->tnc_mutex); 1538 mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8117e65ba2e9..8ac76b1c2d55 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
372 written = layout_leb_in_gaps(c, p); 372 written = layout_leb_in_gaps(c, p);
373 if (written < 0) { 373 if (written < 0) {
374 err = written; 374 err = written;
375 if (err == -ENOSPC) { 375 if (err != -ENOSPC) {
376 if (!dbg_force_in_the_gaps_enabled) { 376 kfree(c->gap_lebs);
377 /* 377 c->gap_lebs = NULL;
378 * Do not print scary warnings if the 378 return err;
379 * debugging option which forces
380 * in-the-gaps is enabled.
381 */
382 ubifs_err("out of space");
383 spin_lock(&c->space_lock);
384 dbg_dump_budg(c);
385 spin_unlock(&c->space_lock);
386 dbg_dump_lprops(c);
387 }
388 /* Try to commit anyway */
389 err = 0;
390 break;
391 } 379 }
392 kfree(c->gap_lebs); 380 if (!dbg_force_in_the_gaps_enabled) {
393 c->gap_lebs = NULL; 381 /*
394 return err; 382 * Do not print scary warnings if the debugging
383 * option which forces in-the-gaps is enabled.
384 */
385 ubifs_err("out of space");
386 spin_lock(&c->space_lock);
387 dbg_dump_budg(c);
388 spin_unlock(&c->space_lock);
389 dbg_dump_lprops(c);
390 }
391 /* Try to commit anyway */
392 err = 0;
393 break;
395 } 394 }
396 p++; 395 p++;
397 cnt -= written; 396 cnt -= written;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0cc7da9bed47..a9ecbd9af20d 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
87#define UBIFS_SK_LEN 8 87#define UBIFS_SK_LEN 8
88 88
89/* Minimum index tree fanout */ 89/* Minimum index tree fanout */
90#define UBIFS_MIN_FANOUT 2 90#define UBIFS_MIN_FANOUT 3
91 91
92/* Maximum number of levels in UBIFS indexing B-tree */ 92/* Maximum number of levels in UBIFS indexing B-tree */
93#define UBIFS_MAX_LEVELS 512 93#define UBIFS_MAX_LEVELS 512
@@ -228,10 +228,10 @@ enum {
228/* Minimum number of orphan area logical eraseblocks */ 228/* Minimum number of orphan area logical eraseblocks */
229#define UBIFS_MIN_ORPH_LEBS 1 229#define UBIFS_MIN_ORPH_LEBS 1
230/* 230/*
231 * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 231 * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
232 * for GC, 1 for deletions, and at least 1 for committed data). 232 * for GC, 1 for deletions, and at least 1 for committed data).
233 */ 233 */
234#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) 234#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
235 235
236/* Minimum number of logical eraseblocks */ 236/* Minimum number of logical eraseblocks */
237#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ 237#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e4f89f271827..17c620b93eec 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -20,8 +20,6 @@
20 * Adrian Hunter 20 * Adrian Hunter
21 */ 21 */
22 22
23/* Implementation version 0.7 */
24
25#ifndef __UBIFS_H__ 23#ifndef __UBIFS_H__
26#define __UBIFS_H__ 24#define __UBIFS_H__
27 25
@@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb {
322 * struct ubifs_inode - UBIFS in-memory inode description. 320 * struct ubifs_inode - UBIFS in-memory inode description.
323 * @vfs_inode: VFS inode description object 321 * @vfs_inode: VFS inode description object
324 * @creat_sqnum: sequence number at time of creation 322 * @creat_sqnum: sequence number at time of creation
323 * @del_cmtno: commit number corresponding to the time the inode was deleted,
324 * protected by @c->commit_sem;
325 * @xattr_size: summarized size of all extended attributes in bytes 325 * @xattr_size: summarized size of all extended attributes in bytes
326 * @xattr_cnt: count of extended attributes this inode has 326 * @xattr_cnt: count of extended attributes this inode has
327 * @xattr_names: sum of lengths of all extended attribute names belonging to 327 * @xattr_names: sum of lengths of all extended attribute names belonging to
@@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb {
373struct ubifs_inode { 373struct ubifs_inode {
374 struct inode vfs_inode; 374 struct inode vfs_inode;
375 unsigned long long creat_sqnum; 375 unsigned long long creat_sqnum;
376 unsigned long long del_cmtno;
376 unsigned int xattr_size; 377 unsigned int xattr_size;
377 unsigned int xattr_cnt; 378 unsigned int xattr_cnt;
378 unsigned int xattr_names; 379 unsigned int xattr_names;
@@ -779,7 +780,7 @@ struct ubifs_compressor {
779/** 780/**
780 * struct ubifs_budget_req - budget requirements of an operation. 781 * struct ubifs_budget_req - budget requirements of an operation.
781 * 782 *
782 * @fast: non-zero if the budgeting should try to aquire budget quickly and 783 * @fast: non-zero if the budgeting should try to acquire budget quickly and
783 * should not try to call write-back 784 * should not try to call write-back
784 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields 785 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
785 * have to be re-calculated 786 * have to be re-calculated
@@ -805,21 +806,31 @@ struct ubifs_compressor {
 805 * An inode may contain 4KiB of data at max., thus the width of @new_ino_d 806 * An inode may contain 4KiB of data at max., thus the width of @new_ino_d
806 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made 807 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
807 * dirty by the re-name operation. 808 * dirty by the re-name operation.
809 *
 810 * Note, UBIFS aligns node lengths to an 8-byte boundary, so the requester has to
 811 * make sure the amount of inode data that contributes to the @new_ino_d and
 812 * @dirtied_ino_d fields is aligned.
808 */ 813 */
809struct ubifs_budget_req { 814struct ubifs_budget_req {
810 unsigned int fast:1; 815 unsigned int fast:1;
811 unsigned int recalculate:1; 816 unsigned int recalculate:1;
817#ifndef UBIFS_DEBUG
812 unsigned int new_page:1; 818 unsigned int new_page:1;
813 unsigned int dirtied_page:1; 819 unsigned int dirtied_page:1;
814 unsigned int new_dent:1; 820 unsigned int new_dent:1;
815 unsigned int mod_dent:1; 821 unsigned int mod_dent:1;
816 unsigned int new_ino:1; 822 unsigned int new_ino:1;
817 unsigned int new_ino_d:13; 823 unsigned int new_ino_d:13;
818#ifndef UBIFS_DEBUG
819 unsigned int dirtied_ino:4; 824 unsigned int dirtied_ino:4;
820 unsigned int dirtied_ino_d:15; 825 unsigned int dirtied_ino_d:15;
821#else 826#else
822 /* Not bit-fields to check for overflows */ 827 /* Not bit-fields to check for overflows */
828 unsigned int new_page;
829 unsigned int dirtied_page;
830 unsigned int new_dent;
831 unsigned int mod_dent;
832 unsigned int new_ino;
833 unsigned int new_ino_d;
823 unsigned int dirtied_ino; 834 unsigned int dirtied_ino;
824 unsigned int dirtied_ino_d; 835 unsigned int dirtied_ino_d;
825#endif 836#endif
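
The new alignment note refers to the usual kernel ALIGN() round-up to a power-of-two boundary; standalone, the values a requester would budget look like this:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            /* e.g. a 13-byte xattr value is budgeted as 16 bytes of inode data */
            printf("%d %d %d\n", ALIGN(13, 8), ALIGN(16, 8), ALIGN(1, 8));
            return 0; /* prints: 16 16 8 */
    }
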
@@ -860,13 +871,13 @@ struct ubifs_mount_opts {
860 * struct ubifs_info - UBIFS file-system description data structure 871 * struct ubifs_info - UBIFS file-system description data structure
861 * (per-superblock). 872 * (per-superblock).
862 * @vfs_sb: VFS @struct super_block object 873 * @vfs_sb: VFS @struct super_block object
863 * @bdi: backing device info object to make VFS happy and disable readahead 874 * @bdi: backing device info object to make VFS happy and disable read-ahead
864 * 875 *
865 * @highest_inum: highest used inode number 876 * @highest_inum: highest used inode number
866 * @vfs_gen: VFS inode generation counter
867 * @max_sqnum: current global sequence number 877 * @max_sqnum: current global sequence number
868 * @cmt_no: commit number (last successfully completed commit) 878 * @cmt_no: commit number of the last successfully completed commit, protected
869 * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters 879 * by @commit_sem
880 * @cnt_lock: protects @highest_inum and @max_sqnum counters
870 * @fmt_version: UBIFS on-flash format version 881 * @fmt_version: UBIFS on-flash format version
871 * @uuid: UUID from super block 882 * @uuid: UUID from super block
872 * 883 *
@@ -984,6 +995,9 @@ struct ubifs_mount_opts {
984 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
985 * @max_inode_sz: maximum possible inode size in bytes 996 * @max_inode_sz: maximum possible inode size in bytes
986 * @max_znode_sz: size of znode in bytes 997 * @max_znode_sz: size of znode in bytes
998 *
999 * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
1000 * data nodes of maximum size - used in free space reporting
987 * @dead_wm: LEB dead space watermark 1001 * @dead_wm: LEB dead space watermark
988 * @dark_wm: LEB dark space watermark 1002 * @dark_wm: LEB dark space watermark
989 * @block_cnt: count of 4KiB blocks on the FS 1003 * @block_cnt: count of 4KiB blocks on the FS
@@ -1017,6 +1031,8 @@ struct ubifs_mount_opts {
1017 * @sbuf: a buffer of LEB size used by GC and replay for scanning 1031 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1018 * @idx_gc: list of index LEBs that have been garbage collected 1032 * @idx_gc: list of index LEBs that have been garbage collected
1019 * @idx_gc_cnt: number of elements on the idx_gc list 1033 * @idx_gc_cnt: number of elements on the idx_gc list
1034 * @gc_seq: incremented for every non-index LEB garbage collected
1035 * @gced_lnum: last non-index LEB that was garbage collected
1020 * 1036 *
1021 * @infos_list: links all 'ubifs_info' objects 1037 * @infos_list: links all 'ubifs_info' objects
1022 * @umount_mutex: serializes shrinker and un-mount 1038 * @umount_mutex: serializes shrinker and un-mount
@@ -1103,7 +1119,6 @@ struct ubifs_info {
1103 struct backing_dev_info bdi; 1119 struct backing_dev_info bdi;
1104 1120
1105 ino_t highest_inum; 1121 ino_t highest_inum;
1106 unsigned int vfs_gen;
1107 unsigned long long max_sqnum; 1122 unsigned long long max_sqnum;
1108 unsigned long long cmt_no; 1123 unsigned long long cmt_no;
1109 spinlock_t cnt_lock; 1124 spinlock_t cnt_lock;
@@ -1214,6 +1229,8 @@ struct ubifs_info {
1214 int max_idx_node_sz; 1229 int max_idx_node_sz;
1215 long long max_inode_sz; 1230 long long max_inode_sz;
1216 int max_znode_sz; 1231 int max_znode_sz;
1232
1233 int leb_overhead;
1217 int dead_wm; 1234 int dead_wm;
1218 int dark_wm; 1235 int dark_wm;
1219 int block_cnt; 1236 int block_cnt;
@@ -1247,6 +1264,8 @@ struct ubifs_info {
1247 void *sbuf; 1264 void *sbuf;
1248 struct list_head idx_gc; 1265 struct list_head idx_gc;
1249 int idx_gc_cnt; 1266 int idx_gc_cnt;
1267 volatile int gc_seq;
1268 volatile int gced_lnum;
1250 1269
1251 struct list_head infos_list; 1270 struct list_head infos_list;
1252 struct mutex umount_mutex; 1271 struct mutex umount_mutex;
@@ -1346,6 +1365,7 @@ extern struct backing_dev_info ubifs_backing_dev_info;
1346extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1365extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1347 1366
1348/* io.c */ 1367/* io.c */
1368void ubifs_ro_mode(struct ubifs_info *c, int err);
1349int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); 1369int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
1350int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, 1370int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
1351 int dtype); 1371 int dtype);
@@ -1399,8 +1419,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
1399 int deletion, int xent); 1419 int deletion, int xent);
1400int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, 1420int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
1401 const union ubifs_key *key, const void *buf, int len); 1421 const union ubifs_key *key, const void *buf, int len);
1402int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, 1422int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
1403 int last_reference); 1423int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
1404int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, 1424int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
1405 const struct dentry *old_dentry, 1425 const struct dentry *old_dentry,
1406 const struct inode *new_dir, 1426 const struct inode *new_dir,
@@ -1423,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1423 struct ubifs_budget_req *req); 1443 struct ubifs_budget_req *req);
1424void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1444void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1425 struct ubifs_budget_req *req); 1445 struct ubifs_budget_req *req);
1426long long ubifs_budg_get_free_space(struct ubifs_info *c); 1446long long ubifs_get_free_space(struct ubifs_info *c);
1427int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1447int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1428void ubifs_convert_page_budget(struct ubifs_info *c); 1448void ubifs_convert_page_budget(struct ubifs_info *c);
1449long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
1429long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1450long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1430 1451
1431/* find.c */ 1452/* find.c */
@@ -1440,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1440/* tnc.c */ 1461/* tnc.c */
1441int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, 1462int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1442 struct ubifs_znode **zn, int *n); 1463 struct ubifs_znode **zn, int *n);
1443int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1444 void *node);
1445int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, 1464int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1446 void *node, const struct qstr *nm); 1465 void *node, const struct qstr *nm);
1447int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1466int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 1388a078e1a9..649bec78b645 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -61,7 +61,7 @@
61 61
62/* 62/*
63 * Limit the number of extended attributes per inode so that the total size 63 * Limit the number of extended attributes per inode so that the total size
 64 * (xattr_size) is guaranteed to fit in an 'unsigned int'. 64 * (@xattr_size) is guaranteed to fit in an 'unsigned int'.
65 */ 65 */
66#define MAX_XATTRS_PER_INODE 65535 66#define MAX_XATTRS_PER_INODE 65535
67 67
@@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
103 struct inode *inode; 103 struct inode *inode;
104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host); 104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
106 .new_ino_d = size, .dirtied_ino = 1, 106 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
107 .dirtied_ino_d = host_ui->data_len}; 107 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
108 108
109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) 109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
110 return -ENOSPC; 110 return -ENOSPC;
111 /* 111 /*
112 * Linux limits the maximum size of the extended attribute names list 112 * Linux limits the maximum size of the extended attribute names list
113 * to %XATTR_LIST_MAX. This means we should not allow creating more* 113 * to %XATTR_LIST_MAX. This means we should not allow creating more
114 * extended attributes if the name list becomes larger. This limitation 114 * extended attributes if the name list becomes larger. This limitation
115 * is artificial for UBIFS, though. 115 * is artificial for UBIFS, though.
116 */ 116 */
@@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
128 goto out_budg; 128 goto out_budg;
129 } 129 }
130 130
131 mutex_lock(&host_ui->ui_mutex);
132 /* Re-define all operations to be "nothing" */ 131 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations; 132 inode->i_mapping->a_ops = &none_address_operations;
134 inode->i_op = &none_inode_operations; 133 inode->i_op = &none_inode_operations;
@@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
141 ui->data = kmalloc(size, GFP_NOFS); 140 ui->data = kmalloc(size, GFP_NOFS);
142 if (!ui->data) { 141 if (!ui->data) {
143 err = -ENOMEM; 142 err = -ENOMEM;
144 goto out_unlock; 143 goto out_free;
145 } 144 }
146
147 memcpy(ui->data, value, size); 145 memcpy(ui->data, value, size);
146 inode->i_size = ui->ui_size = size;
147 ui->data_len = size;
148
149 mutex_lock(&host_ui->ui_mutex);
148 host->i_ctime = ubifs_current_time(host); 150 host->i_ctime = ubifs_current_time(host);
149 host_ui->xattr_cnt += 1; 151 host_ui->xattr_cnt += 1;
150 host_ui->xattr_size += CALC_DENT_SIZE(nm->len); 152 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
151 host_ui->xattr_size += CALC_XATTR_BYTES(size); 153 host_ui->xattr_size += CALC_XATTR_BYTES(size);
152 host_ui->xattr_names += nm->len; 154 host_ui->xattr_names += nm->len;
153 155
154 /*
155 * We do not use i_size_write() because nobody can race with us as we
156 * are holding host @host->i_mutex - every xattr operation for this
157 * inode is serialized by it.
158 */
159 inode->i_size = ui->ui_size = size;
160 ui->data_len = size;
161 err = ubifs_jnl_update(c, host, nm, inode, 0, 1); 156 err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
162 if (err) 157 if (err)
163 goto out_cancel; 158 goto out_cancel;
@@ -172,8 +167,8 @@ out_cancel:
172 host_ui->xattr_cnt -= 1; 167 host_ui->xattr_cnt -= 1;
173 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); 168 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
174 host_ui->xattr_size -= CALC_XATTR_BYTES(size); 169 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
175out_unlock:
176 mutex_unlock(&host_ui->ui_mutex); 170 mutex_unlock(&host_ui->ui_mutex);
171out_free:
177 make_bad_inode(inode); 172 make_bad_inode(inode);
178 iput(inode); 173 iput(inode);
179out_budg: 174out_budg:
@@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
200 struct ubifs_inode *host_ui = ubifs_inode(host); 195 struct ubifs_inode *host_ui = ubifs_inode(host);
201 struct ubifs_inode *ui = ubifs_inode(inode); 196 struct ubifs_inode *ui = ubifs_inode(inode);
202 struct ubifs_budget_req req = { .dirtied_ino = 2, 197 struct ubifs_budget_req req = { .dirtied_ino = 2,
203 .dirtied_ino_d = size + host_ui->data_len }; 198 .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
204 199
205 ubifs_assert(ui->data_len == inode->i_size); 200 ubifs_assert(ui->data_len == inode->i_size);
206 err = ubifs_budget_space(c, &req); 201 err = ubifs_budget_space(c, &req);
207 if (err) 202 if (err)
208 return err; 203 return err;
209 204
210 mutex_lock(&host_ui->ui_mutex);
211 host->i_ctime = ubifs_current_time(host);
212 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
213 host_ui->xattr_size += CALC_XATTR_BYTES(size);
214
215 kfree(ui->data); 205 kfree(ui->data);
216 ui->data = kmalloc(size, GFP_NOFS); 206 ui->data = kmalloc(size, GFP_NOFS);
217 if (!ui->data) { 207 if (!ui->data) {
218 err = -ENOMEM; 208 err = -ENOMEM;
219 goto out_unlock; 209 goto out_free;
220 } 210 }
221
222 memcpy(ui->data, value, size); 211 memcpy(ui->data, value, size);
223 inode->i_size = ui->ui_size = size; 212 inode->i_size = ui->ui_size = size;
224 ui->data_len = size; 213 ui->data_len = size;
225 214
215 mutex_lock(&host_ui->ui_mutex);
216 host->i_ctime = ubifs_current_time(host);
217 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
218 host_ui->xattr_size += CALC_XATTR_BYTES(size);
219
226 /* 220 /*
227 * It is important to write the host inode after the xattr inode 221 * It is important to write the host inode after the xattr inode
228 * because if the host inode gets synchronized (via 'fsync()'), then 222 * because if the host inode gets synchronized (via 'fsync()'), then
@@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
240out_cancel: 234out_cancel:
241 host_ui->xattr_size -= CALC_XATTR_BYTES(size); 235 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
242 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); 236 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
243 make_bad_inode(inode);
244out_unlock:
245 mutex_unlock(&host_ui->ui_mutex); 237 mutex_unlock(&host_ui->ui_mutex);
238 make_bad_inode(inode);
239out_free:
246 ubifs_release_budget(c, &req); 240 ubifs_release_budget(c, &req);
247 return err; 241 return err;
248} 242}
@@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
312 306
313 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, 307 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
314 host->i_ino, dentry->d_name.len, dentry->d_name.name, size); 308 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
309 ubifs_assert(mutex_is_locked(&host->i_mutex));
315 310
316 if (size > UBIFS_MAX_INO_DATA) 311 if (size > UBIFS_MAX_INO_DATA)
317 return -ERANGE; 312 return -ERANGE;
@@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
384 if (!xent) 379 if (!xent)
385 return -ENOMEM; 380 return -ENOMEM;
386 381
387 mutex_lock(&host->i_mutex);
388 xent_key_init(c, &key, host->i_ino, &nm); 382 xent_key_init(c, &key, host->i_ino, &nm);
389 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); 383 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
390 if (err) { 384 if (err) {
@@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
419out_iput: 413out_iput:
420 iput(inode); 414 iput(inode);
421out_unlock: 415out_unlock:
422 mutex_unlock(&host->i_mutex);
423 kfree(xent); 416 kfree(xent);
424 return err; 417 return err;
425} 418}
@@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
449 return -ERANGE; 442 return -ERANGE;
450 443
451 lowest_xent_key(c, &key, host->i_ino); 444 lowest_xent_key(c, &key, host->i_ino);
452
453 mutex_lock(&host->i_mutex);
454 while (1) { 445 while (1) {
455 int type; 446 int type;
456 447
@@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
479 pxent = xent; 470 pxent = xent;
480 key_read(c, &xent->key, &key); 471 key_read(c, &xent->key, &key);
481 } 472 }
482 mutex_unlock(&host->i_mutex);
483 473
484 kfree(pxent); 474 kfree(pxent);
485 if (err != -ENOENT) { 475 if (err != -ENOENT) {
@@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
497 int err; 487 int err;
498 struct ubifs_inode *host_ui = ubifs_inode(host); 488 struct ubifs_inode *host_ui = ubifs_inode(host);
499 struct ubifs_inode *ui = ubifs_inode(inode); 489 struct ubifs_inode *ui = ubifs_inode(inode);
500 struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, 490 struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
501 .dirtied_ino_d = host_ui->data_len }; 491 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
502 492
503 ubifs_assert(ui->data_len == inode->i_size); 493 ubifs_assert(ui->data_len == inode->i_size);
504 494
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = udf_fsync_file,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek,
214}; 215};
215 216
216const struct inode_operations udf_file_inode_operations = { 217const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
76 *err = -ENOSPC; 76 *err = -ENOSPC;
77 77
78 iinfo = UDF_I(inode); 78 iinfo = UDF_I(inode);
79 iinfo->i_unique = 0; 79 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
80 iinfo->i_lenExtents = 0; 80 iinfo->i_efe = 1;
81 iinfo->i_next_alloc_block = 0; 81 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
82 iinfo->i_next_alloc_goal = 0; 82 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
83 iinfo->i_strat4096 = 0; 83 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
84 sizeof(struct extendedFileEntry),
85 GFP_KERNEL);
86 } else {
87 iinfo->i_efe = 0;
88 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
89 sizeof(struct fileEntry),
90 GFP_KERNEL);
91 }
92 if (!iinfo->i_ext.i_data) {
93 iput(inode);
94 *err = -ENOMEM;
95 return NULL;
96 }
84 97
85 block = udf_new_block(dir->i_sb, NULL, 98 block = udf_new_block(dir->i_sb, NULL,
86 dinfo->i_location.partitionReferenceNum, 99 dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
111 lvhd->uniqueID = cpu_to_le64(uniqueID); 124 lvhd->uniqueID = cpu_to_le64(uniqueID);
112 mark_buffer_dirty(sbi->s_lvid_bh); 125 mark_buffer_dirty(sbi->s_lvid_bh);
113 } 126 }
127 mutex_unlock(&sbi->s_alloc_mutex);
114 inode->i_mode = mode; 128 inode->i_mode = mode;
115 inode->i_uid = current->fsuid; 129 inode->i_uid = current->fsuid;
116 if (dir->i_mode & S_ISGID) { 130 if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
129 iinfo->i_lenEAttr = 0; 143 iinfo->i_lenEAttr = 0;
130 iinfo->i_lenAlloc = 0; 144 iinfo->i_lenAlloc = 0;
131 iinfo->i_use = 0; 145 iinfo->i_use = 0;
132 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
133 iinfo->i_efe = 1;
134 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
135 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
136 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
137 sizeof(struct extendedFileEntry),
138 GFP_KERNEL);
139 } else {
140 iinfo->i_efe = 0;
141 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
142 sizeof(struct fileEntry),
143 GFP_KERNEL);
144 }
145 if (!iinfo->i_ext.i_data) {
146 iput(inode);
147 *err = -ENOMEM;
148 mutex_unlock(&sbi->s_alloc_mutex);
149 return NULL;
150 }
151 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
152 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 147 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
153 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
158 iinfo->i_crtime = current_fs_time(inode->i_sb); 153 iinfo->i_crtime = current_fs_time(inode->i_sb);
159 insert_inode_hash(inode); 154 insert_inode_hash(inode);
160 mark_inode_dirty(inode); 155 mark_inode_dirty(inode);
161 mutex_unlock(&sbi->s_alloc_mutex);
162 156
163 if (DQUOT_ALLOC_INODE(inode)) { 157 if (DQUOT_ALLOC_INODE(inode)) {
164 DQUOT_DROP(inode); 158 DQUOT_DROP(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5698bbf83bbf..e25e7010627b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -369,7 +369,7 @@ enum {
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
370}; 370};
371 371
372static match_table_t tokens = { 372static const match_table_t tokens = {
373 {Opt_novrs, "novrs"}, 373 {Opt_novrs, "novrs"},
374 {Opt_nostrict, "nostrict"}, 374 {Opt_nostrict, "nostrict"},
375 {Opt_bs, "bs=%u"}, 375 {Opt_bs, "bs=%u"},
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3141969b456d..e65212dfb60e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -309,7 +309,7 @@ enum {
309 Opt_err 309 Opt_err
310}; 310};
311 311
312static match_table_t tokens = { 312static const match_table_t tokens = {
313 {Opt_type_old, "ufstype=old"}, 313 {Opt_type_old, "ufstype=old"},
314 {Opt_type_sunx86, "ufstype=sunx86"}, 314 {Opt_type_sunx86, "ufstype=sunx86"},
315 {Opt_type_sun, "ufstype=sun"}, 315 {Opt_type_sun, "ufstype=sun"},
@@ -1233,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
1233{ 1233{
1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); 1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; 1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
1236 struct match_token *tp = tokens; 1236 const struct match_token *tp = tokens;
1237 1237
1238 while (tp->token != Opt_onerror_panic && tp->token != mval) 1238 while (tp->token != Opt_onerror_panic && tp->token != mval)
1239 ++tp; 1239 ++tp;
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 3abe7e9ceb33..000000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SEMA_H__
19#define __XFS_SUPPORT_SEMA_H__
20
21#include <linux/time.h>
22#include <linux/wait.h>
23#include <linux/semaphore.h>
24#include <asm/atomic.h>
25
26/*
27 * sema_t structure just maps to struct semaphore in Linux kernel.
28 */
29
30typedef struct semaphore sema_t;
31
32#define initnsema(sp, val, name) sema_init(sp, val)
33#define psema(sp, b) down(sp)
34#define vsema(sp) up(sp)
35#define freesema(sema) do { } while (0)
36
37static inline int issemalocked(sema_t *sp)
38{
39 return down_trylock(sp) || (up(sp), 0);
40}
41
42/*
43 * Map cpsema (try to get the sema) to down_trylock. We need to switch
44 * the return values since cpsema returns 1 (acquired) 0 (failed) and
45 * down_trylock returns the reverse 0 (acquired) 1 (failed).
46 */
47static inline int cpsema(sema_t *sp)
48{
49 return down_trylock(sp) ? 0 : 1;
50}
51
52#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index fa47e43b8b41..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -73,7 +73,6 @@ xfs_page_trace(
73 unsigned long pgoff) 73 unsigned long pgoff)
74{ 74{
75 xfs_inode_t *ip; 75 xfs_inode_t *ip;
76 bhv_vnode_t *vp = vn_from_inode(inode);
77 loff_t isize = i_size_read(inode); 76 loff_t isize = i_size_read(inode);
78 loff_t offset = page_offset(page); 77 loff_t offset = page_offset(page);
79 int delalloc = -1, unmapped = -1, unwritten = -1; 78 int delalloc = -1, unmapped = -1, unwritten = -1;
@@ -81,7 +80,7 @@ xfs_page_trace(
81 if (page_has_buffers(page)) 80 if (page_has_buffers(page))
82 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); 81 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
83 82
84 ip = xfs_vtoi(vp); 83 ip = XFS_I(inode);
85 if (!ip->i_rwtrace) 84 if (!ip->i_rwtrace)
86 return; 85 return;
87 86
@@ -1339,6 +1338,10 @@ __xfs_get_blocks(
1339 offset = (xfs_off_t)iblock << inode->i_blkbits; 1338 offset = (xfs_off_t)iblock << inode->i_blkbits;
1340 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1341 size = bh_result->b_size; 1340 size = bh_result->b_size;
1341
1342 if (!create && direct && offset >= i_size_read(inode))
1343 return 0;
1344
1342 error = xfs_iomap(XFS_I(inode), offset, size, 1345 error = xfs_iomap(XFS_I(inode), offset, size,
1343 create ? flags : BMAPI_READ, &iomap, &niomap); 1346 create ? flags : BMAPI_READ, &iomap, &niomap);
1344 if (error) 1347 if (error)
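
The added early return only affects direct reads at or past EOF, where mapping blocks would be wasted work; the offset it compares comes from the shift a few lines up. A toy version of the test with assumed numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int blkbits = 12;                    /* 4KiB blocks, assumed */
            int64_t iblock = 10, i_size = 40000; /* assumed file size */
            int64_t offset = iblock << blkbits;  /* = 40960 */

            /* mirrors: !create && direct && offset >= i_size_read(inode) */
            printf("offset %lld is %spast EOF\n", (long long)offset,
                   offset >= i_size ? "" : "not ");
            return 0;
    }
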
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9cc8f0213095..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -58,7 +58,7 @@ xfs_buf_trace(
58 bp, id, 58 bp, id,
59 (void *)(unsigned long)bp->b_flags, 59 (void *)(unsigned long)bp->b_flags,
60 (void *)(unsigned long)bp->b_hold.counter, 60 (void *)(unsigned long)bp->b_hold.counter,
61 (void *)(unsigned long)bp->b_sema.count.counter, 61 (void *)(unsigned long)bp->b_sema.count,
62 (void *)current, 62 (void *)current,
63 data, ra, 63 data, ra,
64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
@@ -253,7 +253,7 @@ _xfs_buf_initialize(
253 253
254 memset(bp, 0, sizeof(xfs_buf_t)); 254 memset(bp, 0, sizeof(xfs_buf_t));
255 atomic_set(&bp->b_hold, 1); 255 atomic_set(&bp->b_hold, 1);
256 init_MUTEX_LOCKED(&bp->b_iodonesema); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_list); 257 INIT_LIST_HEAD(&bp->b_list);
258 INIT_LIST_HEAD(&bp->b_hash_list); 258 INIT_LIST_HEAD(&bp->b_hash_list);
259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
@@ -838,6 +838,7 @@ xfs_buf_rele(
838 return; 838 return;
839 } 839 }
840 840
841 ASSERT(atomic_read(&bp->b_hold) > 0);
841 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 842 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
842 if (bp->b_relse) { 843 if (bp->b_relse) {
843 atomic_inc(&bp->b_hold); 844 atomic_inc(&bp->b_hold);
@@ -851,11 +852,6 @@ xfs_buf_rele(
851 spin_unlock(&hash->bh_lock); 852 spin_unlock(&hash->bh_lock);
852 xfs_buf_free(bp); 853 xfs_buf_free(bp);
853 } 854 }
854 } else {
855 /*
856 * Catch reference count leaks
857 */
858 ASSERT(atomic_read(&bp->b_hold) >= 0);
859 } 855 }
860} 856}
861 857
@@ -1005,12 +1001,13 @@ xfs_buf_iodone_work(
1005 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1006 * ordered flag and reissue them. Because we can't tell the higher 1002 * ordered flag and reissue them. Because we can't tell the higher
1007 * layers directly that they should not issue ordered I/O anymore, they 1003 * layers directly that they should not issue ordered I/O anymore, they
1008 * need to check if the ordered flag was cleared during I/O completion. 1004 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1009 */ 1005 */
1010 if ((bp->b_error == EOPNOTSUPP) && 1006 if ((bp->b_error == EOPNOTSUPP) &&
1011 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1012 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1013 bp->b_flags &= ~XBF_ORDERED; 1009 bp->b_flags &= ~XBF_ORDERED;
1010 bp->b_flags |= _XFS_BARRIER_FAILED;
1014 xfs_buf_iorequest(bp); 1011 xfs_buf_iorequest(bp);
1015 } else if (bp->b_iodone) 1012 } else if (bp->b_iodone)
1016 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
@@ -1037,7 +1034,7 @@ xfs_buf_ioend(
1037 xfs_buf_iodone_work(&bp->b_iodone_work); 1034 xfs_buf_iodone_work(&bp->b_iodone_work);
1038 } 1035 }
1039 } else { 1036 } else {
1040 up(&bp->b_iodonesema); 1037 complete(&bp->b_iowait);
1041 } 1038 }
1042} 1039}
1043 1040
@@ -1275,7 +1272,7 @@ xfs_buf_iowait(
1275 XB_TRACE(bp, "iowait", 0); 1272 XB_TRACE(bp, "iowait", 0);
1276 if (atomic_read(&bp->b_io_remaining)) 1273 if (atomic_read(&bp->b_io_remaining))
1277 blk_run_address_space(bp->b_target->bt_mapping); 1274 blk_run_address_space(bp->b_target->bt_mapping);
1278 down(&bp->b_iodonesema); 1275 wait_for_completion(&bp->b_iowait);
1279 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1276 XB_TRACE(bp, "iowaited", (long)bp->b_error);
1280 return bp->b_error; 1277 return bp->b_error;
1281} 1278}
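
Together these hunks are a mechanical migration from a locked semaphore to the completion API, the idiomatic primitive for "wait until I/O is done" since it cannot be mistaken for a lock. The mapping, as a sketch:

    /* before (semaphore)                     after (completion)
     * ------------------                     ------------------
     * struct semaphore b_iodonesema;         struct completion b_iowait;
     * init_MUTEX_LOCKED(&bp->b_iodonesema);  init_completion(&bp->b_iowait);
     * up(&bp->b_iodonesema);                 complete(&bp->b_iowait);
     * down(&bp->b_iodonesema);               wait_for_completion(&bp->b_iowait);
     */
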
@@ -1799,7 +1796,7 @@ int __init
1799xfs_buf_init(void) 1796xfs_buf_init(void)
1800{ 1797{
1801#ifdef XFS_BUF_TRACE 1798#ifdef XFS_BUF_TRACE
1802 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); 1799 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1803#endif 1800#endif
1804 1801
1805 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1802 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 29d1d4adc078..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88
89 /*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95 _XFS_BARRIER_FAILED = (1 << 23),
88} xfs_buf_flags_t; 96} xfs_buf_flags_t;
89 97
90typedef enum { 98typedef enum {
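
A plausible consumer of the new flag (a sketch, not from this patch): an I/O-done handler that notices the failed barrier and stops requesting barriers for the whole mount. The back-pointer through b_fspriv3 is an assumption here; XFS_MOUNT_BARRIER is the real mount flag:

    /* sketch only */
    STATIC void
    example_log_iodone(xfs_buf_t *bp)
    {
            xfs_mount_t *mp = bp->b_fspriv3;  /* assumed back-pointer */

            if (bp->b_flags & _XFS_BARRIER_FAILED) {
                    bp->b_flags &= ~_XFS_BARRIER_FAILED;
                    mp->m_flags &= ~XFS_MOUNT_BARRIER; /* no more barriers */
            }
            XFS_BUF_FINISH_IOWAIT(bp);
    }
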
@@ -157,7 +165,7 @@ typedef struct xfs_buf {
157 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 165 xfs_buf_iodone_t b_iodone; /* I/O completion function */
158 xfs_buf_relse_t b_relse; /* releasing function */ 166 xfs_buf_relse_t b_relse; /* releasing function */
159 xfs_buf_bdstrat_t b_strat; /* pre-write function */ 167 xfs_buf_bdstrat_t b_strat; /* pre-write function */
160 struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
161 void *b_fspriv; 169 void *b_fspriv;
162 void *b_fspriv2; 170 void *b_fspriv2;
163 void *b_fspriv3; 171 void *b_fspriv3;
@@ -352,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
352#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 360#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
353#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) 361#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
354#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) 362#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
355#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); 363#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
356 364
357#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) 365#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
358#define XFS_BUF_TARGET(bp) ((bp)->b_target) 366#define XFS_BUF_TARGET(bp) ((bp)->b_target)
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 987fe84f7b13..24fd598af846 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -139,7 +139,7 @@ xfs_nfs_get_inode(
139 } 139 }
140 140
141 xfs_iunlock(ip, XFS_ILOCK_SHARED); 141 xfs_iunlock(ip, XFS_ILOCK_SHARED);
142 return ip->i_vnode; 142 return VFS_I(ip);
143} 143}
144 144
145STATIC struct dentry * 145STATIC struct dentry *
@@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 if (!inode) 167 if (!inode)
168 return NULL; 168 return NULL;
169 if (IS_ERR(inode)) 169 if (IS_ERR(inode))
170 return ERR_PTR(PTR_ERR(inode)); 170 return ERR_CAST(inode);
171 result = d_alloc_anon(inode); 171 result = d_alloc_anon(inode);
172 if (!result) { 172 if (!result) {
173 iput(inode); 173 iput(inode);
@@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
198 if (!inode) 198 if (!inode)
199 return NULL; 199 return NULL;
200 if (IS_ERR(inode)) 200 if (IS_ERR(inode))
201 return ERR_PTR(PTR_ERR(inode)); 201 return ERR_CAST(inode);
202 result = d_alloc_anon(inode); 202 result = d_alloc_anon(inode);
203 if (!result) { 203 if (!result) {
204 iput(inode); 204 iput(inode);
@@ -219,9 +219,9 @@ xfs_fs_get_parent(
219 if (unlikely(error)) 219 if (unlikely(error))
220 return ERR_PTR(-error); 220 return ERR_PTR(-error);
221 221
222 parent = d_alloc_anon(cip->i_vnode); 222 parent = d_alloc_anon(VFS_I(cip));
223 if (unlikely(!parent)) { 223 if (unlikely(!parent)) {
224 iput(cip->i_vnode); 224 iput(VFS_I(cip));
225 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
226 } 226 }
227 return parent; 227 return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5f60363b9343..5311c1acdd40 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = {
475const struct file_operations xfs_dir_file_operations = { 475const struct file_operations xfs_dir_file_operations = {
476 .read = generic_read_dir, 476 .read = generic_read_dir,
477 .readdir = xfs_file_readdir, 477 .readdir = xfs_file_readdir,
478 .llseek = generic_file_llseek,
478 .unlocked_ioctl = xfs_file_ioctl, 479 .unlocked_ioctl = xfs_file_ioctl,
479#ifdef CONFIG_COMPAT 480#ifdef CONFIG_COMPAT
480 .compat_ioctl = xfs_file_compat_ioctl, 481 .compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1eefe61f0e10..36caa6d957df 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -31,7 +31,7 @@ xfs_tosspages(
31 xfs_off_t last, 31 xfs_off_t last,
32 int fiopt) 32 int fiopt)
33{ 33{
34 struct address_space *mapping = ip->i_vnode->i_mapping; 34 struct address_space *mapping = VFS_I(ip)->i_mapping;
35 35
36 if (mapping->nrpages) 36 if (mapping->nrpages)
37 truncate_inode_pages(mapping, first); 37 truncate_inode_pages(mapping, first);
@@ -44,7 +44,7 @@ xfs_flushinval_pages(
44 xfs_off_t last, 44 xfs_off_t last,
45 int fiopt) 45 int fiopt)
46{ 46{
47 struct address_space *mapping = ip->i_vnode->i_mapping; 47 struct address_space *mapping = VFS_I(ip)->i_mapping;
48 int ret = 0; 48 int ret = 0;
49 49
50 if (mapping->nrpages) { 50 if (mapping->nrpages) {
@@ -64,7 +64,7 @@ xfs_flush_pages(
64 uint64_t flags, 64 uint64_t flags,
65 int fiopt) 65 int fiopt)
66{ 66{
67 struct address_space *mapping = ip->i_vnode->i_mapping; 67 struct address_space *mapping = VFS_I(ip)->i_mapping;
68 int ret = 0; 68 int ret = 0;
69 int ret2; 69 int ret2;
70 70
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index acb978d9d085..48799ba7e3e6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq(
245 245
246 xfs_iunlock(ip, XFS_ILOCK_SHARED); 246 xfs_iunlock(ip, XFS_ILOCK_SHARED);
247 247
248 *inode = XFS_ITOV(ip); 248 *inode = VFS_I(ip);
249 return 0; 249 return 0;
250} 250}
251 251
@@ -927,7 +927,7 @@ STATIC void
927xfs_diflags_to_linux( 927xfs_diflags_to_linux(
928 struct xfs_inode *ip) 928 struct xfs_inode *ip)
929{ 929{
930 struct inode *inode = XFS_ITOV(ip); 930 struct inode *inode = VFS_I(ip);
931 unsigned int xflags = xfs_ip2xflags(ip); 931 unsigned int xflags = xfs_ip2xflags(ip);
932 932
933 if (xflags & XFS_XFLAG_IMMUTABLE) 933 if (xflags & XFS_XFLAG_IMMUTABLE)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e88f51028086..095d271f3434 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,7 +62,7 @@ void
62xfs_synchronize_atime( 62xfs_synchronize_atime(
63 xfs_inode_t *ip) 63 xfs_inode_t *ip)
64{ 64{
65 struct inode *inode = ip->i_vnode; 65 struct inode *inode = VFS_I(ip);
66 66
67 if (inode) { 67 if (inode) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
@@ -79,7 +79,7 @@ void
79xfs_mark_inode_dirty_sync( 79xfs_mark_inode_dirty_sync(
80 xfs_inode_t *ip) 80 xfs_inode_t *ip)
81{ 81{
82 struct inode *inode = ip->i_vnode; 82 struct inode *inode = VFS_I(ip);
83 83
84 if (inode) 84 if (inode)
85 mark_inode_dirty_sync(inode); 85 mark_inode_dirty_sync(inode);
@@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync(
89 * Change the requested timestamp in the given inode. 89 * Change the requested timestamp in the given inode.
90 * We don't lock across timestamp updates, and we don't log them but 90 * We don't lock across timestamp updates, and we don't log them but
91 * we do record the fact that there is dirty information in core. 91 * we do record the fact that there is dirty information in core.
92 *
93 * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
94 * with XFS_ICHGTIME_ACC to be sure that access time
95 * update will take. Calling first with XFS_ICHGTIME_ACC
96 * and then XFS_ICHGTIME_MOD may fail to modify the access
97 * timestamp if the filesystem is mounted noacctm.
98 */ 92 */
99void 93void
100xfs_ichgtime( 94xfs_ichgtime(
101 xfs_inode_t *ip, 95 xfs_inode_t *ip,
102 int flags) 96 int flags)
103{ 97{
104 struct inode *inode = vn_to_inode(XFS_ITOV(ip)); 98 struct inode *inode = VFS_I(ip);
105 timespec_t tv; 99 timespec_t tv;
100 int sync_it = 0;
101
102 tv = current_fs_time(inode->i_sb);
106 103
107 nanotime(&tv); 104 if ((flags & XFS_ICHGTIME_MOD) &&
108 if (flags & XFS_ICHGTIME_MOD) { 105 !timespec_equal(&inode->i_mtime, &tv)) {
109 inode->i_mtime = tv; 106 inode->i_mtime = tv;
110 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 107 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
111 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 108 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
109 sync_it = 1;
112 } 110 }
113 if (flags & XFS_ICHGTIME_ACC) { 111 if ((flags & XFS_ICHGTIME_CHG) &&
114 inode->i_atime = tv; 112 !timespec_equal(&inode->i_ctime, &tv)) {
115 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
116 ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
117 }
118 if (flags & XFS_ICHGTIME_CHG) {
119 inode->i_ctime = tv; 113 inode->i_ctime = tv;
120 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; 114 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
121 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; 115 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
116 sync_it = 1;
122 } 117 }
123 118
124 /* 119 /*
@@ -130,55 +125,11 @@ xfs_ichgtime(
130 * ensure that the compiler does not reorder the update 125 * ensure that the compiler does not reorder the update
131 * of i_update_core above the timestamp updates above. 126 * of i_update_core above the timestamp updates above.
132 */ 127 */
133 SYNCHRONIZE(); 128 if (sync_it) {
134 ip->i_update_core = 1; 129 SYNCHRONIZE();
135 if (!(inode->i_state & I_NEW)) 130 ip->i_update_core = 1;
136 mark_inode_dirty_sync(inode); 131 mark_inode_dirty_sync(inode);
137}
138
139/*
140 * Variant on the above which avoids querying the system clock
141 * in situations where we know the Linux inode timestamps have
142 * just been updated (and so we can update our inode cheaply).
143 */
144void
145xfs_ichgtime_fast(
146 xfs_inode_t *ip,
147 struct inode *inode,
148 int flags)
149{
150 timespec_t *tvp;
151
152 /*
153 * Atime updates for read() & friends are handled lazily now, and
154 * explicit updates must go through xfs_ichgtime()
155 */
156 ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
157
158 if (flags & XFS_ICHGTIME_MOD) {
159 tvp = &inode->i_mtime;
160 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
161 ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
162 } 132 }
163 if (flags & XFS_ICHGTIME_CHG) {
164 tvp = &inode->i_ctime;
165 ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
166 ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
167 }
168
169 /*
170 * We update the i_update_core field _after_ changing
171 * the timestamps in order to coordinate properly with
172 * xfs_iflush() so that we don't lose timestamp updates.
173 * This keeps us from having to hold the inode lock
174 * while doing this. We use the SYNCHRONIZE macro to
175 * ensure that the compiler does not reorder the update
176 * of i_update_core above the timestamp updates above.
177 */
178 SYNCHRONIZE();
179 ip->i_update_core = 1;
180 if (!(inode->i_state & I_NEW))
181 mark_inode_dirty_sync(inode);
182} 133}
183 134
184/* 135/*
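
Two behavioural changes hide in the xfs_ichgtime() rewrite: the timestamp now comes from current_fs_time() rather than a raw nanotime(), and the inode is only marked dirty when a timestamp actually changed (the sync_it flag). Since current_fs_time() truncates to the superblock's s_time_gran, back-to-back updates within one granule compare equal via timespec_equal() and skip mark_inode_dirty_sync() entirely. The XFS_ICHGTIME_ACC branch disappears because atime is handled lazily by the VFS and copied back through xfs_synchronize_atime(). The mainline helper of this era was roughly:

/* Sketch of kernel/time.c:current_fs_time() circa 2.6.x: round "now"
 * down to the filesystem's declared timestamp granularity. */
struct timespec current_fs_time(struct super_block *sb)
{
	struct timespec now = current_kernel_time();

	return timespec_trunc(now, sb->s_time_gran);
}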
@@ -299,7 +250,7 @@ xfs_vn_mknod(
299 if (unlikely(error)) 250 if (unlikely(error))
300 goto out_free_acl; 251 goto out_free_acl;
301 252
302 inode = ip->i_vnode; 253 inode = VFS_I(ip);
303 254
304 error = xfs_init_security(inode, dir); 255 error = xfs_init_security(inode, dir);
305 if (unlikely(error)) 256 if (unlikely(error))
@@ -366,7 +317,7 @@ xfs_vn_lookup(
366 return NULL; 317 return NULL;
367 } 318 }
368 319
369 return d_splice_alias(cip->i_vnode, dentry); 320 return d_splice_alias(VFS_I(cip), dentry);
370} 321}
371 322
372STATIC struct dentry * 323STATIC struct dentry *
@@ -399,12 +350,12 @@ xfs_vn_ci_lookup(
399 350
400 /* if exact match, just splice and exit */ 351 /* if exact match, just splice and exit */
401 if (!ci_name.name) 352 if (!ci_name.name)
402 return d_splice_alias(ip->i_vnode, dentry); 353 return d_splice_alias(VFS_I(ip), dentry);
403 354
404 /* else case-insensitive match... */ 355 /* else case-insensitive match... */
405 dname.name = ci_name.name; 356 dname.name = ci_name.name;
406 dname.len = ci_name.len; 357 dname.len = ci_name.len;
407 dentry = d_add_ci(ip->i_vnode, dentry, &dname); 358 dentry = d_add_ci(dentry, VFS_I(ip), &dname);
408 kmem_free(ci_name.name); 359 kmem_free(ci_name.name);
409 return dentry; 360 return dentry;
410} 361}
@@ -478,7 +429,7 @@ xfs_vn_symlink(
478 if (unlikely(error)) 429 if (unlikely(error))
479 goto out; 430 goto out;
480 431
481 inode = cip->i_vnode; 432 inode = VFS_I(cip);
482 433
483 error = xfs_init_security(inode, dir); 434 error = xfs_init_security(inode, dir);
484 if (unlikely(error)) 435 if (unlikely(error))
@@ -710,7 +661,7 @@ out_error:
710 return error; 661 return error;
711} 662}
712 663
713const struct inode_operations xfs_inode_operations = { 664static const struct inode_operations xfs_inode_operations = {
714 .permission = xfs_vn_permission, 665 .permission = xfs_vn_permission,
715 .truncate = xfs_vn_truncate, 666 .truncate = xfs_vn_truncate,
716 .getattr = xfs_vn_getattr, 667 .getattr = xfs_vn_getattr,
@@ -722,7 +673,7 @@ const struct inode_operations xfs_inode_operations = {
722 .fallocate = xfs_vn_fallocate, 673 .fallocate = xfs_vn_fallocate,
723}; 674};
724 675
725const struct inode_operations xfs_dir_inode_operations = { 676static const struct inode_operations xfs_dir_inode_operations = {
726 .create = xfs_vn_create, 677 .create = xfs_vn_create,
727 .lookup = xfs_vn_lookup, 678 .lookup = xfs_vn_lookup,
728 .link = xfs_vn_link, 679 .link = xfs_vn_link,
@@ -747,7 +698,7 @@ const struct inode_operations xfs_dir_inode_operations = {
747 .listxattr = xfs_vn_listxattr, 698 .listxattr = xfs_vn_listxattr,
748}; 699};
749 700
750const struct inode_operations xfs_dir_ci_inode_operations = { 701static const struct inode_operations xfs_dir_ci_inode_operations = {
751 .create = xfs_vn_create, 702 .create = xfs_vn_create,
752 .lookup = xfs_vn_ci_lookup, 703 .lookup = xfs_vn_ci_lookup,
753 .link = xfs_vn_link, 704 .link = xfs_vn_link,
@@ -772,7 +723,7 @@ const struct inode_operations xfs_dir_ci_inode_operations = {
772 .listxattr = xfs_vn_listxattr, 723 .listxattr = xfs_vn_listxattr,
773}; 724};
774 725
775const struct inode_operations xfs_symlink_inode_operations = { 726static const struct inode_operations xfs_symlink_inode_operations = {
776 .readlink = generic_readlink, 727 .readlink = generic_readlink,
777 .follow_link = xfs_vn_follow_link, 728 .follow_link = xfs_vn_follow_link,
778 .put_link = xfs_vn_put_link, 729 .put_link = xfs_vn_put_link,
@@ -784,3 +735,98 @@ const struct inode_operations xfs_symlink_inode_operations = {
784 .removexattr = generic_removexattr, 735 .removexattr = generic_removexattr,
785 .listxattr = xfs_vn_listxattr, 736 .listxattr = xfs_vn_listxattr,
786}; 737};
738
739STATIC void
740xfs_diflags_to_iflags(
741 struct inode *inode,
742 struct xfs_inode *ip)
743{
744 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
745 inode->i_flags |= S_IMMUTABLE;
746 else
747 inode->i_flags &= ~S_IMMUTABLE;
748 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
749 inode->i_flags |= S_APPEND;
750 else
751 inode->i_flags &= ~S_APPEND;
752 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
753 inode->i_flags |= S_SYNC;
754 else
755 inode->i_flags &= ~S_SYNC;
756 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
757 inode->i_flags |= S_NOATIME;
758 else
759 inode->i_flags &= ~S_NOATIME;
760}
761
762/*
763 * Initialize the Linux inode, set up the operation vectors and
764 * unlock the inode.
765 *
766 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode.
769 */
770void
771xfs_setup_inode(
772 struct xfs_inode *ip)
773{
774 struct inode *inode = ip->i_vnode;
775
776 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink;
778 inode->i_uid = ip->i_d.di_uid;
779 inode->i_gid = ip->i_d.di_gid;
780
781 switch (inode->i_mode & S_IFMT) {
782 case S_IFBLK:
783 case S_IFCHR:
784 inode->i_rdev =
785 MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
786 sysv_minor(ip->i_df.if_u2.if_rdev));
787 break;
788 default:
789 inode->i_rdev = 0;
790 break;
791 }
792
793 inode->i_generation = ip->i_d.di_gen;
794 i_size_write(inode, ip->i_d.di_size);
795 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
796 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
797 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
798 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803
804 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG:
806 inode->i_op = &xfs_inode_operations;
807 inode->i_fop = &xfs_file_operations;
808 inode->i_mapping->a_ops = &xfs_address_space_operations;
809 break;
810 case S_IFDIR:
811 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
812 inode->i_op = &xfs_dir_ci_inode_operations;
813 else
814 inode->i_op = &xfs_dir_inode_operations;
815 inode->i_fop = &xfs_dir_file_operations;
816 break;
817 case S_IFLNK:
818 inode->i_op = &xfs_symlink_inode_operations;
819 if (!(ip->i_df.if_flags & XFS_IFINLINE))
820 inode->i_mapping->a_ops = &xfs_address_space_operations;
821 break;
822 default:
823 inode->i_op = &xfs_inode_operations;
824 init_special_inode(inode, inode->i_mode, inode->i_rdev);
825 break;
826 }
827
828 xfs_iflags_clear(ip, XFS_INEW);
829 barrier();
830
831 unlock_new_inode(inode);
832}
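
The new xfs_setup_inode() consolidates what xfs_revalidate_inode() and xfs_set_inodeops() used to do (both removed from xfs_super.c later in this patch), and it ends with the standard new-inode publication handshake: concurrent lookups block on I_NEW until unlock_new_inode() clears it, and the barrier() keeps the XFS_INEW clear from being reordered past the Linux-inode setup. XFS drives this from its own inode cache rather than iget_locked(), but the generic protocol looks like this (fill_inode() is a hypothetical stand-in for the disk read):

/* Generic I_NEW protocol sketch; fill_inode() is hypothetical. */
static struct inode *lookup_inode(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		fill_inode(inode);		/* read disk data, set i_op/i_fop */
		unlock_new_inode(inode);	/* clear I_NEW, wake waiters */
	}
	return inode;
}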
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index d97ba934a2ac..8b1a1e31dc21 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -18,10 +18,7 @@
18#ifndef __XFS_IOPS_H__ 18#ifndef __XFS_IOPS_H__
19#define __XFS_IOPS_H__ 19#define __XFS_IOPS_H__
20 20
21extern const struct inode_operations xfs_inode_operations; 21struct xfs_inode;
22extern const struct inode_operations xfs_dir_inode_operations;
23extern const struct inode_operations xfs_dir_ci_inode_operations;
24extern const struct inode_operations xfs_symlink_inode_operations;
25 22
26extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
27extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
@@ -29,14 +26,6 @@ extern const struct file_operations xfs_invis_file_operations;
29 26
30extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
31 28
32struct xfs_inode; 29extern void xfs_setup_inode(struct xfs_inode *);
33extern void xfs_ichgtime(struct xfs_inode *, int);
34extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int);
35
36#define xfs_vtoi(vp) \
37 ((struct xfs_inode *)vn_to_inode(vp)->i_private)
38
39#define XFS_I(inode) \
40 ((struct xfs_inode *)(inode)->i_private)
41 30
42#endif /* __XFS_IOPS_H__ */ 31#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 4d45d9351a6c..cc0f7b3a9795 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -45,13 +45,13 @@
45#include <mrlock.h> 45#include <mrlock.h>
46#include <sv.h> 46#include <sv.h>
47#include <mutex.h> 47#include <mutex.h>
48#include <sema.h>
49#include <time.h> 48#include <time.h>
50 49
51#include <support/ktrace.h> 50#include <support/ktrace.h>
52#include <support/debug.h> 51#include <support/debug.h>
53#include <support/uuid.h> 52#include <support/uuid.h>
54 53
54#include <linux/semaphore.h>
55#include <linux/mm.h> 55#include <linux/mm.h>
56#include <linux/kernel.h> 56#include <linux/kernel.h>
57#include <linux/blkdev.h> 57#include <linux/blkdev.h>
@@ -126,8 +126,6 @@
126 126
127#define current_cpu() (raw_smp_processor_id()) 127#define current_cpu() (raw_smp_processor_id())
128#define current_pid() (current->pid) 128#define current_pid() (current->pid)
129#define current_fsuid(cred) (current->fsuid)
130#define current_fsgid(cred) (current->fsgid)
131#define current_test_flags(f) (current->flags & (f)) 129#define current_test_flags(f) (current->flags & (f))
132#define current_set_flags_nested(sp, f) \ 130#define current_set_flags_nested(sp, f) \
133 (*(sp) = current->flags, current->flags |= (f)) 131 (*(sp) = current->flags, current->flags |= (f))
@@ -180,7 +178,7 @@
180#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) 178#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
181#define xfs_stack_trace() dump_stack() 179#define xfs_stack_trace() dump_stack()
182#define xfs_itruncate_data(ip, off) \ 180#define xfs_itruncate_data(ip, off) \
183 (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) 181 (-vmtruncate(VFS_I(ip), (off)))
184 182
185 183
186/* Move the kernel do_div definition off to one side */ 184/* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 82333b3e118e..1957e5357d04 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -137,7 +137,7 @@ xfs_iozero(
137 struct address_space *mapping; 137 struct address_space *mapping;
138 int status; 138 int status;
139 139
140 mapping = ip->i_vnode->i_mapping; 140 mapping = VFS_I(ip)->i_mapping;
141 do { 141 do {
142 unsigned offset, bytes; 142 unsigned offset, bytes;
143 void *fsdata; 143 void *fsdata;
@@ -674,9 +674,7 @@ start:
674 */ 674 */
675 if (likely(!(ioflags & IO_INVIS) && 675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) { 676 !mnt_want_write(file->f_path.mnt))) {
677 file_update_time(file); 677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 xfs_ichgtime_fast(xip, inode,
679 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
680 mnt_drop_write(file->f_path.mnt); 678 mnt_drop_write(file->f_path.mnt);
681 } 679 }
682 680
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 30ae96397e31..7227b2efef22 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -158,7 +158,7 @@ enum {
158 Opt_barrier, Opt_nobarrier, Opt_err 158 Opt_barrier, Opt_nobarrier, Opt_err
159}; 159};
160 160
161static match_table_t tokens = { 161static const match_table_t tokens = {
162 {Opt_barrier, "barrier"}, 162 {Opt_barrier, "barrier"},
163 {Opt_nobarrier, "nobarrier"}, 163 {Opt_nobarrier, "nobarrier"},
164 {Opt_err, NULL} 164 {Opt_err, NULL}
@@ -581,118 +581,6 @@ xfs_max_file_offset(
581 return (((__uint64_t)pagefactor) << bitshift) - 1; 581 return (((__uint64_t)pagefactor) << bitshift) - 1;
582} 582}
583 583
584STATIC_INLINE void
585xfs_set_inodeops(
586 struct inode *inode)
587{
588 switch (inode->i_mode & S_IFMT) {
589 case S_IFREG:
590 inode->i_op = &xfs_inode_operations;
591 inode->i_fop = &xfs_file_operations;
592 inode->i_mapping->a_ops = &xfs_address_space_operations;
593 break;
594 case S_IFDIR:
595 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
596 inode->i_op = &xfs_dir_ci_inode_operations;
597 else
598 inode->i_op = &xfs_dir_inode_operations;
599 inode->i_fop = &xfs_dir_file_operations;
600 break;
601 case S_IFLNK:
602 inode->i_op = &xfs_symlink_inode_operations;
603 if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE))
604 inode->i_mapping->a_ops = &xfs_address_space_operations;
605 break;
606 default:
607 inode->i_op = &xfs_inode_operations;
608 init_special_inode(inode, inode->i_mode, inode->i_rdev);
609 break;
610 }
611}
612
613STATIC_INLINE void
614xfs_revalidate_inode(
615 xfs_mount_t *mp,
616 bhv_vnode_t *vp,
617 xfs_inode_t *ip)
618{
619 struct inode *inode = vn_to_inode(vp);
620
621 inode->i_mode = ip->i_d.di_mode;
622 inode->i_nlink = ip->i_d.di_nlink;
623 inode->i_uid = ip->i_d.di_uid;
624 inode->i_gid = ip->i_d.di_gid;
625
626 switch (inode->i_mode & S_IFMT) {
627 case S_IFBLK:
628 case S_IFCHR:
629 inode->i_rdev =
630 MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
631 sysv_minor(ip->i_df.if_u2.if_rdev));
632 break;
633 default:
634 inode->i_rdev = 0;
635 break;
636 }
637
638 inode->i_generation = ip->i_d.di_gen;
639 i_size_write(inode, ip->i_d.di_size);
640 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
641 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
642 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
643 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
644 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
645 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
646 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
647 inode->i_flags |= S_IMMUTABLE;
648 else
649 inode->i_flags &= ~S_IMMUTABLE;
650 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
651 inode->i_flags |= S_APPEND;
652 else
653 inode->i_flags &= ~S_APPEND;
654 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
655 inode->i_flags |= S_SYNC;
656 else
657 inode->i_flags &= ~S_SYNC;
658 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
659 inode->i_flags |= S_NOATIME;
660 else
661 inode->i_flags &= ~S_NOATIME;
662 xfs_iflags_clear(ip, XFS_IMODIFIED);
663}
664
665void
666xfs_initialize_vnode(
667 struct xfs_mount *mp,
668 bhv_vnode_t *vp,
669 struct xfs_inode *ip)
670{
671 struct inode *inode = vn_to_inode(vp);
672
673 if (!ip->i_vnode) {
674 ip->i_vnode = vp;
675 inode->i_private = ip;
676 }
677
678 /*
679 * We need to set the ops vectors, and unlock the inode, but if
680 * we have been called during the new inode create process, it is
681 * too early to fill in the Linux inode. We will get called a
682 * second time once the inode is properly set up, and then we can
683 * finish our work.
684 */
685 if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) {
686 xfs_revalidate_inode(mp, vp, ip);
687 xfs_set_inodeops(inode);
688
689 xfs_iflags_clear(ip, XFS_INEW);
690 barrier();
691
692 unlock_new_inode(inode);
693 }
694}
695
696int 584int
697xfs_blkdev_get( 585xfs_blkdev_get(
698 xfs_mount_t *mp, 586 xfs_mount_t *mp,
@@ -982,26 +870,21 @@ STATIC struct inode *
982xfs_fs_alloc_inode( 870xfs_fs_alloc_inode(
983 struct super_block *sb) 871 struct super_block *sb)
984{ 872{
985 bhv_vnode_t *vp; 873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
986
987 vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
988 if (unlikely(!vp))
989 return NULL;
990 return vn_to_inode(vp);
991} 874}
992 875
993STATIC void 876STATIC void
994xfs_fs_destroy_inode( 877xfs_fs_destroy_inode(
995 struct inode *inode) 878 struct inode *inode)
996{ 879{
997 kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); 880 kmem_zone_free(xfs_vnode_zone, inode);
998} 881}
999 882
1000STATIC void 883STATIC void
1001xfs_fs_inode_init_once( 884xfs_fs_inode_init_once(
1002 void *vnode) 885 void *vnode)
1003{ 886{
1004 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); 887 inode_init_once((struct inode *)vnode);
1005} 888}
1006 889
1007/* 890/*
@@ -1106,7 +989,7 @@ void
1106xfs_flush_inode( 989xfs_flush_inode(
1107 xfs_inode_t *ip) 990 xfs_inode_t *ip)
1108{ 991{
1109 struct inode *inode = ip->i_vnode; 992 struct inode *inode = VFS_I(ip);
1110 993
1111 igrab(inode); 994 igrab(inode);
1112 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); 995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
@@ -1131,7 +1014,7 @@ void
1131xfs_flush_device( 1014xfs_flush_device(
1132 xfs_inode_t *ip) 1015 xfs_inode_t *ip)
1133{ 1016{
1134 struct inode *inode = vn_to_inode(XFS_ITOV(ip)); 1017 struct inode *inode = VFS_I(ip);
1135 1018
1136 igrab(inode); 1019 igrab(inode);
1137 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); 1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
@@ -1201,6 +1084,15 @@ xfssyncd(
1201} 1084}
1202 1085
1203STATIC void 1086STATIC void
1087xfs_free_fsname(
1088 struct xfs_mount *mp)
1089{
1090 kfree(mp->m_fsname);
1091 kfree(mp->m_rtname);
1092 kfree(mp->m_logname);
1093}
1094
1095STATIC void
1204xfs_fs_put_super( 1096xfs_fs_put_super(
1205 struct super_block *sb) 1097 struct super_block *sb)
1206{ 1098{
@@ -1239,8 +1131,6 @@ xfs_fs_put_super(
1239 error = xfs_unmount_flush(mp, 0); 1131 error = xfs_unmount_flush(mp, 0);
1240 WARN_ON(error); 1132 WARN_ON(error);
1241 1133
1242 IRELE(rip);
1243
1244 /* 1134 /*
1245 * If we're forcing a shutdown, typically because of a media error, 1135 * If we're forcing a shutdown, typically because of a media error,
1246 * we want to make sure we invalidate dirty pages that belong to 1136 * we want to make sure we invalidate dirty pages that belong to
@@ -1257,10 +1147,12 @@ xfs_fs_put_super(
1257 } 1147 }
1258 1148
1259 xfs_unmountfs(mp); 1149 xfs_unmountfs(mp);
1150 xfs_freesb(mp);
1260 xfs_icsb_destroy_counters(mp); 1151 xfs_icsb_destroy_counters(mp);
1261 xfs_close_devices(mp); 1152 xfs_close_devices(mp);
1262 xfs_qmops_put(mp); 1153 xfs_qmops_put(mp);
1263 xfs_dmops_put(mp); 1154 xfs_dmops_put(mp);
1155 xfs_free_fsname(mp);
1264 kfree(mp); 1156 kfree(mp);
1265} 1157}
1266 1158
@@ -1410,9 +1302,29 @@ xfs_fs_remount(
1410 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1411 break; 1303 break;
1412 default: 1304 default:
1305 /*
1306 * Logically we would return an error here to prevent
1307 * users from believing they might have changed
1308 * mount options using remount which can't be changed.
1309 *
1310 * But unfortunately mount(8) adds all options from
1311 * mtab and fstab to the mount arguments in some cases
1312 * so we can't blindly reject options, but have to
1313 * check for each specified option if it actually
1314 * differs from the currently set option and only
1315 * reject it if that's the case.
1316 *
1317 * Until that is implemented we return success for
1318 * every remount request, and silently ignore all
1319 * options that we can't actually change.
1320 */
1321#if 0
1413 printk(KERN_INFO 1322 printk(KERN_INFO
1414 "XFS: mount option \"%s\" not supported for remount\n", p); 1323 "XFS: mount option \"%s\" not supported for remount\n", p);
1415 return -EINVAL; 1324 return -EINVAL;
1325#else
1326 return 0;
1327#endif
1416 } 1328 }
1417 } 1329 }
1418 1330
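
The #if 0 block above records a compromise rather than accidentally dead code: mount(8) replays existing options from mtab/fstab on remount, so rejecting every unrecognized token breaks otherwise valid remounts. The per-option comparison the comment calls for might look like this hypothetical helper (not part of this patch): an option matching the current state is harmless echo, and only a genuine change deserves -EINVAL.

/* Hypothetical "reject only real changes" check. */
static int xfs_remount_opt_is_noop(struct xfs_mount *mp, int token)
{
	switch (token) {
	case Opt_barrier:
		return (mp->m_flags & XFS_MOUNT_BARRIER) != 0;
	case Opt_nobarrier:
		return (mp->m_flags & XFS_MOUNT_BARRIER) == 0;
	default:
		return 0;	/* unknown token: treat as a real change */
	}
}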
@@ -1517,6 +1429,8 @@ xfs_start_flags(
1517 struct xfs_mount_args *ap, 1429 struct xfs_mount_args *ap,
1518 struct xfs_mount *mp) 1430 struct xfs_mount *mp)
1519{ 1431{
1432 int error;
1433
1520 /* Values are in BBs */ 1434 /* Values are in BBs */
1521 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1522 /* 1436 /*
@@ -1549,17 +1463,27 @@ xfs_start_flags(
1549 ap->logbufsize); 1463 ap->logbufsize);
1550 return XFS_ERROR(EINVAL); 1464 return XFS_ERROR(EINVAL);
1551 } 1465 }
1466
1467 error = ENOMEM;
1468
1552 mp->m_logbsize = ap->logbufsize; 1469 mp->m_logbsize = ap->logbufsize;
1553 mp->m_fsname_len = strlen(ap->fsname) + 1; 1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1554 mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); 1471
1555 strcpy(mp->m_fsname, ap->fsname); 1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1556 if (ap->rtname[0]) { 1476 if (ap->rtname[0]) {
1557 mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); 1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1558 strcpy(mp->m_rtname, ap->rtname); 1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1559 } 1481 }
1482
1560 if (ap->logname[0]) { 1483 if (ap->logname[0]) {
1561 mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); 1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1562 strcpy(mp->m_logname, ap->logname); 1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1563 } 1487 }
1564 1488
1565 if (ap->flags & XFSMNT_WSYNC) 1489 if (ap->flags & XFSMNT_WSYNC)
@@ -1632,6 +1556,14 @@ xfs_start_flags(
1632 if (ap->flags & XFSMNT_DMAPI) 1556 if (ap->flags & XFSMNT_DMAPI)
1633 mp->m_flags |= XFS_MOUNT_DMAPI; 1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1634 return 0; 1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1635} 1567}
1636 1568
1637/* 1569/*
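
xfs_start_flags() moves from kmem_alloc(..., KM_SLEEP) plus strcpy(), which could not fail, to kstrdup(..., GFP_KERNEL), which can; hence the new int error and the unwind labels that free in reverse allocation order. The pattern in isolation (struct and field names hypothetical):

/* Canonical allocate-and-unwind shape. */
struct cfg { char *fsname; char *rtname; };

static int copy_names(struct cfg *c, const char *fsname, const char *rtname)
{
	c->fsname = kstrdup(fsname, GFP_KERNEL);
	if (!c->fsname)
		goto out;
	c->rtname = kstrdup(rtname, GFP_KERNEL);
	if (!c->rtname)
		goto out_free_fsname;
	return 0;

 out_free_fsname:
	kfree(c->fsname);
 out:
	return -ENOMEM;
}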
@@ -1792,10 +1724,10 @@ xfs_fs_fill_super(
1792 */ 1724 */
1793 error = xfs_start_flags(args, mp); 1725 error = xfs_start_flags(args, mp);
1794 if (error) 1726 if (error)
1795 goto out_destroy_counters; 1727 goto out_free_fsname;
1796 error = xfs_readsb(mp, flags); 1728 error = xfs_readsb(mp, flags);
1797 if (error) 1729 if (error)
1798 goto out_destroy_counters; 1730 goto out_free_fsname;
1799 error = xfs_finish_flags(args, mp); 1731 error = xfs_finish_flags(args, mp);
1800 if (error) 1732 if (error)
1801 goto out_free_sb; 1733 goto out_free_sb;
@@ -1811,7 +1743,7 @@ xfs_fs_fill_super(
1811 if (error) 1743 if (error)
1812 goto out_free_sb; 1744 goto out_free_sb;
1813 1745
1814 error = xfs_mountfs(mp, flags); 1746 error = xfs_mountfs(mp);
1815 if (error) 1747 if (error)
1816 goto out_filestream_unmount; 1748 goto out_filestream_unmount;
1817 1749
@@ -1825,7 +1757,7 @@ xfs_fs_fill_super(
1825 sb->s_time_gran = 1; 1757 sb->s_time_gran = 1;
1826 set_posix_acl_flag(sb); 1758 set_posix_acl_flag(sb);
1827 1759
1828 root = igrab(mp->m_rootip->i_vnode); 1760 root = igrab(VFS_I(mp->m_rootip));
1829 if (!root) { 1761 if (!root) {
1830 error = ENOENT; 1762 error = ENOENT;
1831 goto fail_unmount; 1763 goto fail_unmount;
@@ -1857,7 +1789,8 @@ xfs_fs_fill_super(
1857 xfs_filestream_unmount(mp); 1789 xfs_filestream_unmount(mp);
1858 out_free_sb: 1790 out_free_sb:
1859 xfs_freesb(mp); 1791 xfs_freesb(mp);
1860 out_destroy_counters: 1792 out_free_fsname:
1793 xfs_free_fsname(mp);
1861 xfs_icsb_destroy_counters(mp); 1794 xfs_icsb_destroy_counters(mp);
1862 xfs_close_devices(mp); 1795 xfs_close_devices(mp);
1863 out_put_qmops: 1796 out_put_qmops:
@@ -1890,10 +1823,8 @@ xfs_fs_fill_super(
1890 error = xfs_unmount_flush(mp, 0); 1823 error = xfs_unmount_flush(mp, 0);
1891 WARN_ON(error); 1824 WARN_ON(error);
1892 1825
1893 IRELE(mp->m_rootip);
1894
1895 xfs_unmountfs(mp); 1826 xfs_unmountfs(mp);
1896 goto out_destroy_counters; 1827 goto out_free_sb;
1897} 1828}
1898 1829
1899STATIC int 1830STATIC int
@@ -2014,7 +1945,7 @@ xfs_free_trace_bufs(void)
2014STATIC int __init 1945STATIC int __init
2015xfs_init_zones(void) 1946xfs_init_zones(void)
2016{ 1947{
2017 xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", 1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
2018 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
2019 KM_ZONE_SPREAD, 1950 KM_ZONE_SPREAD,
2020 xfs_fs_inode_init_once); 1951 xfs_fs_inode_init_once);
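
With the bhv_vnode_t typedef gone (see the xfs_vnode.h hunks below), the private slab now holds plain struct inode objects and the alloc/destroy pair collapses to one-liners. The constructor matters: slab init-once callbacks run when an object's backing storage is created, not on every allocation, so only state that must survive free/realloc cycles (locks, list heads) belongs there, which is exactly what inode_init_once() sets up. A generic sketch of the trio (names hypothetical):

static struct kmem_cache *my_inode_cache;

static struct inode *my_alloc_inode(struct super_block *sb)
{
	return kmem_cache_alloc(my_inode_cache, GFP_KERNEL);
}

static void my_destroy_inode(struct inode *inode)
{
	kmem_cache_free(my_inode_cache, inode);
}

static void my_inode_init_once(void *p)	/* per-object, not per-alloc */
{
	inode_init_once(p);
}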
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index b7d13da01bd6..fe2ef4e6a0f9 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -101,9 +101,6 @@ struct block_device;
101 101
102extern __uint64_t xfs_max_file_offset(unsigned int); 102extern __uint64_t xfs_max_file_offset(unsigned int);
103 103
104extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp,
105 struct xfs_inode *ip);
106
107extern void xfs_flush_inode(struct xfs_inode *); 104extern void xfs_flush_inode(struct xfs_inode *);
108extern void xfs_flush_device(struct xfs_inode *); 105extern void xfs_flush_device(struct xfs_inode *);
109 106
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 25488b6d9881..b52528bbbfff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -33,7 +33,7 @@
33 33
34 34
35/* 35/*
36 * Dedicated vnode inactive/reclaim sync semaphores. 36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key. 37 * Prime number of hash buckets since address is used as the key.
38 */ 38 */
39#define NVSYNC 37 39#define NVSYNC 37
@@ -82,24 +82,6 @@ vn_ioerror(
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); 82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83} 83}
84 84
85
86/*
87 * Add a reference to a referenced vnode.
88 */
89bhv_vnode_t *
90vn_hold(
91 bhv_vnode_t *vp)
92{
93 struct inode *inode;
94
95 XFS_STATS_INC(vn_hold);
96
97 inode = igrab(vn_to_inode(vp));
98 ASSERT(inode);
99
100 return vp;
101}
102
103#ifdef XFS_INODE_TRACE 85#ifdef XFS_INODE_TRACE
104 86
105/* 87/*
@@ -108,7 +90,7 @@ vn_hold(
108 */ 90 */
109static inline int xfs_icount(struct xfs_inode *ip) 91static inline int xfs_icount(struct xfs_inode *ip)
110{ 92{
111 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 93 struct inode *vp = VFS_I(ip);
112 94
113 if (vp) 95 if (vp)
114 return vn_count(vp); 96 return vn_count(vp);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 41ca2cec5d31..683ce16210ff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -22,20 +22,6 @@ struct file;
22struct xfs_iomap; 22struct xfs_iomap;
23struct attrlist_cursor_kern; 23struct attrlist_cursor_kern;
24 24
25typedef struct inode bhv_vnode_t;
26
27/*
28 * Vnode to Linux inode mapping.
29 */
30static inline bhv_vnode_t *vn_from_inode(struct inode *inode)
31{
32 return inode;
33}
34static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
35{
36 return vnode;
37}
38
39/* 25/*
40 * Return values for xfs_inactive. A return value of 26 * Return values for xfs_inactive. A return value of
41 * VN_INACTIVE_NOCACHE implies that the file system behavior 27 * VN_INACTIVE_NOCACHE implies that the file system behavior
@@ -76,57 +62,52 @@ extern void vn_iowait(struct xfs_inode *ip);
76extern void vn_iowake(struct xfs_inode *ip); 62extern void vn_iowake(struct xfs_inode *ip);
77extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); 63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
78 64
79static inline int vn_count(bhv_vnode_t *vp) 65static inline int vn_count(struct inode *vp)
80{ 66{
81 return atomic_read(&vn_to_inode(vp)->i_count); 67 return atomic_read(&vp->i_count);
82} 68}
83 69
84/* 70#define IHOLD(ip) \
85 * Vnode reference counting functions (and macros for compatibility). 71do { \
86 */ 72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
87extern bhv_vnode_t *vn_hold(bhv_vnode_t *); 73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
88 76
89#if defined(XFS_INODE_TRACE) 77#define IRELE(ip) \
90#define VN_HOLD(vp) \ 78do { \
91 ((void)vn_hold(vp), \ 79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
92 xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address)) 80 iput(VFS_I(ip)); \
93#define VN_RELE(vp) \ 81} while (0)
94 (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \
95 iput(vn_to_inode(vp)))
96#else
97#define VN_HOLD(vp) ((void)vn_hold(vp))
98#define VN_RELE(vp) (iput(vn_to_inode(vp)))
99#endif
100 82
101static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) 83static inline struct inode *vn_grab(struct inode *vp)
102{ 84{
103 struct inode *inode = igrab(vn_to_inode(vp)); 85 return igrab(vp);
104 return inode ? vn_from_inode(inode) : NULL;
105} 86}
106 87
107/* 88/*
108 * Dealing with bad inodes 89 * Dealing with bad inodes
109 */ 90 */
110static inline int VN_BAD(bhv_vnode_t *vp) 91static inline int VN_BAD(struct inode *vp)
111{ 92{
112 return is_bad_inode(vn_to_inode(vp)); 93 return is_bad_inode(vp);
113} 94}
114 95
115/* 96/*
116 * Extracting atime values in various formats 97 * Extracting atime values in various formats
117 */ 98 */
118static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) 99static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
119{ 100{
120 bs_atime->tv_sec = vp->i_atime.tv_sec; 101 bs_atime->tv_sec = vp->i_atime.tv_sec;
121 bs_atime->tv_nsec = vp->i_atime.tv_nsec; 102 bs_atime->tv_nsec = vp->i_atime.tv_nsec;
122} 103}
123 104
124static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) 105static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
125{ 106{
126 *ts = vp->i_atime; 107 *ts = vp->i_atime;
127} 108}
128 109
129static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) 110static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
130{ 111{
131 *tt = vp->i_atime.tv_sec; 112 *tt = vp->i_atime.tv_sec;
132} 113}
@@ -134,9 +115,9 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
134/* 115/*
135 * Some useful predicates. 116 * Some useful predicates.
136 */ 117 */
137#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) 118#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
138#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) 119#define VN_CACHED(vp) (vp->i_mapping->nrpages)
139#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ 120#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
140 PAGECACHE_TAG_DIRTY) 121 PAGECACHE_TAG_DIRTY)
141 122
142 123
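
vn_hold(), deleted from xfs_vnode.c above, went through igrab(), which takes the global inode lock and refuses inodes that are being freed. The replacement IHOLD() is only legal when the caller already owns a reference, so it can assert that and do a bare atomic_inc() on i_count; IRELE() stays a traced iput(). Usage sketch (the worker functions are hypothetical):

static void hand_off_inode(struct xfs_inode *ip)
{
	IHOLD(ip);			/* extra ref for the async worker */
	queue_inode_work(ip);		/* hypothetical consumer ... */
}

static void inode_work_done(struct xfs_inode *ip)
{
	IRELE(ip);			/* ... which drops it via iput() */
}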
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index fc9f3fb39b7b..f2705f2fd43c 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,11 +101,18 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 initnsema(&dqp->q_flock, 1, "fdq");
105 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
106 105
106 /*
107 * Because we want to use a counting completion, complete
108 * the flush completion once to allow a single access to
109 * the flush completion without blocking.
110 */
111 init_completion(&dqp->q_flush);
112 complete(&dqp->q_flush);
113
107#ifdef XFS_DQUOT_TRACE 114#ifdef XFS_DQUOT_TRACE
108 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); 115 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
109 xfs_dqtrace_entry(dqp, "DQINIT"); 116 xfs_dqtrace_entry(dqp, "DQINIT");
110#endif 117#endif
111 } else { 118 } else {
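
Alongside the flush-lock conversion, note the quieter change to the trace buffer allocation: KM_SLEEP becomes KM_NOFS. XFS's KM_* flags are translated to page-allocator GFP_* flags, and GFP_NOFS forbids direct reclaim from re-entering filesystem code, which matters at an allocation site that may already hold fs locks. The idea of the mapping, assuming the usual kmem_flags_convert() behaviour:

/* Sketch only; the real translation is kmem_flags_convert() in
 * XFS's kmem wrapper. */
static gfp_t km_to_gfp(unsigned int km_flags)
{
	if (km_flags & KM_NOFS)
		return GFP_NOFS;	/* reclaim must not re-enter the fs */
	return GFP_KERNEL;		/* KM_SLEEP: full reclaim allowed */
}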
@@ -150,7 +157,6 @@ xfs_qm_dqdestroy(
150 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 157 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
151 158
152 mutex_destroy(&dqp->q_qlock); 159 mutex_destroy(&dqp->q_qlock);
153 freesema(&dqp->q_flock);
154 sv_destroy(&dqp->q_pinwait); 160 sv_destroy(&dqp->q_pinwait);
155 161
156#ifdef XFS_DQUOT_TRACE 162#ifdef XFS_DQUOT_TRACE
@@ -431,7 +437,7 @@ xfs_qm_dqalloc(
431 * when it unlocks the inode. Since we want to keep the quota 437 * when it unlocks the inode. Since we want to keep the quota
432 * inode around, we bump the vnode ref count now. 438 * inode around, we bump the vnode ref count now.
433 */ 439 */
434 VN_HOLD(XFS_ITOV(quotip)); 440 IHOLD(quotip);
435 441
436 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); 442 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
437 nmaps = 1; 443 nmaps = 1;
@@ -1211,7 +1217,7 @@ xfs_qm_dqflush(
1211 int error; 1217 int error;
1212 1218
1213 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1219 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1214 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); 1220 ASSERT(!completion_done(&dqp->q_flush));
1215 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1216 1222
1217 /* 1223 /*
@@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done(
1348 xfs_dqfunlock(dqp); 1354 xfs_dqfunlock(dqp);
1349} 1355}
1350 1356
1351
1352int
1353xfs_qm_dqflock_nowait(
1354 xfs_dquot_t *dqp)
1355{
1356 int locked;
1357
1358 locked = cpsema(&((dqp)->q_flock));
1359
1360 /* XXX ifdef these out */
1361 if (locked)
1362 (dqp)->dq_flags |= XFS_DQ_FLOCKED;
1363 return (locked);
1364}
1365
1366
1367int 1357int
1368xfs_qm_dqlock_nowait( 1358xfs_qm_dqlock_nowait(
1369 xfs_dquot_t *dqp) 1359 xfs_dquot_t *dqp)
1370{ 1360{
1371 return (mutex_trylock(&((dqp)->q_qlock))); 1361 return mutex_trylock(&dqp->q_qlock);
1372} 1362}
1373 1363
1374void 1364void
1375xfs_dqlock( 1365xfs_dqlock(
1376 xfs_dquot_t *dqp) 1366 xfs_dquot_t *dqp)
1377{ 1367{
1378 mutex_lock(&(dqp->q_qlock)); 1368 mutex_lock(&dqp->q_qlock);
1379} 1369}
1380 1370
1381void 1371void
@@ -1468,7 +1458,7 @@ xfs_qm_dqpurge(
1468 * if we're turning off quotas. Basically, we need this flush 1458 * if we're turning off quotas. Basically, we need this flush
1469 * lock, and are willing to block on it. 1459 * lock, and are willing to block on it.
1470 */ 1460 */
1471 if (! xfs_qm_dqflock_nowait(dqp)) { 1461 if (!xfs_dqflock_nowait(dqp)) {
1472 /* 1462 /*
1473 * Block on the flush lock after nudging dquot buffer, 1463 * Block on the flush lock after nudging dquot buffer,
1474 * if it is incore. 1464 * if it is incore.
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index f7393bba4e95..8958d0faf8d3 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -82,7 +82,7 @@ typedef struct xfs_dquot {
82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ 82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 sema_t q_flock; /* flush lock */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 uint q_pincount; /* pin count for this dquot */
87 sv_t q_pinwait; /* sync var for pinning */ 87 sv_t q_pinwait; /* sync var for pinning */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
@@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
113 113
114 114
115/* 115/*
116 * The following three routines simply manage the q_flock 116 * Manage the q_flush completion queue embedded in the dquot. This completion
117 * semaphore embedded in the dquot. This semaphore synchronizes 117 * queue synchronizes processes attempting to flush the in-core dquot back to
118 * processes attempting to flush the in-core dquot back to disk. 118 * disk.
119 */ 119 */
120#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ 120static inline void xfs_dqflock(xfs_dquot_t *dqp)
121 (dqp)->dq_flags |= XFS_DQ_FLOCKED; } 121{
122#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ 122 wait_for_completion(&dqp->q_flush);
123 vsema(&((dqp)->q_flock)); \ 123}
124 (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } 124
125static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
126{
127 return try_wait_for_completion(&dqp->q_flush);
128}
129
130static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
131{
132 complete(&dqp->q_flush);
133}
125 134
126#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
127#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) 135#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
128#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 136#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
129#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 137#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -167,7 +175,6 @@ extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
167extern int xfs_qm_dqpurge(xfs_dquot_t *); 175extern int xfs_qm_dqpurge(xfs_dquot_t *);
168extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); 176extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
169extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); 177extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
170extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
171extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); 178extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
172extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, 179extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
173 xfs_disk_dquot_t *); 180 xfs_disk_dquot_t *);
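
A completion that is initialized and then completed once behaves like a binary semaphore, which is exactly how q_flush replaces the old sema_t: wait_for_completion() is "lock", try_wait_for_completion() is "trylock" (added to mainline around this time), complete() is "unlock", and !completion_done() answers "is it held?", as the ASSERT changes elsewhere in this patch show. Distilled:

#include <linux/completion.h>

static struct completion flush;

static void flush_lock_init(void)
{
	init_completion(&flush);
	complete(&flush);		/* start "unlocked": done count = 1 */
}

static void flush_lock(void)		/* done 1 -> 0, or sleep */
{
	wait_for_completion(&flush);
}

static int flush_trylock(void)		/* nonzero on success */
{
	return try_wait_for_completion(&flush);
}

static void flush_unlock(void)		/* done 0 -> 1, wake one waiter */
{
	complete(&flush);
}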
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 08d2fc89e6a1..f028644caa5e 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push(
151 dqp = logitem->qli_dquot; 151 dqp = logitem->qli_dquot;
152 152
153 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 153 ASSERT(XFS_DQ_IS_LOCKED(dqp));
154 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); 154 ASSERT(!completion_done(&dqp->q_flush));
155 155
156 /* 156 /*
157 * Since we were able to lock the dquot's flush lock and 157 * Since we were able to lock the dquot's flush lock and
@@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf(
245 * inode flush completed and the inode was taken off the AIL. 245 * inode flush completed and the inode was taken off the AIL.
246 * So, just get out. 246 * So, just get out.
247 */ 247 */
248 if (!issemalocked(&(dqp->q_flock)) || 248 if (completion_done(&dqp->q_flush) ||
249 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 249 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
250 qip->qli_pushbuf_flag = 0; 250 qip->qli_pushbuf_flag = 0;
251 xfs_dqunlock(dqp); 251 xfs_dqunlock(dqp);
@@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf(
258 if (bp != NULL) { 258 if (bp != NULL) {
259 if (XFS_BUF_ISDELAYWRITE(bp)) { 259 if (XFS_BUF_ISDELAYWRITE(bp)) {
260 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 260 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
261 issemalocked(&(dqp->q_flock))); 261 !completion_done(&dqp->q_flush));
262 qip->qli_pushbuf_flag = 0; 262 qip->qli_pushbuf_flag = 0;
263 xfs_dqunlock(dqp); 263 xfs_dqunlock(dqp);
264 264
@@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock(
317 return (XFS_ITEM_LOCKED); 317 return (XFS_ITEM_LOCKED);
318 318
319 retval = XFS_ITEM_SUCCESS; 319 retval = XFS_ITEM_SUCCESS;
320 if (! xfs_qm_dqflock_nowait(dqp)) { 320 if (!xfs_dqflock_nowait(dqp)) {
321 /* 321 /*
322 * The dquot is already being flushed. It may have been 322 * The dquot is already being flushed. It may have been
323 * flushed delayed write, however, and we don't want to 323 * flushed delayed write, however, and we don't want to
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 021934a3d456..df0ffef9775a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy(
310 */ 310 */
311void 311void
312xfs_qm_mount_quotas( 312xfs_qm_mount_quotas(
313 xfs_mount_t *mp, 313 xfs_mount_t *mp)
314 int mfsi_flags)
315{ 314{
316 int error = 0; 315 int error = 0;
317 uint sbf; 316 uint sbf;
@@ -346,8 +345,7 @@ xfs_qm_mount_quotas(
346 /* 345 /*
347 * If any of the quotas are not consistent, do a quotacheck. 346 * If any of the quotas are not consistent, do a quotacheck.
348 */ 347 */
349 if (XFS_QM_NEED_QUOTACHECK(mp) && 348 if (XFS_QM_NEED_QUOTACHECK(mp)) {
350 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
351 error = xfs_qm_quotacheck(mp); 349 error = xfs_qm_quotacheck(mp);
352 if (error) { 350 if (error) {
353 /* Quotacheck failed and disabled quotas. */ 351 /* Quotacheck failed and disabled quotas. */
@@ -484,7 +482,7 @@ again:
484 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); 482 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
485 /* XXX a sentinel would be better */ 483 /* XXX a sentinel would be better */
486 recl = XFS_QI_MPLRECLAIMS(mp); 484 recl = XFS_QI_MPLRECLAIMS(mp);
487 if (! xfs_qm_dqflock_nowait(dqp)) { 485 if (!xfs_dqflock_nowait(dqp)) {
488 /* 486 /*
489 * If we can't grab the flush lock then check 487 * If we can't grab the flush lock then check
490 * to see if the dquot has been flushed delayed 488 * to see if the dquot has been flushed delayed
@@ -1062,7 +1060,7 @@ xfs_qm_sync(
1062 1060
1063 /* XXX a sentinel would be better */ 1061 /* XXX a sentinel would be better */
1064 recl = XFS_QI_MPLRECLAIMS(mp); 1062 recl = XFS_QI_MPLRECLAIMS(mp);
1065 if (! xfs_qm_dqflock_nowait(dqp)) { 1063 if (!xfs_dqflock_nowait(dqp)) {
1066 if (nowait) { 1064 if (nowait) {
1067 xfs_dqunlock(dqp); 1065 xfs_dqunlock(dqp);
1068 continue; 1066 continue;
@@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist(
2079 * Try to grab the flush lock. If this dquot is in the process of 2077 * Try to grab the flush lock. If this dquot is in the process of
2080 * getting flushed to disk, we don't want to reclaim it. 2078 * getting flushed to disk, we don't want to reclaim it.
2081 */ 2079 */
2082 if (! xfs_qm_dqflock_nowait(dqp)) { 2080 if (!xfs_dqflock_nowait(dqp)) {
2083 xfs_dqunlock(dqp); 2081 xfs_dqunlock(dqp);
2084 dqp = dqp->dq_flnext; 2082 dqp = dqp->dq_flnext;
2085 continue; 2083 continue;
@@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void)
2257 * Try to grab the flush lock. If this dquot is in the process of 2255 * Try to grab the flush lock. If this dquot is in the process of
2258 * getting flushed to disk, we don't want to reclaim it. 2256 * getting flushed to disk, we don't want to reclaim it.
2259 */ 2257 */
2260 if (! xfs_qm_dqflock_nowait(dqp)) { 2258 if (!xfs_dqflock_nowait(dqp)) {
2261 xfs_dqunlock(dqp); 2259 xfs_dqunlock(dqp);
2262 continue; 2260 continue;
2263 } 2261 }
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index cd2300e374af..44f25349e478 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) 165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
166 166
167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *, int); 168extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 169extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 171extern int xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f4f6c4c861d7..eea2e60b456b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -162,7 +162,7 @@ xfs_qm_newmount(
162 * mounting, and get on with the boring life 162 * mounting, and get on with the boring life
163 * without disk quotas. 163 * without disk quotas.
164 */ 164 */
165 xfs_qm_mount_quotas(mp, 0); 165 xfs_qm_mount_quotas(mp);
166 } else { 166 } else {
167 /* 167 /*
168 * Clear the quota flags, but remember them. This 168 * Clear the quota flags, but remember them. This
@@ -184,13 +184,12 @@ STATIC int
184xfs_qm_endmount( 184xfs_qm_endmount(
185 xfs_mount_t *mp, 185 xfs_mount_t *mp,
186 uint needquotamount, 186 uint needquotamount,
187 uint quotaflags, 187 uint quotaflags)
188 int mfsi_flags)
189{ 188{
190 if (needquotamount) { 189 if (needquotamount) {
191 ASSERT(mp->m_qflags == 0); 190 ASSERT(mp->m_qflags == 0);
192 mp->m_qflags = quotaflags; 191 mp->m_qflags = quotaflags;
193 xfs_qm_mount_quotas(mp, mfsi_flags); 192 xfs_qm_mount_quotas(mp);
194 } 193 }
195 194
196#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) 195#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index adfb8723f65a..1a3b803dfa55 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes(
1034{ 1034{
1035 xfs_inode_t *ip, *topino; 1035 xfs_inode_t *ip, *topino;
1036 uint ireclaims; 1036 uint ireclaims;
1037 bhv_vnode_t *vp; 1037 struct inode *vp;
1038 boolean_t vnode_refd; 1038 boolean_t vnode_refd;
1039 1039
1040 ASSERT(mp->m_quotainfo); 1040 ASSERT(mp->m_quotainfo);
@@ -1059,7 +1059,7 @@ again:
1059 ip = ip->i_mnext; 1059 ip = ip->i_mnext;
1060 continue; 1060 continue;
1061 } 1061 }
1062 vp = XFS_ITOV_NULL(ip); 1062 vp = VFS_I(ip);
1063 if (!vp) { 1063 if (!vp) {
1064 ASSERT(ip->i_udquot == NULL); 1064 ASSERT(ip->i_udquot == NULL);
1065 ASSERT(ip->i_gdquot == NULL); 1065 ASSERT(ip->i_gdquot == NULL);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 3e4648ad9cfc..b2f639a1416f 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,15 +37,15 @@
37#include <linux/capability.h> 37#include <linux/capability.h>
38#include <linux/posix_acl_xattr.h> 38#include <linux/posix_acl_xattr.h>
39 39
40STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); 40STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); 41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
42STATIC void xfs_acl_get_endian(xfs_acl_t *); 42STATIC void xfs_acl_get_endian(xfs_acl_t *);
43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); 43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
44STATIC int xfs_acl_invalid(xfs_acl_t *); 44STATIC int xfs_acl_invalid(xfs_acl_t *);
45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); 45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
46STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); 46STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
47STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); 47STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
48STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); 48STATIC int xfs_acl_allow_set(struct inode *, int);
49 49
50kmem_zone_t *xfs_acl_zone; 50kmem_zone_t *xfs_acl_zone;
51 51
@@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
55 */ 55 */
56int 56int
57xfs_acl_vhasacl_access( 57xfs_acl_vhasacl_access(
58 bhv_vnode_t *vp) 58 struct inode *vp)
59{ 59{
60 int error; 60 int error;
61 61
@@ -68,7 +68,7 @@ xfs_acl_vhasacl_access(
68 */ 68 */
69int 69int
70xfs_acl_vhasacl_default( 70xfs_acl_vhasacl_default(
71 bhv_vnode_t *vp) 71 struct inode *vp)
72{ 72{
73 int error; 73 int error;
74 74
@@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr(
207 207
208int 208int
209xfs_acl_vget( 209xfs_acl_vget(
210 bhv_vnode_t *vp, 210 struct inode *vp,
211 void *acl, 211 void *acl,
212 size_t size, 212 size_t size,
213 int kind) 213 int kind)
@@ -217,7 +217,6 @@ xfs_acl_vget(
217 posix_acl_xattr_header *ext_acl = acl; 217 posix_acl_xattr_header *ext_acl = acl;
218 int flags = 0; 218 int flags = 0;
219 219
220 VN_HOLD(vp);
221 if(size) { 220 if(size) {
222 if (!(_ACL_ALLOC(xfs_acl))) { 221 if (!(_ACL_ALLOC(xfs_acl))) {
223 error = ENOMEM; 222 error = ENOMEM;
@@ -239,11 +238,10 @@ xfs_acl_vget(
239 goto out; 238 goto out;
240 } 239 }
241 if (kind == _ACL_TYPE_ACCESS) 240 if (kind == _ACL_TYPE_ACCESS)
242 xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); 241 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
243 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); 242 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
244 } 243 }
245out: 244out:
246 VN_RELE(vp);
247 if(xfs_acl) 245 if(xfs_acl)
248 _ACL_FREE(xfs_acl); 246 _ACL_FREE(xfs_acl);
249 return -error; 247 return -error;
@@ -251,28 +249,26 @@ out:
251 249
252int 250int
253xfs_acl_vremove( 251xfs_acl_vremove(
254 bhv_vnode_t *vp, 252 struct inode *vp,
255 int kind) 253 int kind)
256{ 254{
257 int error; 255 int error;
258 256
259 VN_HOLD(vp);
260 error = xfs_acl_allow_set(vp, kind); 257 error = xfs_acl_allow_set(vp, kind);
261 if (!error) { 258 if (!error) {
262 error = xfs_attr_remove(xfs_vtoi(vp), 259 error = xfs_attr_remove(XFS_I(vp),
263 kind == _ACL_TYPE_DEFAULT? 260 kind == _ACL_TYPE_DEFAULT?
264 SGI_ACL_DEFAULT: SGI_ACL_FILE, 261 SGI_ACL_DEFAULT: SGI_ACL_FILE,
265 ATTR_ROOT); 262 ATTR_ROOT);
266 if (error == ENOATTR) 263 if (error == ENOATTR)
267 error = 0; /* 'scool */ 264 error = 0; /* 'scool */
268 } 265 }
269 VN_RELE(vp);
270 return -error; 266 return -error;
271} 267}
272 268
273int 269int
274xfs_acl_vset( 270xfs_acl_vset(
275 bhv_vnode_t *vp, 271 struct inode *vp,
276 void *acl, 272 void *acl,
277 size_t size, 273 size_t size,
278 int kind) 274 int kind)
@@ -298,7 +294,6 @@ xfs_acl_vset(
298 return 0; 294 return 0;
299 } 295 }
300 296
301 VN_HOLD(vp);
302 error = xfs_acl_allow_set(vp, kind); 297 error = xfs_acl_allow_set(vp, kind);
303 298
304 /* Incoming ACL exists, set file mode based on its value */ 299 /* Incoming ACL exists, set file mode based on its value */
@@ -321,7 +316,6 @@ xfs_acl_vset(
321 } 316 }
322 317
323out: 318out:
324 VN_RELE(vp);
325 _ACL_FREE(xfs_acl); 319 _ACL_FREE(xfs_acl);
326 return -error; 320 return -error;
327} 321}
@@ -363,7 +357,7 @@ xfs_acl_iaccess(
363 357
364STATIC int 358STATIC int
365xfs_acl_allow_set( 359xfs_acl_allow_set(
366 bhv_vnode_t *vp, 360 struct inode *vp,
367 int kind) 361 int kind)
368{ 362{
369 if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) 363 if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
@@ -372,7 +366,7 @@ xfs_acl_allow_set(
372 return ENOTDIR; 366 return ENOTDIR;
373 if (vp->i_sb->s_flags & MS_RDONLY) 367 if (vp->i_sb->s_flags & MS_RDONLY)
374 return EROFS; 368 return EROFS;
375 if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) 369 if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
376 return EPERM; 370 return EPERM;
377 return 0; 371 return 0;
378} 372}
@@ -566,7 +560,7 @@ xfs_acl_get_endian(
566 */ 560 */
567STATIC void 561STATIC void
568xfs_acl_get_attr( 562xfs_acl_get_attr(
569 bhv_vnode_t *vp, 563 struct inode *vp,
570 xfs_acl_t *aclp, 564 xfs_acl_t *aclp,
571 int kind, 565 int kind,
572 int flags, 566 int flags,
@@ -576,7 +570,7 @@ xfs_acl_get_attr(
576 570
577 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); 571 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
578 flags |= ATTR_ROOT; 572 flags |= ATTR_ROOT;
579 *error = xfs_attr_get(xfs_vtoi(vp), 573 *error = xfs_attr_get(XFS_I(vp),
580 kind == _ACL_TYPE_ACCESS ? 574 kind == _ACL_TYPE_ACCESS ?
581 SGI_ACL_FILE : SGI_ACL_DEFAULT, 575 SGI_ACL_FILE : SGI_ACL_DEFAULT,
582 (char *)aclp, &len, flags); 576 (char *)aclp, &len, flags);
@@ -590,7 +584,7 @@ xfs_acl_get_attr(
590 */ 584 */
591STATIC void 585STATIC void
592xfs_acl_set_attr( 586xfs_acl_set_attr(
593 bhv_vnode_t *vp, 587 struct inode *vp,
594 xfs_acl_t *aclp, 588 xfs_acl_t *aclp,
595 int kind, 589 int kind,
596 int *error) 590 int *error)
@@ -615,7 +609,7 @@ xfs_acl_set_attr(
615 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); 609 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
616 } 610 }
617 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); 611 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
618 *error = xfs_attr_set(xfs_vtoi(vp), 612 *error = xfs_attr_set(XFS_I(vp),
619 kind == _ACL_TYPE_ACCESS ? 613 kind == _ACL_TYPE_ACCESS ?
620 SGI_ACL_FILE: SGI_ACL_DEFAULT, 614 SGI_ACL_FILE: SGI_ACL_DEFAULT,
621 (char *)newacl, len, ATTR_ROOT); 615 (char *)newacl, len, ATTR_ROOT);
@@ -624,7 +618,7 @@ xfs_acl_set_attr(
624 618
625int 619int
626xfs_acl_vtoacl( 620xfs_acl_vtoacl(
627 bhv_vnode_t *vp, 621 struct inode *vp,
628 xfs_acl_t *access_acl, 622 xfs_acl_t *access_acl,
629 xfs_acl_t *default_acl) 623 xfs_acl_t *default_acl)
630{ 624{
@@ -639,7 +633,7 @@ xfs_acl_vtoacl(
639 if (error) 633 if (error)
640 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; 634 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
641 else /* We have a good ACL and the file mode, synchronize. */ 635 else /* We have a good ACL and the file mode, synchronize. */
642 xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); 636 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
643 } 637 }
644 638
645 if (default_acl) { 639 if (default_acl) {
@@ -656,7 +650,7 @@ xfs_acl_vtoacl(
656 */ 650 */
657int 651int
658xfs_acl_inherit( 652xfs_acl_inherit(
659 bhv_vnode_t *vp, 653 struct inode *vp,
660 mode_t mode, 654 mode_t mode,
661 xfs_acl_t *pdaclp) 655 xfs_acl_t *pdaclp)
662{ 656{
@@ -715,7 +709,7 @@ out_error:
715 */ 709 */
716STATIC int 710STATIC int
717xfs_acl_setmode( 711xfs_acl_setmode(
718 bhv_vnode_t *vp, 712 struct inode *vp,
719 xfs_acl_t *acl, 713 xfs_acl_t *acl,
720 int *basicperms) 714 int *basicperms)
721{ 715{
@@ -734,7 +728,7 @@ xfs_acl_setmode(
734 * mode. The m:: bits take precedence over the g:: bits. 728 * mode. The m:: bits take precedence over the g:: bits.
735 */ 729 */
736 iattr.ia_valid = ATTR_MODE; 730 iattr.ia_valid = ATTR_MODE;
737 iattr.ia_mode = xfs_vtoi(vp)->i_d.di_mode; 731 iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
738 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); 732 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
739 ap = acl->acl_entry; 733 ap = acl->acl_entry;
740 for (i = 0; i < acl->acl_cnt; ++i) { 734 for (i = 0; i < acl->acl_cnt; ++i) {
@@ -764,7 +758,7 @@ xfs_acl_setmode(
764 if (gap && nomask) 758 if (gap && nomask)
765 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
766 760
767 return xfs_setattr(xfs_vtoi(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
768} 762}
769 763
770/* 764/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 323ee94cf831..a4e293b93efa 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -59,14 +59,14 @@ extern struct kmem_zone *xfs_acl_zone;
59 (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) 59 (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
60#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) 60#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone)
61 61
62extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *); 62extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
63extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); 63extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
64extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *); 64extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
65extern int xfs_acl_vhasacl_access(bhv_vnode_t *); 65extern int xfs_acl_vhasacl_access(struct inode *);
66extern int xfs_acl_vhasacl_default(bhv_vnode_t *); 66extern int xfs_acl_vhasacl_default(struct inode *);
67extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int); 67extern int xfs_acl_vset(struct inode *, void *, size_t, int);
68extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int); 68extern int xfs_acl_vget(struct inode *, void *, size_t, int);
69extern int xfs_acl_vremove(bhv_vnode_t *, int); 69extern int xfs_acl_vremove(struct inode *, int);
70 70
71#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) 71#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
72 72
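
The bhv_vnode_t to struct inode conversion above works because the XFS inode is reachable straight from the Linux inode. A minimal sketch of the XFS_I() mapping the new prototypes rely on, assuming the i_private linkage set up in the xfs_iget.c hunk further down (illustrative, not the patch itself):

	static inline struct xfs_inode *
	XFS_I(struct inode *inode)
	{
		/* the Linux inode carries the XFS inode in i_private */
		return inode->i_private;
	}

With that in place, every xfs_vtoi(vp) caller becomes an XFS_I(vp) caller with no behavioural change.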
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index f9472a2076d4..0b3b5efe848c 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -92,16 +92,6 @@
92 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ 92 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
93 } 93 }
94 94
95/* define generic INT_ macros */
96
97#define INT_GET(reference,arch) \
98 (((arch) == ARCH_NOCONVERT) \
99 ? \
100 (reference) \
101 : \
102 INT_SWAP((reference),(reference)) \
103 )
104
105/* does not return a value */ 95/* does not return a value */
106#define INT_SET(reference,arch,valueref) \ 96#define INT_SET(reference,arch,valueref) \
107 (__builtin_constant_p(valueref) ? \ 97 (__builtin_constant_p(valueref) ? \
@@ -112,64 +102,6 @@
112 ) \ 102 ) \
113 ) 103 )
114 104
115/* does not return a value */
116#define INT_MOD_EXPR(reference,arch,code) \
117 (((arch) == ARCH_NOCONVERT) \
118 ? \
119 (void)((reference) code) \
120 : \
121 (void)( \
122 (reference) = INT_GET((reference),arch) , \
123 ((reference) code), \
124 INT_SET(reference, arch, reference) \
125 ) \
126 )
127
128/* does not return a value */
129#define INT_MOD(reference,arch,delta) \
130 (void)( \
131 INT_MOD_EXPR(reference,arch,+=(delta)) \
132 )
133
134/*
135 * INT_COPY - copy a value between two locations with the
136 * _same architecture_ but _potentially different sizes_
137 *
138 * if the types of the two parameters are equal or they are
139 * in native architecture, a simple copy is done
140 *
141 * otherwise, architecture conversions are done
142 *
143 */
144
145/* does not return a value */
146#define INT_COPY(dst,src,arch) \
147 ( \
148 ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
149 ? \
150 (void)((dst) = (src)) \
151 : \
152 INT_SET(dst, arch, INT_GET(src, arch)) \
153 )
154
155/*
156 * INT_XLATE - copy a value in either direction between two locations
157 * with different architectures
158 *
159 * dir < 0 - copy from memory to buffer (native to arch)
160 * dir > 0 - copy from buffer to memory (arch to native)
161 */
162
163/* does not return a value */
164#define INT_XLATE(buf,mem,dir,arch) {\
165 ASSERT(dir); \
166 if (dir>0) { \
167 (mem)=INT_GET(buf, arch); \
168 } else { \
169 INT_SET(buf, arch, mem); \
170 } \
171}
172
173/* 105/*
174 * In directories inode numbers are stored as unaligned arrays of unsigned 106 * In directories inode numbers are stored as unaligned arrays of unsigned
175 * 8bit integers on disk. 107 * 8bit integers on disk.
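
The generic INT_GET/INT_MOD/INT_COPY/INT_XLATE macros removed above had no remaining users; INT_SET stays because the ACL code earlier in this series still calls it. New code spells the same conversions with the be*_to_cpu helpers, as the other hunks here do. A rough sketch of the equivalence for a big-endian on-disk field (hypothetical helper names, not from the patch):

	#include <asm/byteorder.h>

	/* what INT_GET(x, ARCH_CONVERT) boiled down to on disk-order data */
	static inline __u32 int_get_be32(const __be32 *p)
	{
		return be32_to_cpu(*p);
	}

	/* and the INT_SET direction */
	static inline void int_set_be32(__be32 *p, __u32 v)
	{
		*p = cpu_to_be32(v);
	}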
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 78de80e3caa2..f7cdc28aff41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -194,6 +194,46 @@ xfs_attr_get(
194 return(error); 194 return(error);
195} 195}
196 196
197/*
 198 * Calculate how many blocks we need for the new attribute.
199 */
200int
201xfs_attr_calc_size(
202 struct xfs_inode *ip,
203 int namelen,
204 int valuelen,
205 int *local)
206{
207 struct xfs_mount *mp = ip->i_mount;
208 int size;
209 int nblks;
210
211 /*
212 * Determine space new attribute will use, and if it would be
213 * "local" or "remote" (note: local != inline).
214 */
215 size = xfs_attr_leaf_newentsize(namelen, valuelen,
216 mp->m_sb.sb_blocksize, local);
217
218 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
219 if (*local) {
220 if (size > (mp->m_sb.sb_blocksize >> 1)) {
221 /* Double split possible */
222 nblks *= 2;
223 }
224 } else {
225 /*
226 * Out of line attribute, cannot double split, but
227 * make room for the attribute value itself.
228 */
229 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
230 nblks += dblocks;
231 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
232 }
233
234 return nblks;
235}
236
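
The helper added above feeds the transaction reservation directly. A sketch of the intended call pattern, using the names from the xfs_attr_set_int() hunks below (fragment, not a complete function):

	int local;

	/* blocks needed for the new attribute, local (in-leaf) or remote */
	args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);

	error = xfs_trans_reserve(args.trans, args.total,
				  XFS_ATTRSET_LOG_RES(mp, args.total), 0,
				  XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT);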
197STATIC int 237STATIC int
198xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, 238xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
199 char *value, int valuelen, int flags) 239 char *value, int valuelen, int flags)
@@ -202,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
202 xfs_fsblock_t firstblock; 242 xfs_fsblock_t firstblock;
203 xfs_bmap_free_t flist; 243 xfs_bmap_free_t flist;
204 int error, err2, committed; 244 int error, err2, committed;
205 int local, size;
206 uint nblks;
207 xfs_mount_t *mp = dp->i_mount; 245 xfs_mount_t *mp = dp->i_mount;
208 int rsvd = (flags & ATTR_ROOT) != 0; 246 int rsvd = (flags & ATTR_ROOT) != 0;
247 int local;
209 248
210 /* 249 /*
211 * Attach the dquots to the inode. 250 * Attach the dquots to the inode.
@@ -241,30 +280,8 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
241 args.whichfork = XFS_ATTR_FORK; 280 args.whichfork = XFS_ATTR_FORK;
242 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 281 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
243 282
244 /*
245 * Determine space new attribute will use, and if it would be
246 * "local" or "remote" (note: local != inline).
247 */
248 size = xfs_attr_leaf_newentsize(name->len, valuelen,
249 mp->m_sb.sb_blocksize, &local);
250
251 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
252 if (local) {
253 if (size > (mp->m_sb.sb_blocksize >> 1)) {
254 /* Double split possible */
255 nblks <<= 1;
256 }
257 } else {
258 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
259 /* Out of line attribute, cannot double split, but make
260 * room for the attribute value itself.
261 */
262 nblks += dblocks;
263 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
264 }
265
266 /* Size is now blocks for attribute data */ 283 /* Size is now blocks for attribute data */
267 args.total = nblks; 284 args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
268 285
269 /* 286 /*
270 * Start our first transaction of the day. 287 * Start our first transaction of the day.
@@ -286,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
286 if (rsvd) 303 if (rsvd)
287 args.trans->t_flags |= XFS_TRANS_RESERVE; 304 args.trans->t_flags |= XFS_TRANS_RESERVE;
288 305
289 if ((error = xfs_trans_reserve(args.trans, (uint) nblks, 306 if ((error = xfs_trans_reserve(args.trans, args.total,
290 XFS_ATTRSET_LOG_RES(mp, nblks), 307 XFS_ATTRSET_LOG_RES(mp, args.total), 0,
291 0, XFS_TRANS_PERM_LOG_RES, 308 XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
292 XFS_ATTRSET_LOG_COUNT))) {
293 xfs_trans_cancel(args.trans, 0); 309 xfs_trans_cancel(args.trans, 0);
294 return(error); 310 return(error);
295 } 311 }
296 xfs_ilock(dp, XFS_ILOCK_EXCL); 312 xfs_ilock(dp, XFS_ILOCK_EXCL);
297 313
298 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, 314 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
299 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : 315 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
300 XFS_QMOPT_RES_REGBLKS); 316 XFS_QMOPT_RES_REGBLKS);
301 if (error) { 317 if (error) {
302 xfs_iunlock(dp, XFS_ILOCK_EXCL); 318 xfs_iunlock(dp, XFS_ILOCK_EXCL);
303 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); 319 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
@@ -384,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
384 * Commit the leaf transformation. We'll need another (linked) 400 * Commit the leaf transformation. We'll need another (linked)
385 * transaction to add the new attribute to the leaf. 401 * transaction to add the new attribute to the leaf.
386 */ 402 */
387 if ((error = xfs_attr_rolltrans(&args.trans, dp))) 403
404 error = xfs_trans_roll(&args.trans, dp);
405 if (error)
388 goto out; 406 goto out;
389 407
390 } 408 }
@@ -964,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
964 * Commit the current trans (including the inode) and start 982 * Commit the current trans (including the inode) and start
965 * a new one. 983 * a new one.
966 */ 984 */
967 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 985 error = xfs_trans_roll(&args->trans, dp);
986 if (error)
968 return (error); 987 return (error);
969 988
970 /* 989 /*
@@ -978,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
978 * Commit the transaction that added the attr name so that 997 * Commit the transaction that added the attr name so that
979 * later routines can manage their own transactions. 998 * later routines can manage their own transactions.
980 */ 999 */
981 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1000 error = xfs_trans_roll(&args->trans, dp);
1001 if (error)
982 return (error); 1002 return (error);
983 1003
984 /* 1004 /*
@@ -1067,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1067 /* 1087 /*
1068 * Commit the remove and start the next trans in series. 1088 * Commit the remove and start the next trans in series.
1069 */ 1089 */
1070 error = xfs_attr_rolltrans(&args->trans, dp); 1090 error = xfs_trans_roll(&args->trans, dp);
1071 1091
1072 } else if (args->rmtblkno > 0) { 1092 } else if (args->rmtblkno > 0) {
1073 /* 1093 /*
@@ -1298,7 +1318,8 @@ restart:
1298 * Commit the node conversion and start the next 1318 * Commit the node conversion and start the next
1299 * trans in the chain. 1319 * trans in the chain.
1300 */ 1320 */
1301 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1321 error = xfs_trans_roll(&args->trans, dp);
1322 if (error)
1302 goto out; 1323 goto out;
1303 1324
1304 goto restart; 1325 goto restart;
@@ -1349,7 +1370,8 @@ restart:
1349 * Commit the leaf addition or btree split and start the next 1370 * Commit the leaf addition or btree split and start the next
1350 * trans in the chain. 1371 * trans in the chain.
1351 */ 1372 */
1352 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1373 error = xfs_trans_roll(&args->trans, dp);
1374 if (error)
1353 goto out; 1375 goto out;
1354 1376
1355 /* 1377 /*
@@ -1449,7 +1471,8 @@ restart:
1449 /* 1471 /*
1450 * Commit and start the next trans in the chain. 1472 * Commit and start the next trans in the chain.
1451 */ 1473 */
1452 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1474 error = xfs_trans_roll(&args->trans, dp);
1475 if (error)
1453 goto out; 1476 goto out;
1454 1477
1455 } else if (args->rmtblkno > 0) { 1478 } else if (args->rmtblkno > 0) {
@@ -1581,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1581 /* 1604 /*
1582 * Commit the Btree join operation and start a new trans. 1605 * Commit the Btree join operation and start a new trans.
1583 */ 1606 */
1584 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1607 error = xfs_trans_roll(&args->trans, dp);
1608 if (error)
1585 goto out; 1609 goto out;
1586 } 1610 }
1587 1611
@@ -2082,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2082 /* 2106 /*
2083 * Start the next trans in the chain. 2107 * Start the next trans in the chain.
2084 */ 2108 */
2085 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 2109 error = xfs_trans_roll(&args->trans, dp);
2110 if (error)
2086 return (error); 2111 return (error);
2087 } 2112 }
2088 2113
@@ -2232,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2232 /* 2257 /*
2233 * Close out trans and start the next one in the chain. 2258 * Close out trans and start the next one in the chain.
2234 */ 2259 */
2235 if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) 2260 error = xfs_trans_roll(&args->trans, args->dp);
2261 if (error)
2236 return (error); 2262 return (error);
2237 } 2263 }
2238 return(0); 2264 return(0);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 8b2d31c19e4d..fb3b2a68b9b9 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -129,6 +129,7 @@ typedef struct xfs_attr_list_context {
129/* 129/*
130 * Overall external interface routines. 130 * Overall external interface routines.
131 */ 131 */
132int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
132int xfs_attr_inactive(struct xfs_inode *dp); 133int xfs_attr_inactive(struct xfs_inode *dp);
133int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); 134int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
134int xfs_attr_rmtval_get(struct xfs_da_args *args); 135int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 23ef5d7c87e1..79da6b2ea99e 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2498,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2498 /* 2498 /*
2499 * Commit the flag value change and start the next trans in series. 2499 * Commit the flag value change and start the next trans in series.
2500 */ 2500 */
2501 error = xfs_attr_rolltrans(&args->trans, args->dp); 2501 return xfs_trans_roll(&args->trans, args->dp);
2502
2503 return(error);
2504} 2502}
2505 2503
2506/* 2504/*
@@ -2547,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2547 /* 2545 /*
2548 * Commit the flag value change and start the next trans in series. 2546 * Commit the flag value change and start the next trans in series.
2549 */ 2547 */
2550 error = xfs_attr_rolltrans(&args->trans, args->dp); 2548 return xfs_trans_roll(&args->trans, args->dp);
2551
2552 return(error);
2553} 2549}
2554 2550
2555/* 2551/*
@@ -2665,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2665 /* 2661 /*
2666 * Commit the flag value change and start the next trans in series. 2662 * Commit the flag value change and start the next trans in series.
2667 */ 2663 */
2668 error = xfs_attr_rolltrans(&args->trans, args->dp); 2664 error = xfs_trans_roll(&args->trans, args->dp);
2669 2665
2670 return(error); 2666 return(error);
2671} 2667}
@@ -2723,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2723 /* 2719 /*
2724 * Commit the invalidate and start the next transaction. 2720 * Commit the invalidate and start the next transaction.
2725 */ 2721 */
2726 error = xfs_attr_rolltrans(trans, dp); 2722 error = xfs_trans_roll(trans, dp);
2727 2723
2728 return (error); 2724 return (error);
2729} 2725}
@@ -2825,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
2825 /* 2821 /*
2826 * Atomically commit the whole invalidate stuff. 2822 * Atomically commit the whole invalidate stuff.
2827 */ 2823 */
2828 if ((error = xfs_attr_rolltrans(trans, dp))) 2824 error = xfs_trans_roll(trans, dp);
2825 if (error)
2829 return (error); 2826 return (error);
2830 } 2827 }
2831 2828
@@ -2964,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2964 /* 2961 /*
2965 * Roll to next transaction. 2962 * Roll to next transaction.
2966 */ 2963 */
2967 if ((error = xfs_attr_rolltrans(trans, dp))) 2964 error = xfs_trans_roll(trans, dp);
2965 if (error)
2968 return (error); 2966 return (error);
2969 } 2967 }
2970 2968
@@ -2974,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2974 2972
2975 return(0); 2973 return(0);
2976} 2974}
2977
2978
2979/*
2980 * Roll from one trans in the sequence of PERMANENT transactions to the next.
2981 */
2982int
2983xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
2984{
2985 xfs_trans_t *trans;
2986 unsigned int logres, count;
2987 int error;
2988
2989 /*
2990 * Ensure that the inode is always logged.
2991 */
2992 trans = *transp;
2993 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
2994
2995 /*
2996 * Copy the critical parameters from one trans to the next.
2997 */
2998 logres = trans->t_log_res;
2999 count = trans->t_log_count;
3000 *transp = xfs_trans_dup(trans);
3001
3002 /*
3003 * Commit the current transaction.
3004 * If this commit failed, then it'd just unlock those items that
3005 * are not marked ihold. That also means that a filesystem shutdown
3006 * is in progress. The caller takes the responsibility to cancel
3007 * the duplicate transaction that gets returned.
3008 */
3009 if ((error = xfs_trans_commit(trans, 0)))
3010 return (error);
3011
3012 trans = *transp;
3013
3014 /*
3015 * Reserve space in the log for the next transaction.
3016 * This also pushes items in the "AIL", the list of logged items,
3017 * out to disk if they are taking up space at the tail of the log
3018 * that we want to use. This requires that either nothing be locked
3019 * across this call, or that anything that is locked be logged in
3020 * the prior and the next transactions.
3021 */
3022 error = xfs_trans_reserve(trans, 0, logres, 0,
3023 XFS_TRANS_PERM_LOG_RES, count);
3024 /*
3025 * Ensure that the inode is in the new transaction and locked.
3026 */
3027 if (!error) {
3028 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
3029 xfs_trans_ihold(trans, dp);
3030 }
3031 return (error);
3032
3033}
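
Every xfs_attr_rolltrans() call site in this file now goes through xfs_trans_roll(), and the removed body above is the reference for what the generic helper must do. A condensed sketch, assuming xfs_trans_roll() keeps the same duplicate-commit-reserve cycle (illustrative, not the actual implementation):

	int
	xfs_trans_roll_sketch(struct xfs_trans **tpp, struct xfs_inode *dp)
	{
		struct xfs_trans	*trans = *tpp;
		unsigned int		logres = trans->t_log_res;
		unsigned int		count = trans->t_log_count;
		int			error;

		/* keep the inode logged, duplicate, then commit the old trans */
		xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
		*tpp = xfs_trans_dup(trans);
		error = xfs_trans_commit(trans, 0);
		if (error)
			return error;

		/* reserve log space for the duplicate and rejoin the inode */
		error = xfs_trans_reserve(*tpp, 0, logres, 0,
					  XFS_TRANS_PERM_LOG_RES, count);
		if (!error) {
			xfs_trans_ijoin(*tpp, dp, XFS_ILOCK_EXCL);
			xfs_trans_ihold(*tpp, dp);
		}
		return error;
	}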
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 5ecf437b7825..83e9af417ca2 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -274,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
274 struct xfs_dabuf *leaf2_bp); 274 struct xfs_dabuf *leaf2_bp);
275int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 275int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
276 int *local); 276 int *local);
277int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
278
279#endif /* __XFS_ATTR_LEAF_H__ */ 277#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index fab0b6d5a41b..48228848f5ae 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,109 +25,6 @@
25 * XFS bit manipulation routines, used in non-realtime code. 25 * XFS bit manipulation routines, used in non-realtime code.
26 */ 26 */
27 27
28#ifndef HAVE_ARCH_HIGHBIT
29/*
30 * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
31 */
32static const char xfs_highbit[256] = {
33 -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
34 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
35 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
36 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
37 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
38 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
39 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
40 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
41 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
42 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
43 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
44 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
45 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
46 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
47 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
48 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
49 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
50 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
51 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
52 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
53 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
54 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
55 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
56 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
57 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
58 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
59 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
60 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
61 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
62 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
63 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
64 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
65};
66#endif
67
68/*
69 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
70 */
71inline int
72xfs_highbit32(
73 __uint32_t v)
74{
75#ifdef HAVE_ARCH_HIGHBIT
76 return highbit32(v);
77#else
78 int i;
79
80 if (v & 0xffff0000)
81 if (v & 0xff000000)
82 i = 24;
83 else
84 i = 16;
85 else if (v & 0x0000ffff)
86 if (v & 0x0000ff00)
87 i = 8;
88 else
89 i = 0;
90 else
91 return -1;
92 return i + xfs_highbit[(v >> i) & 0xff];
93#endif
94}
95
96/*
97 * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
98 */
99int
100xfs_lowbit64(
101 __uint64_t v)
102{
103 __uint32_t w = (__uint32_t)v;
104 int n = 0;
105
106 if (w) { /* lower bits */
107 n = ffs(w);
108 } else { /* upper bits */
109 w = (__uint32_t)(v >> 32);
110 if (w && (n = ffs(w)))
111 n += 32;
112 }
113 return n - 1;
114}
115
116/*
117 * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
118 */
119int
120xfs_highbit64(
121 __uint64_t v)
122{
123 __uint32_t h = (__uint32_t)(v >> 32);
124
125 if (h)
126 return xfs_highbit32(h) + 32;
127 return xfs_highbit32((__uint32_t)v);
128}
129
130
131/* 28/*
132 * Return whether bitmap is empty. 29 * Return whether bitmap is empty.
133 * Size is number of words in the bitmap, which is padded to word boundary 30 * Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 082641a9782c..8e0e463dae2d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n)
47} 47}
48 48
49/* Get high bit set out of 32-bit argument, -1 if none set */ 49/* Get high bit set out of 32-bit argument, -1 if none set */
50extern int xfs_highbit32(__uint32_t v); 50static inline int xfs_highbit32(__uint32_t v)
51{
52 return fls(v) - 1;
53}
54
55/* Get high bit set out of 64-bit argument, -1 if none set */
56static inline int xfs_highbit64(__uint64_t v)
57{
58 return fls64(v) - 1;
59}
60
61/* Get low bit set out of 32-bit argument, -1 if none set */
62static inline int xfs_lowbit32(__uint32_t v)
63{
64 unsigned long t = v;
65 return (v) ? find_first_bit(&t, 32) : -1;
66}
51 67
52/* Get low bit set out of 64-bit argument, -1 if none set */ 68/* Get low bit set out of 64-bit argument, -1 if none set */
53extern int xfs_lowbit64(__uint64_t v); 69static inline int xfs_lowbit64(__uint64_t v)
70{
71 __uint32_t w = (__uint32_t)v;
72 int n = 0;
54 73
55/* Get high bit set out of 64-bit argument, -1 if none set */ 74 if (w) { /* lower bits */
56extern int xfs_highbit64(__uint64_t); 75 n = ffs(w);
76 } else { /* upper bits */
77 w = (__uint32_t)(v >> 32);
78 if (w && (n = ffs(w)))
79 n += 32;
80 }
81 return n - 1;
82}
57 83
58/* Return whether bitmap is empty (1 == empty) */ 84/* Return whether bitmap is empty (1 == empty) */
59extern int xfs_bitmap_empty(uint *map, uint size); 85extern int xfs_bitmap_empty(uint *map, uint size);
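
The table-driven xfs_highbit32() gives way to the kernel's fls()/fls64()/ffs() primitives, with -1 returned for zero input because fls(0) == 0. A few worked values for the new inlines (illustrative, userspace-style checks):

	#include <assert.h>

	assert(xfs_highbit32(0) == -1);           /* fls(0) == 0 */
	assert(xfs_highbit32(0x10) == 4);         /* fls(0x10) == 5 */
	assert(xfs_highbit64(1ULL << 40) == 40);  /* fls64 on the high word */
	assert(xfs_lowbit64(0) == -1);            /* ffs(0) == 0 */
	assert(xfs_lowbit64(1ULL << 32) == 32);   /* low word empty, upper ffs */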
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3c4beb3a4326..a1aab9275d5a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -384,14 +384,14 @@ xfs_bmap_count_tree(
384 int levelin, 384 int levelin,
385 int *count); 385 int *count);
386 386
387STATIC int 387STATIC void
388xfs_bmap_count_leaves( 388xfs_bmap_count_leaves(
389 xfs_ifork_t *ifp, 389 xfs_ifork_t *ifp,
390 xfs_extnum_t idx, 390 xfs_extnum_t idx,
391 int numrecs, 391 int numrecs,
392 int *count); 392 int *count);
393 393
394STATIC int 394STATIC void
395xfs_bmap_disk_count_leaves( 395xfs_bmap_disk_count_leaves(
396 xfs_extnum_t idx, 396 xfs_extnum_t idx,
397 xfs_bmbt_block_t *block, 397 xfs_bmbt_block_t *block,
@@ -4000,7 +4000,7 @@ xfs_bmap_add_attrfork(
4000 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 4000 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
4001 } 4001 }
4002 ASSERT(ip->i_d.di_anextents == 0); 4002 ASSERT(ip->i_d.di_anextents == 0);
4003 VN_HOLD(XFS_ITOV(ip)); 4003 IHOLD(ip);
4004 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 4004 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4005 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 4005 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4006 switch (ip->i_d.di_format) { 4006 switch (ip->i_d.di_format) {
@@ -6096,7 +6096,7 @@ xfs_bmap_get_bp(
6096 tp = cur->bc_tp; 6096 tp = cur->bc_tp;
6097 licp = &tp->t_items; 6097 licp = &tp->t_items;
6098 while (!bp && licp != NULL) { 6098 while (!bp && licp != NULL) {
6099 if (XFS_LIC_ARE_ALL_FREE(licp)) { 6099 if (xfs_lic_are_all_free(licp)) {
6100 licp = licp->lic_next; 6100 licp = licp->lic_next;
6101 continue; 6101 continue;
6102 } 6102 }
@@ -6106,11 +6106,11 @@ xfs_bmap_get_bp(
6106 xfs_buf_log_item_t *bip; 6106 xfs_buf_log_item_t *bip;
6107 xfs_buf_t *lbp; 6107 xfs_buf_t *lbp;
6108 6108
6109 if (XFS_LIC_ISFREE(licp, i)) { 6109 if (xfs_lic_isfree(licp, i)) {
6110 continue; 6110 continue;
6111 } 6111 }
6112 6112
6113 lidp = XFS_LIC_SLOT(licp, i); 6113 lidp = xfs_lic_slot(licp, i);
6114 lip = lidp->lid_item; 6114 lip = lidp->lid_item;
6115 if (lip->li_type != XFS_LI_BUF) 6115 if (lip->li_type != XFS_LI_BUF)
6116 continue; 6116 continue;
@@ -6367,13 +6367,9 @@ xfs_bmap_count_blocks(
6367 mp = ip->i_mount; 6367 mp = ip->i_mount;
6368 ifp = XFS_IFORK_PTR(ip, whichfork); 6368 ifp = XFS_IFORK_PTR(ip, whichfork);
6369 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { 6369 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
6370 if (unlikely(xfs_bmap_count_leaves(ifp, 0, 6370 xfs_bmap_count_leaves(ifp, 0,
6371 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), 6371 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
6372 count) < 0)) { 6372 count);
6373 XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
6374 XFS_ERRLEVEL_LOW, mp);
6375 return XFS_ERROR(EFSCORRUPTED);
6376 }
6377 return 0; 6373 return 0;
6378 } 6374 }
6379 6375
@@ -6454,13 +6450,7 @@ xfs_bmap_count_tree(
6454 for (;;) { 6450 for (;;) {
6455 nextbno = be64_to_cpu(block->bb_rightsib); 6451 nextbno = be64_to_cpu(block->bb_rightsib);
6456 numrecs = be16_to_cpu(block->bb_numrecs); 6452 numrecs = be16_to_cpu(block->bb_numrecs);
6457 if (unlikely(xfs_bmap_disk_count_leaves(0, 6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count);
6458 block, numrecs, count) < 0)) {
6459 xfs_trans_brelse(tp, bp);
6460 XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
6461 XFS_ERRLEVEL_LOW, mp);
6462 return XFS_ERROR(EFSCORRUPTED);
6463 }
6464 xfs_trans_brelse(tp, bp); 6454 xfs_trans_brelse(tp, bp);
6465 if (nextbno == NULLFSBLOCK) 6455 if (nextbno == NULLFSBLOCK)
6466 break; 6456 break;
@@ -6478,7 +6468,7 @@ xfs_bmap_count_tree(
6478/* 6468/*
6479 * Count leaf blocks given a range of extent records. 6469 * Count leaf blocks given a range of extent records.
6480 */ 6470 */
6481STATIC int 6471STATIC void
6482xfs_bmap_count_leaves( 6472xfs_bmap_count_leaves(
6483 xfs_ifork_t *ifp, 6473 xfs_ifork_t *ifp,
6484 xfs_extnum_t idx, 6474 xfs_extnum_t idx,
@@ -6491,14 +6481,13 @@ xfs_bmap_count_leaves(
6491 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); 6481 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
6492 *count += xfs_bmbt_get_blockcount(frp); 6482 *count += xfs_bmbt_get_blockcount(frp);
6493 } 6483 }
6494 return 0;
6495} 6484}
6496 6485
6497/* 6486/*
6498 * Count leaf blocks given a range of extent records originally 6487 * Count leaf blocks given a range of extent records originally
6499 * in btree format. 6488 * in btree format.
6500 */ 6489 */
6501STATIC int 6490STATIC void
6502xfs_bmap_disk_count_leaves( 6491xfs_bmap_disk_count_leaves(
6503 xfs_extnum_t idx, 6492 xfs_extnum_t idx,
6504 xfs_bmbt_block_t *block, 6493 xfs_bmbt_block_t *block,
@@ -6512,5 +6501,4 @@ xfs_bmap_disk_count_leaves(
6512 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
6513 *count += xfs_bmbt_disk_get_blockcount(frp); 6502 *count += xfs_bmbt_disk_get_blockcount(frp);
6514 } 6503 }
6515 return 0;
6516} 6504}
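
With the leaf counters no longer able to fail, the EFSCORRUPTED reporting at both call sites above disappears. Assembled whole, the new void helper reads roughly as below (the loop header is implied by the hunk rather than shown in it):

	STATIC void
	xfs_bmap_count_leaves(
		xfs_ifork_t		*ifp,
		xfs_extnum_t		idx,
		int			numrecs,
		int			*count)
	{
		int			b;

		/* sum the block counts of all extent records in the range */
		for (b = 0; b < numrecs; b++) {
			xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
			*count += xfs_bmbt_get_blockcount(frp);
		}
	}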
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index aeb87ca69fcc..cc593a84c345 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone;
46/* 46/*
47 * Btree magic numbers. 47 * Btree magic numbers.
48 */ 48 */
49const __uint32_t xfs_magics[XFS_BTNUM_MAX] = 49const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
50{
51 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC 50 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
52}; 51};
53 52
54/* 53/*
55 * Prototypes for internal routines.
56 */
57
58/*
59 * Checking routine: return maxrecs for the block.
60 */
61STATIC int /* number of records fitting in block */
62xfs_btree_maxrecs(
63 xfs_btree_cur_t *cur, /* btree cursor */
64 xfs_btree_block_t *block);/* generic btree block pointer */
65
66/*
67 * Internal routines.
68 */
69
70/*
71 * Retrieve the block pointer from the cursor at the given level.
72 * This may be a bmap btree root or from a buffer.
73 */
74STATIC xfs_btree_block_t * /* generic btree block pointer */
75xfs_btree_get_block(
76 xfs_btree_cur_t *cur, /* btree cursor */
77 int level, /* level in btree */
78 struct xfs_buf **bpp); /* buffer containing the block */
79
80/*
81 * Checking routine: return maxrecs for the block. 54 * Checking routine: return maxrecs for the block.
82 */ 55 */
83STATIC int /* number of records fitting in block */ 56STATIC int /* number of records fitting in block */
@@ -457,35 +430,6 @@ xfs_btree_dup_cursor(
457} 430}
458 431
459/* 432/*
460 * Change the cursor to point to the first record at the given level.
461 * Other levels are unaffected.
462 */
463int /* success=1, failure=0 */
464xfs_btree_firstrec(
465 xfs_btree_cur_t *cur, /* btree cursor */
466 int level) /* level to change */
467{
468 xfs_btree_block_t *block; /* generic btree block pointer */
469 xfs_buf_t *bp; /* buffer containing block */
470
471 /*
472 * Get the block pointer for this level.
473 */
474 block = xfs_btree_get_block(cur, level, &bp);
475 xfs_btree_check_block(cur, block, level, bp);
476 /*
477 * It's empty, there is no such record.
478 */
479 if (!block->bb_h.bb_numrecs)
480 return 0;
481 /*
482 * Set the ptr value to 1, that's the first record/key.
483 */
484 cur->bc_ptrs[level] = 1;
485 return 1;
486}
487
488/*
489 * Retrieve the block pointer from the cursor at the given level. 433 * Retrieve the block pointer from the cursor at the given level.
490 * This may be a bmap btree root or from a buffer. 434 * This may be a bmap btree root or from a buffer.
491 */ 435 */
@@ -626,6 +570,13 @@ xfs_btree_init_cursor(
626 cur->bc_private.a.agbp = agbp; 570 cur->bc_private.a.agbp = agbp;
627 cur->bc_private.a.agno = agno; 571 cur->bc_private.a.agno = agno;
628 break; 572 break;
573 case XFS_BTNUM_INO:
574 /*
575 * Inode allocation btree fields.
576 */
577 cur->bc_private.a.agbp = agbp;
578 cur->bc_private.a.agno = agno;
579 break;
629 case XFS_BTNUM_BMAP: 580 case XFS_BTNUM_BMAP:
630 /* 581 /*
631 * Bmap btree fields. 582 * Bmap btree fields.
@@ -638,13 +589,6 @@ xfs_btree_init_cursor(
638 cur->bc_private.b.flags = 0; 589 cur->bc_private.b.flags = 0;
639 cur->bc_private.b.whichfork = whichfork; 590 cur->bc_private.b.whichfork = whichfork;
640 break; 591 break;
641 case XFS_BTNUM_INO:
642 /*
643 * Inode allocation btree fields.
644 */
645 cur->bc_private.i.agbp = agbp;
646 cur->bc_private.i.agno = agno;
647 break;
648 default: 592 default:
649 ASSERT(0); 593 ASSERT(0);
650 } 594 }
@@ -671,6 +615,35 @@ xfs_btree_islastblock(
671} 615}
672 616
673/* 617/*
618 * Change the cursor to point to the first record at the given level.
619 * Other levels are unaffected.
620 */
621int /* success=1, failure=0 */
622xfs_btree_firstrec(
623 xfs_btree_cur_t *cur, /* btree cursor */
624 int level) /* level to change */
625{
626 xfs_btree_block_t *block; /* generic btree block pointer */
627 xfs_buf_t *bp; /* buffer containing block */
628
629 /*
630 * Get the block pointer for this level.
631 */
632 block = xfs_btree_get_block(cur, level, &bp);
633 xfs_btree_check_block(cur, block, level, bp);
634 /*
635 * It's empty, there is no such record.
636 */
637 if (!block->bb_h.bb_numrecs)
638 return 0;
639 /*
640 * Set the ptr value to 1, that's the first record/key.
641 */
642 cur->bc_ptrs[level] = 1;
643 return 1;
644}
645
646/*
674 * Change the cursor to point to the last record in the current block 647 * Change the cursor to point to the last record in the current block
675 * at the given level. Other levels are unaffected. 648 * at the given level. Other levels are unaffected.
676 */ 649 */
@@ -890,12 +863,12 @@ xfs_btree_readahead_core(
890 case XFS_BTNUM_INO: 863 case XFS_BTNUM_INO:
891 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); 864 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
892 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { 865 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
893 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, 866 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
894 be32_to_cpu(i->bb_leftsib), 1); 867 be32_to_cpu(i->bb_leftsib), 1);
895 rval++; 868 rval++;
896 } 869 }
897 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { 870 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
898 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, 871 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
899 be32_to_cpu(i->bb_rightsib), 1); 872 be32_to_cpu(i->bb_rightsib), 1);
900 rval++; 873 rval++;
901 } 874 }
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7440b78f9cec..1f528a2a3754 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -158,8 +158,8 @@ typedef struct xfs_btree_cur
158 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ 158 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
159 xfs_btnum_t bc_btnum; /* identifies which btree type */ 159 xfs_btnum_t bc_btnum; /* identifies which btree type */
160 union { 160 union {
161 struct { /* needed for BNO, CNT */ 161 struct { /* needed for BNO, CNT, INO */
162 struct xfs_buf *agbp; /* agf buffer pointer */ 162 struct xfs_buf *agbp; /* agf/agi buffer pointer */
163 xfs_agnumber_t agno; /* ag number */ 163 xfs_agnumber_t agno; /* ag number */
164 } a; 164 } a;
165 struct { /* needed for BMAP */ 165 struct { /* needed for BMAP */
@@ -172,10 +172,6 @@ typedef struct xfs_btree_cur
172 char flags; /* flags */ 172 char flags; /* flags */
173#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ 173#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
174 } b; 174 } b;
175 struct { /* needed for INO */
176 struct xfs_buf *agbp; /* agi buffer pointer */
177 xfs_agnumber_t agno; /* ag number */
178 } i;
179 } bc_private; /* per-btree type data */ 175 } bc_private; /* per-btree type data */
180} xfs_btree_cur_t; 176} xfs_btree_cur_t;
181 177
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d86ca2c03a70..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,12 +732,13 @@ xfs_buf_item_init(
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_buf = bp; 734 bip->bli_buf = bp;
735 xfs_buf_hold(bp);
735 bip->bli_format.blf_type = XFS_LI_BUF; 736 bip->bli_format.blf_type = XFS_LI_BUF;
736 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 737 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
737 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 738 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
738 bip->bli_format.blf_map_size = map_size; 739 bip->bli_format.blf_map_size = map_size;
739#ifdef XFS_BLI_TRACE 740#ifdef XFS_BLI_TRACE
740 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); 741 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
741#endif 742#endif
742 743
743#ifdef XFS_TRANS_DEBUG 744#ifdef XFS_TRANS_DEBUG
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
867 return (bip->bli_flags & XFS_BLI_DIRTY); 868 return (bip->bli_flags & XFS_BLI_DIRTY);
868} 869}
869 870
871STATIC void
872xfs_buf_item_free(
873 xfs_buf_log_item_t *bip)
874{
875#ifdef XFS_TRANS_DEBUG
876 kmem_free(bip->bli_orig);
877 kmem_free(bip->bli_logged);
878#endif /* XFS_TRANS_DEBUG */
879
880#ifdef XFS_BLI_TRACE
881 ktrace_free(bip->bli_trace);
882#endif
883 kmem_zone_free(xfs_buf_item_zone, bip);
884}
885
870/* 886/*
871 * This is called when the buf log item is no longer needed. It should 887 * This is called when the buf log item is no longer needed. It should
872 * free the buf log item associated with the given buffer and clear 888 * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
887 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
888 XFS_BUF_CLR_IODONE_FUNC(bp); 904 XFS_BUF_CLR_IODONE_FUNC(bp);
889 } 905 }
890 906 xfs_buf_rele(bp);
891#ifdef XFS_TRANS_DEBUG 907 xfs_buf_item_free(bip);
892 kmem_free(bip->bli_orig);
893 bip->bli_orig = NULL;
894 kmem_free(bip->bli_logged);
895 bip->bli_logged = NULL;
896#endif /* XFS_TRANS_DEBUG */
897
898#ifdef XFS_BLI_TRACE
899 ktrace_free(bip->bli_trace);
900#endif
901 kmem_zone_free(xfs_buf_item_zone, bip);
902} 908}
903 909
904 910
@@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks(
1056 anyway. */ 1062 anyway. */
1057 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); 1063 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1058 XFS_BUF_DONE(bp); 1064 XFS_BUF_DONE(bp);
1059 XFS_BUF_V_IODONESEMA(bp); 1065 XFS_BUF_FINISH_IOWAIT(bp);
1060 } 1066 }
1061 return; 1067 return;
1062 } 1068 }
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
1120 1126
1121 ASSERT(bip->bli_buf == bp); 1127 ASSERT(bip->bli_buf == bp);
1122 1128
1129 xfs_buf_rele(bp);
1123 mp = bip->bli_item.li_mountp; 1130 mp = bip->bli_item.li_mountp;
1124 1131
1125 /* 1132 /*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
1136 * xfs_trans_delete_ail() drops the AIL lock. 1143 * xfs_trans_delete_ail() drops the AIL lock.
1137 */ 1144 */
1138 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1139 1146 xfs_buf_item_free(bip);
1140#ifdef XFS_TRANS_DEBUG
1141 kmem_free(bip->bli_orig);
1142 bip->bli_orig = NULL;
1143 kmem_free(bip->bli_logged);
1144 bip->bli_logged = NULL;
1145#endif /* XFS_TRANS_DEBUG */
1146
1147#ifdef XFS_BLI_TRACE
1148 ktrace_free(bip->bli_trace);
1149#endif
1150 kmem_zone_free(xfs_buf_item_zone, bip);
1151} 1147}
1152 1148
1153#if defined(XFS_BLI_TRACE) 1149#if defined(XFS_BLI_TRACE)
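
The new xfs_buf_hold() in xfs_buf_item_init() pairs with an xfs_buf_rele() on both teardown paths, and the previously duplicated free code is collected into xfs_buf_item_free(). A sketch of the resulting lifetime, assuming the call order shown in the hunks above:

	void
	buf_item_lifetime_sketch(struct xfs_buf *bp, struct xfs_mount *mp)
	{
		xfs_buf_item_init(bp, mp);	/* takes a hold on bp */

		/* ... the buffer is logged and committed via a transaction ... */

		xfs_buf_item_relse(bp);		/* xfs_buf_rele() + xfs_buf_item_free() */
	}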
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 2211e885ef24..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -128,10 +128,8 @@ xfs_swap_extents(
128 xfs_swapext_t *sxp) 128 xfs_swapext_t *sxp)
129{ 129{
130 xfs_mount_t *mp; 130 xfs_mount_t *mp;
131 xfs_inode_t *ips[2];
132 xfs_trans_t *tp; 131 xfs_trans_t *tp;
133 xfs_bstat_t *sbp = &sxp->sx_stat; 132 xfs_bstat_t *sbp = &sxp->sx_stat;
134 bhv_vnode_t *vp, *tvp;
135 xfs_ifork_t *tempifp, *ifp, *tifp; 133 xfs_ifork_t *tempifp, *ifp, *tifp;
136 int ilf_fields, tilf_fields; 134 int ilf_fields, tilf_fields;
137 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; 135 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -150,19 +148,15 @@ xfs_swap_extents(
150 } 148 }
151 149
152 sbp = &sxp->sx_stat; 150 sbp = &sxp->sx_stat;
153 vp = XFS_ITOV(ip);
154 tvp = XFS_ITOV(tip);
155
156 /* Lock in i_ino order */
157 if (ip->i_ino < tip->i_ino) {
158 ips[0] = ip;
159 ips[1] = tip;
160 } else {
161 ips[0] = tip;
162 ips[1] = ip;
163 }
164 151
165 xfs_lock_inodes(ips, 2, lock_flags); 152 /*
153 * we have to do two separate lock calls here to keep lockdep
 154 * happy. If we try to get all the locks in one call, lockdep will
155 * report false positives when we drop the ILOCK and regain them
156 * below.
157 */
158 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
159 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
166 locked = 1; 160 locked = 1;
167 161
168 /* Verify that both files have the same format */ 162 /* Verify that both files have the same format */
@@ -184,7 +178,7 @@ xfs_swap_extents(
184 goto error0; 178 goto error0;
185 } 179 }
186 180
187 if (VN_CACHED(tvp) != 0) { 181 if (VN_CACHED(VFS_I(tip)) != 0) {
188 xfs_inval_cached_trace(tip, 0, -1, 0, -1); 182 xfs_inval_cached_trace(tip, 0, -1, 0, -1);
189 error = xfs_flushinval_pages(tip, 0, -1, 183 error = xfs_flushinval_pages(tip, 0, -1,
190 FI_REMAPF_LOCKED); 184 FI_REMAPF_LOCKED);
@@ -193,7 +187,7 @@ xfs_swap_extents(
193 } 187 }
194 188
195 /* Verify O_DIRECT for ftmp */ 189 /* Verify O_DIRECT for ftmp */
196 if (VN_CACHED(tvp) != 0) { 190 if (VN_CACHED(VFS_I(tip)) != 0) {
197 error = XFS_ERROR(EINVAL); 191 error = XFS_ERROR(EINVAL);
198 goto error0; 192 goto error0;
199 } 193 }
@@ -237,7 +231,7 @@ xfs_swap_extents(
237 * vop_read (or write in the case of autogrow) they block on the iolock 231 * vop_read (or write in the case of autogrow) they block on the iolock
238 * until we have switched the extents. 232 * until we have switched the extents.
239 */ 233 */
240 if (VN_MAPPED(vp)) { 234 if (VN_MAPPED(VFS_I(ip))) {
241 error = XFS_ERROR(EBUSY); 235 error = XFS_ERROR(EBUSY);
242 goto error0; 236 goto error0;
243 } 237 }
@@ -265,7 +259,7 @@ xfs_swap_extents(
265 locked = 0; 259 locked = 0;
266 goto error0; 260 goto error0;
267 } 261 }
268 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); 262 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
269 263
270 /* 264 /*
271 * Count the number of extended attribute blocks 265 * Count the number of extended attribute blocks
@@ -350,15 +344,11 @@ xfs_swap_extents(
350 break; 344 break;
351 } 345 }
352 346
353 /*
354 * Increment vnode ref counts since xfs_trans_commit &
355 * xfs_trans_cancel will both unlock the inodes and
356 * decrement the associated ref counts.
357 */
358 VN_HOLD(vp);
359 VN_HOLD(tvp);
360 347
348 IHOLD(ip);
361 xfs_trans_ijoin(tp, ip, lock_flags); 349 xfs_trans_ijoin(tp, ip, lock_flags);
350
351 IHOLD(tip);
362 xfs_trans_ijoin(tp, tip, lock_flags); 352 xfs_trans_ijoin(tp, tip, lock_flags);
363 353
364 xfs_trans_log_inode(tp, ip, ilf_fields); 354 xfs_trans_log_inode(tp, ip, ilf_fields);
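
The swap path now acquires the two lock classes in two passes instead of one xfs_lock_inodes() call over an array. A sketch of the resulting order, using the calls from the hunks above:

	/* both iolocks first, then both ilocks: two distinct lockdep classes */
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/* ... ILOCKs are dropped across the transaction reservation ... */

	/* and retaken in a single call once the transaction exists */
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);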
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index cdc2d3464a1a..2813cdd72375 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DMAPI_H__ 18#ifndef __XFS_DMAPI_H__
19#define __XFS_DMAPI_H__ 19#define __XFS_DMAPI_H__
20 20
21#include <linux/version.h>
22/* Values used to define the on-disk version of dm_attrname_t. All 21/* Values used to define the on-disk version of dm_attrname_t. All
23 * on-disk attribute names start with the 8-byte string "SGI_DMI_". 22 * on-disk attribute names start with the 8-byte string "SGI_DMI_".
24 * 23 *
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f66756cfb5e8..f227ecd1a294 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,9 +58,6 @@ xfs_error_trap(int e)
58 } 58 }
59 return e; 59 return e;
60} 60}
61#endif
62
63#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
64 61
65int xfs_etest[XFS_NUM_INJECT_ERROR]; 62int xfs_etest[XFS_NUM_INJECT_ERROR];
66int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 63int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
@@ -154,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
154 151
155 return 0; 152 return 0;
156} 153}
157#endif /* DEBUG || INDUCE_IO_ERROR */ 154#endif /* DEBUG */
158 155
159static void 156static void
160xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) 157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index d8559d132efa..11543f10b0c6 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -125,22 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
125#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) 125#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
126#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 126#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
127 127
128#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) 128#ifdef DEBUG
129extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 129extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
130 130
131#define XFS_NUM_INJECT_ERROR 10 131#define XFS_NUM_INJECT_ERROR 10
132
133#ifdef __ANSI_CPP__
134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
135 ((expr) || \
136 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
137 (rf)))
138#else
139#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 132#define XFS_TEST_ERROR(expr, mp, tag, rf) \
140 ((expr) || \ 133 ((expr) || \
141 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 134 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
142 (rf))) 135 (rf)))
143#endif /* __ANSI_CPP__ */
144 136
145extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 137extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
146extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 138extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
@@ -148,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
148#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 140#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
149#define xfs_errortag_add(tag, mp) (ENOSYS) 141#define xfs_errortag_add(tag, mp) (ENOSYS)
150#define xfs_errortag_clearall(mp, loud) (ENOSYS) 142#define xfs_errortag_clearall(mp, loud) (ENOSYS)
151#endif /* (DEBUG || INDUCE_IO_ERROR) */ 143#endif /* DEBUG */
152 144
153/* 145/*
154 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 146 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
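
With the __ANSI_CPP__ branch gone, a single XFS_TEST_ERROR definition remains for DEBUG builds. An illustrative use, borrowing the tag/random pair defined in this header and the error idiom seen in the xfs_bmap.c hunks earlier (the condition is a hypothetical check site, not from the patch):

	if (XFS_TEST_ERROR(nextents > maxrecs, mp,
			XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))
		return XFS_ERROR(EFSCORRUPTED);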
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c38fd14fca29..f3bb75da384e 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -400,7 +400,7 @@ xfs_filestream_init(void)
400 if (!item_zone) 400 if (!item_zone)
401 return -ENOMEM; 401 return -ENOMEM;
402#ifdef XFS_FILESTREAMS_TRACE 402#ifdef XFS_FILESTREAMS_TRACE
403 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); 403 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
404#endif 404#endif
405 return 0; 405 return 0;
406} 406}
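
The KM_SLEEP to KM_NOFS switch here, and in the other ktrace_alloc() calls in this series, keeps trace-buffer allocation from recursing into the filesystem during reclaim. Assuming the usual mapping in the XFS kmem wrappers of this era (kmem_flags_convert()):

	/*
	 * KM_SLEEP -> GFP_KERNEL: reclaim may re-enter the filesystem
	 * KM_NOFS  -> GFP_NOFS:   reclaim must not re-enter filesystems
	 */
	xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);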
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index e5310c90e50f..83502f3edef0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -181,7 +181,7 @@ xfs_inobt_delrec(
181 * then we can get rid of this level. 181 * then we can get rid of this level.
182 */ 182 */
183 if (numrecs == 1 && level > 0) { 183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.i.agbp; 184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp); 185 agi = XFS_BUF_TO_AGI(agbp);
186 /* 186 /*
187 * pp is still set to the first pointer in the block. 187 * pp is still set to the first pointer in the block.
@@ -194,7 +194,7 @@ xfs_inobt_delrec(
194 * Free the block. 194 * Free the block.
195 */ 195 */
196 if ((error = xfs_free_extent(cur->bc_tp, 196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) 197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error; 198 return error;
199 xfs_trans_binval(cur->bc_tp, bp); 199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp, 200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
@@ -379,7 +379,7 @@ xfs_inobt_delrec(
379 rrecs = be16_to_cpu(right->bb_numrecs); 379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp; 380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.i.agno, lbno, 0, &lbp, 382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF))) 383 XFS_INO_BTREE_REF)))
384 return error; 384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp); 385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -401,7 +401,7 @@ xfs_inobt_delrec(
401 lrecs = be16_to_cpu(left->bb_numrecs); 401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp; 402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.i.agno, rbno, 0, &rbp, 404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF))) 405 XFS_INO_BTREE_REF)))
406 return error; 406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp); 407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -484,7 +484,7 @@ xfs_inobt_delrec(
484 xfs_buf_t *rrbp; 484 xfs_buf_t *rrbp;
485 485
486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
487 cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, 487 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
488 &rrbp, XFS_INO_BTREE_REF))) 488 &rrbp, XFS_INO_BTREE_REF)))
489 return error; 489 return error;
490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); 490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
@@ -497,7 +497,7 @@ xfs_inobt_delrec(
497 * Free the deleting block. 497 * Free the deleting block.
498 */ 498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, 499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.i.agno, rbno), 1))) 500 cur->bc_private.a.agno, rbno), 1)))
501 return error; 501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp); 502 xfs_trans_binval(cur->bc_tp, rbp);
503 /* 503 /*
@@ -854,7 +854,7 @@ xfs_inobt_lookup(
854 { 854 {
855 xfs_agi_t *agi; /* a.g. inode header */ 855 xfs_agi_t *agi; /* a.g. inode header */
856 856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); 857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno); 858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root); 859 agbno = be32_to_cpu(agi->agi_root);
860 } 860 }
@@ -1089,7 +1089,7 @@ xfs_inobt_lshift(
1089 * Set up the left neighbor as "left". 1089 * Set up the left neighbor as "left".
1090 */ 1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), 1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF))) 1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error; 1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp); 1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -1207,10 +1207,10 @@ xfs_inobt_newroot(
1207 /* 1207 /*
1208 * Get a block & a buffer. 1208 * Get a block & a buffer.
1209 */ 1209 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); 1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
1211 args.tp = cur->bc_tp; 1211 args.tp = cur->bc_tp;
1212 args.mp = cur->bc_mp; 1212 args.mp = cur->bc_mp;
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, 1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
1214 be32_to_cpu(agi->agi_root)); 1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel = 1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0; 1216 args.isfl = args.userdata = args.minalignslop = 0;
@@ -1233,7 +1233,7 @@ xfs_inobt_newroot(
1233 */ 1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno); 1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1); 1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, 1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL); 1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /* 1238 /*
1239 * At the previous root level there are now two blocks: the old 1239 * At the previous root level there are now two blocks: the old
@@ -1376,7 +1376,7 @@ xfs_inobt_rshift(
1376 * Set up the right neighbor as "right". 1376 * Set up the right neighbor as "right".
1377 */ 1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF))) 1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error; 1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp); 1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -1492,7 +1492,7 @@ xfs_inobt_split(
1492 * Allocate the new block. 1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up. 1493 * If we can't do it, we're toast. Give up.
1494 */ 1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); 1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel = 1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0; 1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1; 1498 args.minlen = args.maxlen = args.prod = 1;
@@ -1725,7 +1725,7 @@ xfs_inobt_decrement(
1725 1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.i.agno, agbno, 0, &bp, 1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF))) 1729 XFS_INO_BTREE_REF)))
1730 return error; 1730 return error;
1731 lev--; 1731 lev--;
@@ -1897,7 +1897,7 @@ xfs_inobt_increment(
1897 1897
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1900 cur->bc_private.i.agno, agbno, 0, &bp, 1900 cur->bc_private.a.agno, agbno, 0, &bp,
1901 XFS_INO_BTREE_REF))) 1901 XFS_INO_BTREE_REF)))
1902 return error; 1902 return error;
1903 lev--; 1903 lev--;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b07604b94d9f..e229e9e001c2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -216,7 +216,14 @@ finish_inode:
216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
217 init_waitqueue_head(&ip->i_ipin_wait); 217 init_waitqueue_head(&ip->i_ipin_wait);
218 atomic_set(&ip->i_pincount, 0); 218 atomic_set(&ip->i_pincount, 0);
219 initnsema(&ip->i_flock, 1, "xfsfino"); 219
220 /*
221 * Because we want to use a counting completion, complete
222 * the flush completion once to allow a single access to
223 * the flush completion without blocking.
224 */
225 init_completion(&ip->i_flush);
226 complete(&ip->i_flush);
220 227
221 if (lock_flags) 228 if (lock_flags)
222 xfs_ilock(ip, lock_flags); 229 xfs_ilock(ip, lock_flags);
@@ -288,10 +295,17 @@ finish_inode:
288 *ipp = ip; 295 *ipp = ip;
289 296
290 /* 297 /*
 298 * Link the Linux inode to the XFS inode.
299 */
300 ip->i_vnode = inode;
301 inode->i_private = ip;
302
303 /*
291 * If we have a real type for an on-disk inode, we can set ops(&unlock) 304 * If we have a real type for an on-disk inode, we can set ops(&unlock)
292 * now. If it's a new inode being created, xfs_ialloc will handle it. 305 * now. If it's a new inode being created, xfs_ialloc will handle it.
293 */ 306 */
294 xfs_initialize_vnode(mp, inode, ip); 307 if (ip->i_d.di_mode != 0)
308 xfs_setup_inode(ip);
295 return 0; 309 return 0;
296} 310}
297 311
@@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t *ip,
411 * Special iput for brand-new inodes that are still locked 425 * Special iput for brand-new inodes that are still locked
412 */ 426 */
413void 427void
414xfs_iput_new(xfs_inode_t *ip, 428xfs_iput_new(
415 uint lock_flags) 429 xfs_inode_t *ip,
430 uint lock_flags)
416{ 431{
417 struct inode *inode = ip->i_vnode; 432 struct inode *inode = VFS_I(ip);
418 433
419 xfs_itrace_entry(ip); 434 xfs_itrace_entry(ip);
420 435
@@ -775,26 +790,3 @@ xfs_isilocked(
775} 790}
776#endif 791#endif
777 792
778/*
779 * The following three routines simply manage the i_flock
780 * semaphore embedded in the inode. This semaphore synchronizes
781 * processes attempting to flush the in-core inode back to disk.
782 */
783void
784xfs_iflock(xfs_inode_t *ip)
785{
786 psema(&(ip->i_flock), PINOD|PLTWAIT);
787}
788
789int
790xfs_iflock_nowait(xfs_inode_t *ip)
791{
792 return (cpsema(&(ip->i_flock)));
793}
794
795void
796xfs_ifunlock(xfs_inode_t *ip)
797{
798 ASSERT(issemalocked(&(ip->i_flock)));
799 vsema(&(ip->i_flock));
800}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bedc66163176..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -580,8 +580,8 @@ xfs_iformat_extents(
580 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); 580 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
581 for (i = 0; i < nex; i++, dp++) { 581 for (i = 0; i < nex; i++, dp++) {
582 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 582 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
583 ep->l0 = be64_to_cpu(get_unaligned(&dp->l0)); 583 ep->l0 = get_unaligned_be64(&dp->l0);
584 ep->l1 = be64_to_cpu(get_unaligned(&dp->l1)); 584 ep->l1 = get_unaligned_be64(&dp->l1);
585 } 585 }
586 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); 586 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
587 if (whichfork != XFS_DATA_FORK || 587 if (whichfork != XFS_DATA_FORK ||
@@ -835,22 +835,22 @@ xfs_iread(
835 * Do this before xfs_iformat in case it adds entries. 835 * Do this before xfs_iformat in case it adds entries.
836 */ 836 */
837#ifdef XFS_INODE_TRACE 837#ifdef XFS_INODE_TRACE
838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP); 838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
839#endif 839#endif
840#ifdef XFS_BMAP_TRACE 840#ifdef XFS_BMAP_TRACE
841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); 841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
842#endif 842#endif
843#ifdef XFS_BMBT_TRACE 843#ifdef XFS_BMBT_TRACE
844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); 844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
845#endif 845#endif
846#ifdef XFS_RW_TRACE 846#ifdef XFS_RW_TRACE
847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); 847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
848#endif 848#endif
849#ifdef XFS_ILOCK_TRACE 849#ifdef XFS_ILOCK_TRACE
850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); 850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
851#endif 851#endif
852#ifdef XFS_DIR2_TRACE 852#ifdef XFS_DIR2_TRACE
853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); 853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
854#endif 854#endif
855 855
856 /* 856 /*
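
The KM_SLEEP to KM_NOFS switches in this hunk (and the similar ones further down) matter because these allocations run while inode state is held for reading or flushing: a KM_SLEEP allocation may trigger memory reclaim that re-enters the filesystem and deadlocks. A compilable sketch of the flag translation, loosely modelled on XFS's kmem_flags_convert() in fs/xfs/kmem.h (the gfp bit values below are stand-ins, not the kernel's):

    #include <stdio.h>

    /* Stand-in gfp bits (illustrative values only). */
    #define __GFP_WAIT 0x1u
    #define __GFP_IO   0x2u
    #define __GFP_FS   0x4u
    #define GFP_ATOMIC 0x0u
    #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

    #define KM_SLEEP   0x0001u
    #define KM_NOSLEEP 0x0002u
    #define KM_NOFS    0x0004u

    static unsigned int kmem_flags_convert(unsigned int km_flags)
    {
        if (km_flags & KM_NOSLEEP)
            return GFP_ATOMIC;              /* never blocks, may fail */

        if (km_flags & KM_NOFS)
            return GFP_KERNEL & ~__GFP_FS;  /* reclaim must not re-enter the FS */

        return GFP_KERNEL;                  /* KM_SLEEP: full reclaim allowed */
    }

    int main(void)
    {
        printf("KM_SLEEP -> %#x\n", kmem_flags_convert(KM_SLEEP));
        printf("KM_NOFS  -> %#x\n", kmem_flags_convert(KM_NOFS));
        return 0;
    }
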
@@ -1046,9 +1046,9 @@ xfs_ialloc(
1046{ 1046{
1047 xfs_ino_t ino; 1047 xfs_ino_t ino;
1048 xfs_inode_t *ip; 1048 xfs_inode_t *ip;
1049 bhv_vnode_t *vp;
1050 uint flags; 1049 uint flags;
1051 int error; 1050 int error;
1051 timespec_t tv;
1052 1052
1053 /* 1053 /*
1054 * Call the space management code to pick 1054 * Call the space management code to pick
@@ -1077,13 +1077,12 @@ xfs_ialloc(
1077 } 1077 }
1078 ASSERT(ip != NULL); 1078 ASSERT(ip != NULL);
1079 1079
1080 vp = XFS_ITOV(ip);
1081 ip->i_d.di_mode = (__uint16_t)mode; 1080 ip->i_d.di_mode = (__uint16_t)mode;
1082 ip->i_d.di_onlink = 0; 1081 ip->i_d.di_onlink = 0;
1083 ip->i_d.di_nlink = nlink; 1082 ip->i_d.di_nlink = nlink;
1084 ASSERT(ip->i_d.di_nlink == nlink); 1083 ASSERT(ip->i_d.di_nlink == nlink);
1085 ip->i_d.di_uid = current_fsuid(cr); 1084 ip->i_d.di_uid = current_fsuid();
1086 ip->i_d.di_gid = current_fsgid(cr); 1085 ip->i_d.di_gid = current_fsgid();
1087 ip->i_d.di_projid = prid; 1086 ip->i_d.di_projid = prid;
1088 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1087 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1089 1088
@@ -1130,7 +1129,13 @@ xfs_ialloc(
1130 ip->i_size = 0; 1129 ip->i_size = 0;
1131 ip->i_d.di_nextents = 0; 1130 ip->i_d.di_nextents = 0;
1132 ASSERT(ip->i_d.di_nblocks == 0); 1131 ASSERT(ip->i_d.di_nblocks == 0);
1133 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); 1132
1133 nanotime(&tv);
1134 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1135 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1136 ip->i_d.di_atime = ip->i_d.di_mtime;
1137 ip->i_d.di_ctime = ip->i_d.di_mtime;
1138
1134 /* 1139 /*
1135 * di_gen will have been taken care of in xfs_iread. 1140 * di_gen will have been taken care of in xfs_iread.
1136 */ 1141 */
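
Replacing the xfs_ichgtime(CHG|ACC|MOD) call with a single nanotime() sample stamps mtime, atime, and ctime of the freshly allocated inode from one clock reading, so all three are guaranteed identical. A userspace sketch of the same stamping pattern (clock_gettime() standing in for the kernel's nanotime(); the 32-bit fields mirror the casts in the diff):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Mirrors xfs_timestamp_t: 32-bit seconds and nanoseconds on disk. */
    struct xfs_ts { int32_t t_sec; int32_t t_nsec; };

    int main(void)
    {
        struct timespec tv;
        struct xfs_ts mtime, atime, ctim;

        clock_gettime(CLOCK_REALTIME, &tv);   /* stand-in for nanotime() */
        mtime.t_sec  = (int32_t)tv.tv_sec;    /* same casts as the diff  */
        mtime.t_nsec = (int32_t)tv.tv_nsec;
        atime = mtime;                        /* one sample stamps all   */
        ctim  = mtime;                        /* three timestamps        */

        printf("created %d.%09d\n", mtime.t_sec, mtime.t_nsec);
        return (atime.t_sec == ctim.t_sec) ? 0 : 1;
    }
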
@@ -1220,7 +1225,7 @@ xfs_ialloc(
1220 xfs_trans_log_inode(tp, ip, flags); 1225 xfs_trans_log_inode(tp, ip, flags);
1221 1226
1222 /* now that we have an i_mode we can setup inode ops and unlock */ 1227 /* now that we have an i_mode we can setup inode ops and unlock */
1223 xfs_initialize_vnode(tp->t_mountp, vp, ip); 1228 xfs_setup_inode(ip);
1224 1229
1225 *ipp = ip; 1230 *ipp = ip;
1226 return 0; 1231 return 0;
@@ -1399,7 +1404,6 @@ xfs_itruncate_start(
1399 xfs_fsize_t last_byte; 1404 xfs_fsize_t last_byte;
1400 xfs_off_t toss_start; 1405 xfs_off_t toss_start;
1401 xfs_mount_t *mp; 1406 xfs_mount_t *mp;
1402 bhv_vnode_t *vp;
1403 int error = 0; 1407 int error = 0;
1404 1408
1405 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1409 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1408,7 +1412,6 @@ xfs_itruncate_start(
1408 (flags == XFS_ITRUNC_MAYBE)); 1412 (flags == XFS_ITRUNC_MAYBE));
1409 1413
1410 mp = ip->i_mount; 1414 mp = ip->i_mount;
1411 vp = XFS_ITOV(ip);
1412 1415
1413 /* wait for the completion of any pending DIOs */ 1416 /* wait for the completion of any pending DIOs */
1414 if (new_size < ip->i_size) 1417 if (new_size < ip->i_size)
@@ -1457,7 +1460,7 @@ xfs_itruncate_start(
1457 1460
1458#ifdef DEBUG 1461#ifdef DEBUG
1459 if (new_size == 0) { 1462 if (new_size == 0) {
1460 ASSERT(VN_CACHED(vp) == 0); 1463 ASSERT(VN_CACHED(VFS_I(ip)) == 0);
1461 } 1464 }
1462#endif 1465#endif
1463 return error; 1466 return error;
@@ -2630,7 +2633,6 @@ xfs_idestroy(
2630 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2631 mrfree(&ip->i_lock); 2634 mrfree(&ip->i_lock);
2632 mrfree(&ip->i_iolock); 2635 mrfree(&ip->i_iolock);
2633 freesema(&ip->i_flock);
2634 2636
2635#ifdef XFS_INODE_TRACE 2637#ifdef XFS_INODE_TRACE
2636 ktrace_free(ip->i_trace); 2638 ktrace_free(ip->i_trace);
@@ -3048,10 +3050,10 @@ cluster_corrupt_out:
3048/* 3050/*
3049 * xfs_iflush() will write a modified inode's changes out to the 3051 * xfs_iflush() will write a modified inode's changes out to the
3050 * inode's on disk home. The caller must have the inode lock held 3052 * inode's on disk home. The caller must have the inode lock held
3051 * in at least shared mode and the inode flush semaphore must be 3053 * in at least shared mode and the inode flush completion must be
3052 * held as well. The inode lock will still be held upon return from 3054 * active as well. The inode lock will still be held upon return from
3053 * the call and the caller is free to unlock it. 3055 * the call and the caller is free to unlock it.
3054 * The inode flush lock will be unlocked when the inode reaches the disk. 3056 * The inode flush will be completed when the inode reaches the disk.
3055 * The flags indicate how the inode's buffer should be written out. 3057 * The flags indicate how the inode's buffer should be written out.
3056 */ 3058 */
3057int 3059int
@@ -3070,7 +3072,7 @@ xfs_iflush(
3070 XFS_STATS_INC(xs_iflush_count); 3072 XFS_STATS_INC(xs_iflush_count);
3071 3073
3072 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3074 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3073 ASSERT(issemalocked(&(ip->i_flock))); 3075 ASSERT(!completion_done(&ip->i_flush));
3074 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3076 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3075 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3077 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3076 3078
@@ -3233,7 +3235,7 @@ xfs_iflush_int(
3233#endif 3235#endif
3234 3236
3235 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3237 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3236 ASSERT(issemalocked(&(ip->i_flock))); 3238 ASSERT(!completion_done(&ip->i_flush));
3237 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3239 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3238 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3240 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3239 3241
@@ -3465,7 +3467,6 @@ xfs_iflush_all(
3465 xfs_mount_t *mp) 3467 xfs_mount_t *mp)
3466{ 3468{
3467 xfs_inode_t *ip; 3469 xfs_inode_t *ip;
3468 bhv_vnode_t *vp;
3469 3470
3470 again: 3471 again:
3471 XFS_MOUNT_ILOCK(mp); 3472 XFS_MOUNT_ILOCK(mp);
@@ -3480,14 +3481,13 @@ xfs_iflush_all(
3480 continue; 3481 continue;
3481 } 3482 }
3482 3483
3483 vp = XFS_ITOV_NULL(ip); 3484 if (!VFS_I(ip)) {
3484 if (!vp) {
3485 XFS_MOUNT_IUNLOCK(mp); 3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); 3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again; 3487 goto again;
3488 } 3488 }
3489 3489
3490 ASSERT(vn_count(vp) == 0); 3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491 3491
3492 ip = ip->i_mnext; 3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes); 3493 } while (ip != mp->m_inodes);
@@ -3707,7 +3707,7 @@ xfs_iext_add_indirect_multi(
3707 * (all extents past */ 3707 * (all extents past */
3708 if (nex2) { 3708 if (nex2) {
3709 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3709 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3710 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); 3710 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3711 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3711 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3712 erp->er_extcount -= nex2; 3712 erp->er_extcount -= nex2;
3713 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3713 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
@@ -4007,8 +4007,7 @@ xfs_iext_realloc_direct(
4007 ifp->if_u1.if_extents = 4007 ifp->if_u1.if_extents =
4008 kmem_realloc(ifp->if_u1.if_extents, 4008 kmem_realloc(ifp->if_u1.if_extents,
4009 rnew_size, 4009 rnew_size,
4010 ifp->if_real_bytes, 4010 ifp->if_real_bytes, KM_NOFS);
4011 KM_SLEEP);
4012 } 4011 }
4013 if (rnew_size > ifp->if_real_bytes) { 4012 if (rnew_size > ifp->if_real_bytes) {
4014 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 4013 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -4067,7 +4066,7 @@ xfs_iext_inline_to_direct(
4067 xfs_ifork_t *ifp, /* inode fork pointer */ 4066 xfs_ifork_t *ifp, /* inode fork pointer */
4068 int new_size) /* number of extents in file */ 4067 int new_size) /* number of extents in file */
4069{ 4068{
4070 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP); 4069 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
4071 memset(ifp->if_u1.if_extents, 0, new_size); 4070 memset(ifp->if_u1.if_extents, 0, new_size);
4072 if (ifp->if_bytes) { 4071 if (ifp->if_bytes) {
4073 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 4072 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
@@ -4099,7 +4098,7 @@ xfs_iext_realloc_indirect(
4099 } else { 4098 } else {
4100 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 4099 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
4101 kmem_realloc(ifp->if_u1.if_ext_irec, 4100 kmem_realloc(ifp->if_u1.if_ext_irec,
4102 new_size, size, KM_SLEEP); 4101 new_size, size, KM_NOFS);
4103 } 4102 }
4104} 4103}
4105 4104
@@ -4119,7 +4118,7 @@ xfs_iext_indirect_to_direct(
4119 ASSERT(nextents <= XFS_LINEAR_EXTS); 4118 ASSERT(nextents <= XFS_LINEAR_EXTS);
4120 size = nextents * sizeof(xfs_bmbt_rec_t); 4119 size = nextents * sizeof(xfs_bmbt_rec_t);
4121 4120
4122 xfs_iext_irec_compact_full(ifp); 4121 xfs_iext_irec_compact_pages(ifp);
4123 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
4124 4123
4125 ep = ifp->if_u1.if_ext_irec->er_extbuf; 4124 ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4341,11 +4340,10 @@ xfs_iext_irec_init(
4341 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4340 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4342 ASSERT(nextents <= XFS_LINEAR_EXTS); 4341 ASSERT(nextents <= XFS_LINEAR_EXTS);
4343 4342
4344 erp = (xfs_ext_irec_t *) 4343 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
4345 kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);
4346 4344
4347 if (nextents == 0) { 4345 if (nextents == 0) {
4348 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4346 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4349 } else if (!ifp->if_real_bytes) { 4347 } else if (!ifp->if_real_bytes) {
4350 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 4348 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
4351 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 4349 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
@@ -4393,7 +4391,7 @@ xfs_iext_irec_new(
4393 4391
4394 /* Initialize new extent record */ 4392 /* Initialize new extent record */
4395 erp = ifp->if_u1.if_ext_irec; 4393 erp = ifp->if_u1.if_ext_irec;
4396 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4394 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4397 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4395 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4398 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 4396 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
4399 erp[erp_idx].er_extcount = 0; 4397 erp[erp_idx].er_extcount = 0;
@@ -4451,8 +4449,7 @@ xfs_iext_irec_remove(
4451 * compaction policy is as follows: 4449 * compaction policy is as follows:
4452 * 4450 *
4453 * Full Compaction: Extents fit into a single page (or inline buffer) 4451 * Full Compaction: Extents fit into a single page (or inline buffer)
4454 * Full Compaction: Extents occupy less than 10% of allocated space 4452 * Partial Compaction: Extents occupy less than 50% of allocated space
4455 * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
4456 * No Compaction: Extents occupy at least 50% of allocated space 4453 * No Compaction: Extents occupy at least 50% of allocated space
4457 */ 4454 */
4458void 4455void
@@ -4473,8 +4470,6 @@ xfs_iext_irec_compact(
4473 xfs_iext_direct_to_inline(ifp, nextents); 4470 xfs_iext_direct_to_inline(ifp, nextents);
4474 } else if (nextents <= XFS_LINEAR_EXTS) { 4471 } else if (nextents <= XFS_LINEAR_EXTS) {
4475 xfs_iext_indirect_to_direct(ifp); 4472 xfs_iext_indirect_to_direct(ifp);
4476 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
4477 xfs_iext_irec_compact_full(ifp);
4478 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4473 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4479 xfs_iext_irec_compact_pages(ifp); 4474 xfs_iext_irec_compact_pages(ifp);
4480 } 4475 }
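
With the 10% tier removed, xfs_iext_irec_compact() has only three outcomes, and the decision reduces to the thresholds visible above. A small self-contained model of that policy (threshold arithmetic copied from the diff; the XFS_LINEAR_EXTS value is illustrative, the real one is derived from the page size):

    #include <stdio.h>

    #define XFS_LINEAR_EXTS 256   /* extents per indirection page (stand-in) */

    static const char *compact_policy(int nextents, int nlists)
    {
        if (nextents <= XFS_LINEAR_EXTS)
            return "full: collapse indirection to a single buffer";
        if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1)
            return "partial: merge adjacent pages (compact_pages)";
        return "none: at least 50% of allocated space in use";
    }

    int main(void)
    {
        printf("%4d extents / %d pages: %s\n", 200, 4, compact_policy(200, 4));
        printf("%4d extents / %d pages: %s\n", 400, 4, compact_policy(400, 4));
        printf("%4d extents / %d pages: %s\n", 600, 4, compact_policy(600, 4));
        return 0;
    }

The memmove to memcpy change in the page-merge path below is consistent with this: each er_extbuf is a separate allocation, so the source and destination of a merge can never overlap.
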
@@ -4498,7 +4493,7 @@ xfs_iext_irec_compact_pages(
4498 erp_next = erp + 1; 4493 erp_next = erp + 1;
4499 if (erp_next->er_extcount <= 4494 if (erp_next->er_extcount <=
4500 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4495 (XFS_LINEAR_EXTS - erp->er_extcount)) {
4501 memmove(&erp->er_extbuf[erp->er_extcount], 4496 memcpy(&erp->er_extbuf[erp->er_extcount],
4502 erp_next->er_extbuf, erp_next->er_extcount * 4497 erp_next->er_extbuf, erp_next->er_extcount *
4503 sizeof(xfs_bmbt_rec_t)); 4498 sizeof(xfs_bmbt_rec_t));
4504 erp->er_extcount += erp_next->er_extcount; 4499 erp->er_extcount += erp_next->er_extcount;
@@ -4518,91 +4513,6 @@ xfs_iext_irec_compact_pages(
4518} 4513}
4519 4514
4520/* 4515/*
4521 * Fully compact the extent records managed by the indirection array.
4522 */
4523void
4524xfs_iext_irec_compact_full(
4525 xfs_ifork_t *ifp) /* inode fork pointer */
4526{
4527 xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */
4528 xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */
4529 int erp_idx = 0; /* extent irec index */
4530 int ext_avail; /* empty entries in ex list */
4531 int ext_diff; /* number of exts to add */
4532 int nlists; /* number of irec's (ex lists) */
4533
4534 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4535
4536 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4537 erp = ifp->if_u1.if_ext_irec;
4538 ep = &erp->er_extbuf[erp->er_extcount];
4539 erp_next = erp + 1;
4540 ep_next = erp_next->er_extbuf;
4541
4542 while (erp_idx < nlists - 1) {
4543 /*
4544 * Check how many extent records are available in this irec.
4545 * If there is none skip the whole exercise.
4546 */
4547 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
4548 if (ext_avail) {
4549
4550 /*
4551 * Copy over as many as possible extent records into
4552 * the previous page.
4553 */
4554 ext_diff = MIN(ext_avail, erp_next->er_extcount);
4555 memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
4556 erp->er_extcount += ext_diff;
4557 erp_next->er_extcount -= ext_diff;
4558
4559 /*
4560 * If the next irec is empty now we can simply
4561 * remove it.
4562 */
4563 if (erp_next->er_extcount == 0) {
4564 /*
4565 * Free page before removing extent record
4566 * so er_extoffs don't get modified in
4567 * xfs_iext_irec_remove.
4568 */
4569 kmem_free(erp_next->er_extbuf);
4570 erp_next->er_extbuf = NULL;
4571 xfs_iext_irec_remove(ifp, erp_idx + 1);
4572 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4573 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4574
4575 /*
4576 * If the next irec is not empty move up the content
4577 * that has not been copied to the previous page to
4578 * the beggining of this one.
4579 */
4580 } else {
4581 memmove(erp_next->er_extbuf, &ep_next[ext_diff],
4582 erp_next->er_extcount *
4583 sizeof(xfs_bmbt_rec_t));
4584 ep_next = erp_next->er_extbuf;
4585 memset(&ep_next[erp_next->er_extcount], 0,
4586 (XFS_LINEAR_EXTS -
4587 erp_next->er_extcount) *
4588 sizeof(xfs_bmbt_rec_t));
4589 }
4590 }
4591
4592 if (erp->er_extcount == XFS_LINEAR_EXTS) {
4593 erp_idx++;
4594 if (erp_idx < nlists)
4595 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4596 else
4597 break;
4598 }
4599 ep = &erp->er_extbuf[erp->er_extcount];
4600 erp_next = erp + 1;
4601 ep_next = erp_next->er_extbuf;
4602 }
4603}
4604
4605/*
4606 * This is called to update the er_extoff field in the indirection 4516 * This is called to update the er_extoff field in the indirection
4607 * array when extents have been added or removed from one of the 4517 * array when extents have been added or removed from one of the
4608 * extent lists. erp_idx contains the irec index to begin updating 4518 * extent lists. erp_idx contains the irec index to begin updating
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 17a04b6321ed..1420c49674d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -87,8 +87,7 @@ typedef struct xfs_ifork {
87 * Flags for xfs_ichgtime(). 87 * Flags for xfs_ichgtime().
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
90#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ 90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
91#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */
92 91
93/* 92/*
94 * Per-fork incore inode flags. 93 * Per-fork incore inode flags.
@@ -204,7 +203,7 @@ typedef struct xfs_inode {
204 struct xfs_inode *i_mprev; /* ptr to prev inode */ 203 struct xfs_inode *i_mprev; /* ptr to prev inode */
205 struct xfs_mount *i_mount; /* fs mount struct ptr */ 204 struct xfs_mount *i_mount; /* fs mount struct ptr */
206 struct list_head i_reclaim; /* reclaim list */ 205 struct list_head i_reclaim; /* reclaim list */
207 bhv_vnode_t *i_vnode; /* vnode backpointer */ 206 struct inode *i_vnode; /* vnode backpointer */
208 struct xfs_dquot *i_udquot; /* user dquot */ 207 struct xfs_dquot *i_udquot; /* user dquot */
209 struct xfs_dquot *i_gdquot; /* group dquot */ 208 struct xfs_dquot *i_gdquot; /* group dquot */
210 209
@@ -223,7 +222,7 @@ typedef struct xfs_inode {
223 struct xfs_inode_log_item *i_itemp; /* logging information */ 222 struct xfs_inode_log_item *i_itemp; /* logging information */
224 mrlock_t i_lock; /* inode lock */ 223 mrlock_t i_lock; /* inode lock */
225 mrlock_t i_iolock; /* inode IO lock */ 224 mrlock_t i_iolock; /* inode IO lock */
226 sema_t i_flock; /* inode flush lock */ 225 struct completion i_flush; /* inode flush completion q */
227 atomic_t i_pincount; /* inode pin count */ 226 atomic_t i_pincount; /* inode pin count */
228 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ 227 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
229 spinlock_t i_flags_lock; /* inode i_flags lock */ 228 spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -263,6 +262,18 @@ typedef struct xfs_inode {
263#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ 262#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
264 (ip)->i_size : (ip)->i_d.di_size; 263 (ip)->i_size : (ip)->i_d.di_size;
265 264
265/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode)
267{
268 return (struct xfs_inode *)inode->i_private;
269}
270
271/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip)
273{
274 return (struct inode *)ip->i_vnode;
275}
276
266/* 277/*
267 * i_flags helper functions 278 * i_flags helper functions
268 */ 279 */
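
XFS_I()/VFS_I() replace the XFS_ITOV()/XFS_ITOV_NULL() macros removed below and rely on the two-way linking done in xfs_iget.c above (inode->i_private pointing at the xfs_inode, ip->i_vnode pointing back). A minimal standalone model of the round trip, with stub structs in place of the kernel types:

    #include <assert.h>

    struct inode     { void *i_private; };          /* stub VFS inode  */
    struct xfs_inode { struct inode *i_vnode; };    /* stub XFS inode  */

    static inline struct xfs_inode *XFS_I(struct inode *inode)
    {
        return (struct xfs_inode *)inode->i_private;
    }

    static inline struct inode *VFS_I(struct xfs_inode *ip)
    {
        return ip->i_vnode;
    }

    int main(void)
    {
        struct inode vfs_ino;
        struct xfs_inode xfs_ino;

        /* mirrors the linking done in xfs_iget.c */
        xfs_ino.i_vnode = &vfs_ino;
        vfs_ino.i_private = &xfs_ino;

        assert(XFS_I(&vfs_ino) == &xfs_ino);
        assert(VFS_I(&xfs_ino) == &vfs_ino);
        return 0;
    }
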
@@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
439#define XFS_ITRUNC_DEFINITE 0x1 450#define XFS_ITRUNC_DEFINITE 0x1
440#define XFS_ITRUNC_MAYBE 0x2 451#define XFS_ITRUNC_MAYBE 0x2
441 452
442#define XFS_ITOV(ip) ((ip)->i_vnode)
443#define XFS_ITOV_NULL(ip) ((ip)->i_vnode)
444
445/* 453/*
446 * For multiple groups support: if S_ISGID bit is set in the parent 454 * For multiple groups support: if S_ISGID bit is set in the parent
447 * directory, group of new file is set to that of the parent, and 455 * directory, group of new file is set to that of the parent, and
@@ -473,11 +481,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
473void xfs_iunlock(xfs_inode_t *, uint); 481void xfs_iunlock(xfs_inode_t *, uint);
474void xfs_ilock_demote(xfs_inode_t *, uint); 482void xfs_ilock_demote(xfs_inode_t *, uint);
475int xfs_isilocked(xfs_inode_t *, uint); 483int xfs_isilocked(xfs_inode_t *, uint);
476void xfs_iflock(xfs_inode_t *);
477int xfs_iflock_nowait(xfs_inode_t *);
478uint xfs_ilock_map_shared(xfs_inode_t *); 484uint xfs_ilock_map_shared(xfs_inode_t *);
479void xfs_iunlock_map_shared(xfs_inode_t *, uint); 485void xfs_iunlock_map_shared(xfs_inode_t *, uint);
480void xfs_ifunlock(xfs_inode_t *);
481void xfs_ireclaim(xfs_inode_t *); 486void xfs_ireclaim(xfs_inode_t *);
482int xfs_finish_reclaim(xfs_inode_t *, int, int); 487int xfs_finish_reclaim(xfs_inode_t *, int, int);
483int xfs_finish_reclaim_all(struct xfs_mount *, int); 488int xfs_finish_reclaim_all(struct xfs_mount *, int);
@@ -522,6 +527,7 @@ void xfs_iflush_all(struct xfs_mount *);
522void xfs_ichgtime(xfs_inode_t *, int); 527void xfs_ichgtime(xfs_inode_t *, int);
523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
524void xfs_lock_inodes(xfs_inode_t **, int, uint); 529void xfs_lock_inodes(xfs_inode_t **, int, uint);
530void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
525 531
526void xfs_synchronize_atime(xfs_inode_t *); 532void xfs_synchronize_atime(xfs_inode_t *);
527void xfs_mark_inode_dirty_sync(xfs_inode_t *); 533void xfs_mark_inode_dirty_sync(xfs_inode_t *);
@@ -570,6 +576,26 @@ extern struct kmem_zone *xfs_ifork_zone;
570extern struct kmem_zone *xfs_inode_zone; 576extern struct kmem_zone *xfs_inode_zone;
571extern struct kmem_zone *xfs_ili_zone; 577extern struct kmem_zone *xfs_ili_zone;
572 578
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
573#endif /* __KERNEL__ */ 599#endif /* __KERNEL__ */
574 600
575#endif /* __XFS_INODE_H__ */ 601#endif /* __XFS_INODE_H__ */
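
The flush "lock" is now a completion: init_completion() starts it at zero (held), and the one-off complete() in xfs_iget.c raises the count to one so the first xfs_iflock() does not block. wait_for_completion() then consumes the count and complete() returns it. A userspace analogue of the same protocol, sketched with a POSIX semaphore rather than the kernel completion API (the xfs_* names are kept only for orientation):

    #include <assert.h>
    #include <semaphore.h>
    #include <stdio.h>

    /* Analogue of ip->i_flush: a count of 1 means "flush lock free". */
    static sem_t i_flush;

    static void iflock(void)        { sem_wait(&i_flush); }               /* xfs_iflock        */
    static int  iflock_nowait(void) { return sem_trywait(&i_flush) == 0; } /* xfs_iflock_nowait */
    static void ifunlock(void)      { sem_post(&i_flush); }               /* xfs_ifunlock      */

    int main(void)
    {
        /* init_completion() plus the one-off complete() ~ starting at 1 */
        sem_init(&i_flush, 0, 1);

        iflock();                  /* first flush attempt does not block */
        assert(!iflock_nowait()); /* flush in progress: try-lock fails  */
        ifunlock();                /* flush reached disk                 */
        assert(iflock_nowait());  /* lock is free again                 */
        ifunlock();

        puts("flush-lock protocol ok");
        return 0;
    }
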
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0eee08a32c26..97c7452e2620 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -779,11 +779,10 @@ xfs_inode_item_pushbuf(
779 ASSERT(iip->ili_push_owner == current_pid()); 779 ASSERT(iip->ili_push_owner == current_pid());
780 780
781 /* 781 /*
782 * If flushlock isn't locked anymore, chances are that the 782 * If a flush is not in progress anymore, chances are that the
783 * inode flush completed and the inode was taken off the AIL. 783 * inode was taken off the AIL. So, just get out.
784 * So, just get out.
785 */ 784 */
786 if (!issemalocked(&(ip->i_flock)) || 785 if (completion_done(&ip->i_flush) ||
787 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 786 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
788 iip->ili_pushbuf_flag = 0; 787 iip->ili_pushbuf_flag = 0;
789 xfs_iunlock(ip, XFS_ILOCK_SHARED); 788 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -805,7 +804,7 @@ xfs_inode_item_pushbuf(
805 * If not, we can flush it async. 804 * If not, we can flush it async.
806 */ 805 */
807 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && 806 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
808 issemalocked(&(ip->i_flock))); 807 !completion_done(&ip->i_flush));
809 iip->ili_pushbuf_flag = 0; 808 iip->ili_pushbuf_flag = 0;
810 xfs_iunlock(ip, XFS_ILOCK_SHARED); 809 xfs_iunlock(ip, XFS_ILOCK_SHARED);
811 xfs_buftrace("INODE ITEM PUSH", bp); 810 xfs_buftrace("INODE ITEM PUSH", bp);
@@ -858,7 +857,7 @@ xfs_inode_item_push(
858 ip = iip->ili_inode; 857 ip = iip->ili_inode;
859 858
860 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 859 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
861 ASSERT(issemalocked(&(ip->i_flock))); 860 ASSERT(!completion_done(&ip->i_flush));
862 /* 861 /*
863 * Since we were able to lock the inode's flush lock and 862 * Since we were able to lock the inode's flush lock and
864 * we found it on the AIL, the inode must be dirty. This 863 * we found it on the AIL, the inode must be dirty. This
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9a3ef9dcaeb9..cf6754a3c5b3 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,7 +59,6 @@ xfs_bulkstat_one_iget(
59{ 59{
60 xfs_icdinode_t *dic; /* dinode core info pointer */ 60 xfs_icdinode_t *dic; /* dinode core info pointer */
61 xfs_inode_t *ip; /* incore inode pointer */ 61 xfs_inode_t *ip; /* incore inode pointer */
62 bhv_vnode_t *vp;
63 int error; 62 int error;
64 63
65 error = xfs_iget(mp, NULL, ino, 64 error = xfs_iget(mp, NULL, ino,
@@ -72,7 +71,6 @@ xfs_bulkstat_one_iget(
72 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
73 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_blkno != (xfs_daddr_t)0);
74 73
75 vp = XFS_ITOV(ip);
76 dic = &ip->i_d; 74 dic = &ip->i_d;
77 75
78 /* xfs_iget returns the following without needing 76 /* xfs_iget returns the following without needing
@@ -85,7 +83,7 @@ xfs_bulkstat_one_iget(
85 buf->bs_uid = dic->di_uid; 83 buf->bs_uid = dic->di_uid;
86 buf->bs_gid = dic->di_gid; 84 buf->bs_gid = dic->di_gid;
87 buf->bs_size = dic->di_size; 85 buf->bs_size = dic->di_size;
88 vn_atime_to_bstime(vp, &buf->bs_atime); 86 vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
89 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 87 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
90 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 88 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
91 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 89 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 91b00a5686cd..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
124STATIC int xlog_iclogs_empty(xlog_t *log); 124STATIC int xlog_iclogs_empty(xlog_t *log);
125 125
126#if defined(XFS_LOG_TRACE) 126#if defined(XFS_LOG_TRACE)
127
128#define XLOG_TRACE_LOGGRANT_SIZE 2048
129#define XLOG_TRACE_ICLOG_SIZE 256
130
131void
132xlog_trace_loggrant_alloc(xlog_t *log)
133{
134 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
135}
136
137void
138xlog_trace_loggrant_dealloc(xlog_t *log)
139{
140 ktrace_free(log->l_grant_trace);
141}
142
127void 143void
128xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 144xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
129{ 145{
130 unsigned long cnts; 146 unsigned long cnts;
131 147
132 if (!log->l_grant_trace) {
133 log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
134 if (!log->l_grant_trace)
135 return;
136 }
137 /* ticket counts are 1 byte each */ 148 /* ticket counts are 1 byte each */
138 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 149 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
139 150
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
157} 168}
158 169
159void 170void
171xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
172{
173 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
174}
175
176void
177xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
178{
179 ktrace_free(iclog->ic_trace);
180}
181
182void
160xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 183xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
161{ 184{
162 if (!iclog->ic_trace)
163 iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
164 ktrace_enter(iclog->ic_trace, 185 ktrace_enter(iclog->ic_trace,
165 (void *)((unsigned long)state), 186 (void *)((unsigned long)state),
166 (void *)((unsigned long)current_pid()), 187 (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
170 (void *)NULL, (void *)NULL); 191 (void *)NULL, (void *)NULL);
171} 192}
172#else 193#else
194
195#define xlog_trace_loggrant_alloc(log)
196#define xlog_trace_loggrant_dealloc(log)
173#define xlog_trace_loggrant(log,tic,string) 197#define xlog_trace_loggrant(log,tic,string)
198
199#define xlog_trace_iclog_alloc(iclog)
200#define xlog_trace_iclog_dealloc(iclog)
174#define xlog_trace_iclog(iclog,state) 201#define xlog_trace_iclog(iclog,state)
202
175#endif /* XFS_LOG_TRACE */ 203#endif /* XFS_LOG_TRACE */
176 204
177 205
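
Hoisting the ktrace_alloc() calls into the *_alloc() helpers above means the trace buffers are created once, at log/iclog setup time and with KM_NOFS, instead of lazily inside the trace points, where an allocation was either failure-prone (KM_NOSLEEP) or could recurse into the filesystem (KM_SLEEP). A stand-in model of the eager pattern (the ktrace struct here is a stub, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    struct ktrace { int kt_nentries; };        /* stub, not the kernel's */

    static struct ktrace *ktrace_alloc(int n)  /* ~ ktrace_alloc(n, KM_NOFS) */
    {
        struct ktrace *kt = calloc(1, sizeof(*kt));
        if (kt)
            kt->kt_nentries = n;
        return kt;
    }

    struct xlog { struct ktrace *l_grant_trace; };

    #define XLOG_TRACE_LOGGRANT_SIZE 2048

    static void xlog_trace_loggrant_alloc(struct xlog *log)
    {
        log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE);
    }

    static void xlog_trace_loggrant(struct xlog *log, const char *string)
    {
        /* No allocation at trace time any more; the buffer either
         * exists already or tracing is simply skipped. */
        if (log->l_grant_trace)
            printf("trace[%d]: %s\n", log->l_grant_trace->kt_nentries, string);
    }

    int main(void)
    {
        struct xlog log = { 0 };

        xlog_trace_loggrant_alloc(&log);   /* done once, in xlog_alloc_log() */
        xlog_trace_loggrant(&log, "xfs_log_done: (non-switched)");
        free(log.l_grant_trace);           /* ~ xlog_trace_loggrant_dealloc() */
        return 0;
    }
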
@@ -336,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp,
336 } else { 364 } else {
337 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
338 xlog_regrant_reserve_log_space(log, ticket); 366 xlog_regrant_reserve_log_space(log, ticket);
339 } 367 /* If this ticket was a permanent reservation and we aren't
340 368 * trying to release it, reset the inited flags; so next time
341 /* If this ticket was a permanent reservation and we aren't 369 * we write, a start record will be written out.
342 * trying to release it, reset the inited flags; so next time 370 */
343 * we write, a start record will be written out.
344 */
345 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
346 (flags & XFS_LOG_REL_PERM_RESERV) == 0)
347 ticket->t_flags |= XLOG_TIC_INITED; 371 ticket->t_flags |= XLOG_TIC_INITED;
372 }
348 373
349 return lsn; 374 return lsn;
350} /* xfs_log_done */ 375} /* xfs_log_done */
@@ -357,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp,
357 * Asynchronous forces are implemented by setting the WANT_SYNC 382 * Asynchronous forces are implemented by setting the WANT_SYNC
358 * bit in the appropriate in-core log and then returning. 383 * bit in the appropriate in-core log and then returning.
359 * 384 *
360 * Synchronous forces are implemented with a semaphore. All callers 385 * Synchronous forces are implemented with a signal variable. All callers
 361 * to force a given lsn to disk will wait on a semaphore attached to the 386 * to force a given lsn to disk will wait on the sv attached to the
 362 * specific in-core log. When the given in-core log finally completes its 387 * specific in-core log. When the given in-core log finally completes its
363 * write to disk, that thread will wake up all threads waiting on the 388 * write to disk, that thread will wake up all threads waiting on the
364 * semaphore. 389 * sv.
365 */ 390 */
366int 391int
367_xfs_log_force( 392_xfs_log_force(
@@ -588,12 +613,12 @@ error:
588 * mp - ubiquitous xfs mount point structure 613 * mp - ubiquitous xfs mount point structure
589 */ 614 */
590int 615int
591xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) 616xfs_log_mount_finish(xfs_mount_t *mp)
592{ 617{
593 int error; 618 int error;
594 619
595 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 620 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
596 error = xlog_recover_finish(mp->m_log, mfsi_flags); 621 error = xlog_recover_finish(mp->m_log);
597 else { 622 else {
598 error = 0; 623 error = 0;
599 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 624 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -707,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
707 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 732 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
708 iclog->ic_state == XLOG_STATE_DIRTY)) { 733 iclog->ic_state == XLOG_STATE_DIRTY)) {
709 if (!XLOG_FORCED_SHUTDOWN(log)) { 734 if (!XLOG_FORCED_SHUTDOWN(log)) {
710 sv_wait(&iclog->ic_forcesema, PMEM, 735 sv_wait(&iclog->ic_force_wait, PMEM,
711 &log->l_icloglock, s); 736 &log->l_icloglock, s);
712 } else { 737 } else {
713 spin_unlock(&log->l_icloglock); 738 spin_unlock(&log->l_icloglock);
@@ -748,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
748 || iclog->ic_state == XLOG_STATE_DIRTY 773 || iclog->ic_state == XLOG_STATE_DIRTY
749 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 774 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
750 775
751 sv_wait(&iclog->ic_forcesema, PMEM, 776 sv_wait(&iclog->ic_force_wait, PMEM,
752 &log->l_icloglock, s); 777 &log->l_icloglock, s);
753 } else { 778 } else {
754 spin_unlock(&log->l_icloglock); 779 spin_unlock(&log->l_icloglock);
@@ -838,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
838 break; 863 break;
839 tail_lsn = 0; 864 tail_lsn = 0;
840 free_bytes -= tic->t_unit_res; 865 free_bytes -= tic->t_unit_res;
841 sv_signal(&tic->t_sema); 866 sv_signal(&tic->t_wait);
842 tic = tic->t_next; 867 tic = tic->t_next;
843 } while (tic != log->l_write_headq); 868 } while (tic != log->l_write_headq);
844 } 869 }
@@ -859,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
859 break; 884 break;
860 tail_lsn = 0; 885 tail_lsn = 0;
861 free_bytes -= need_bytes; 886 free_bytes -= need_bytes;
862 sv_signal(&tic->t_sema); 887 sv_signal(&tic->t_wait);
863 tic = tic->t_next; 888 tic = tic->t_next;
864 } while (tic != log->l_reserve_headq); 889 } while (tic != log->l_reserve_headq);
865 } 890 }
@@ -1008,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
1008 l = iclog->ic_log; 1033 l = iclog->ic_log;
1009 1034
1010 /* 1035 /*
1011 * If the ordered flag has been removed by a lower 1036 * If the _XFS_BARRIER_FAILED flag was set by a lower
1012 * layer, it means the underlyin device no longer supports 1037 * layer, it means the underlying device no longer supports
1013 * barrier I/O. Warn loudly and turn off barriers. 1038 * barrier I/O. Warn loudly and turn off barriers.
1014 */ 1039 */
1015 if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { 1040 if (bp->b_flags & _XFS_BARRIER_FAILED) {
1041 bp->b_flags &= ~_XFS_BARRIER_FAILED;
1016 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1042 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
1017 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1043 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1018 "xlog_iodone: Barriers are no longer supported" 1044 "xlog_iodone: Barriers are no longer supported"
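
Checking an explicit _XFS_BARRIER_FAILED flag replaces inferring failure from the ORDERED flag having been stripped, an indirect signal that could be ambiguous. A compilable model of the new handshake (flag values are stand-ins; the control flow follows the diff):

    #include <assert.h>
    #include <stdio.h>

    #define XBF_ORDERED          0x1u   /* stand-in values */
    #define _XFS_BARRIER_FAILED  0x2u
    #define XFS_MOUNT_BARRIER    0x4u

    struct xfs_buf { unsigned int b_flags; };

    /* Shape of the new xlog_iodone() check from the diff. */
    static void xlog_iodone(struct xfs_buf *bp, unsigned int *m_flags)
    {
        if (bp->b_flags & _XFS_BARRIER_FAILED) {
            bp->b_flags &= ~_XFS_BARRIER_FAILED;  /* consume the signal    */
            *m_flags &= ~XFS_MOUNT_BARRIER;       /* stop issuing barriers */
            fprintf(stderr, "barriers no longer supported, turning off\n");
        }
    }

    int main(void)
    {
        unsigned int m_flags = XFS_MOUNT_BARRIER;
        struct xfs_buf bp = { .b_flags = XBF_ORDERED | _XFS_BARRIER_FAILED };

        xlog_iodone(&bp, &m_flags);
        assert(!(m_flags & XFS_MOUNT_BARRIER));      /* barriers disabled */
        assert(!(bp.b_flags & _XFS_BARRIER_FAILED)); /* flag consumed     */
        return 0;
    }
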
@@ -1234,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1234 spin_lock_init(&log->l_grant_lock); 1260 spin_lock_init(&log->l_grant_lock);
1235 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1261 sv_init(&log->l_flush_wait, 0, "flush_wait");
1236 1262
1263 xlog_trace_loggrant_alloc(log);
1237 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1264 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1238 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1265 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1239 1266
@@ -1285,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t *mp,
1285 1312
1286 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1313 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1287 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1314 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1288 sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); 1315 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1289 sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); 1316 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1317
1318 xlog_trace_iclog_alloc(iclog);
1290 1319
1291 iclogp = &iclog->ic_next; 1320 iclogp = &iclog->ic_next;
1292 } 1321 }
@@ -1565,14 +1594,10 @@ xlog_dealloc_log(xlog_t *log)
1565 1594
1566 iclog = log->l_iclog; 1595 iclog = log->l_iclog;
1567 for (i=0; i<log->l_iclog_bufs; i++) { 1596 for (i=0; i<log->l_iclog_bufs; i++) {
1568 sv_destroy(&iclog->ic_forcesema); 1597 sv_destroy(&iclog->ic_force_wait);
1569 sv_destroy(&iclog->ic_writesema); 1598 sv_destroy(&iclog->ic_write_wait);
1570 xfs_buf_free(iclog->ic_bp); 1599 xfs_buf_free(iclog->ic_bp);
1571#ifdef XFS_LOG_TRACE 1600 xlog_trace_iclog_dealloc(iclog);
1572 if (iclog->ic_trace != NULL) {
1573 ktrace_free(iclog->ic_trace);
1574 }
1575#endif
1576 next_iclog = iclog->ic_next; 1601 next_iclog = iclog->ic_next;
1577 kmem_free(iclog); 1602 kmem_free(iclog);
1578 iclog = next_iclog; 1603 iclog = next_iclog;
@@ -1581,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log)
1581 spinlock_destroy(&log->l_grant_lock); 1606 spinlock_destroy(&log->l_grant_lock);
1582 1607
1583 xfs_buf_free(log->l_xbuf); 1608 xfs_buf_free(log->l_xbuf);
1584#ifdef XFS_LOG_TRACE 1609 xlog_trace_loggrant_dealloc(log);
1585 if (log->l_trace != NULL) {
1586 ktrace_free(log->l_trace);
1587 }
1588 if (log->l_grant_trace != NULL) {
1589 ktrace_free(log->l_grant_trace);
1590 }
1591#endif
1592 log->l_mp->m_log = NULL; 1610 log->l_mp->m_log = NULL;
1593 kmem_free(log); 1611 kmem_free(log);
1594} /* xlog_dealloc_log */ 1612} /* xlog_dealloc_log */
@@ -1976,7 +1994,7 @@ xlog_write(xfs_mount_t * mp,
1976/* Clean iclogs starting from the head. This ordering must be 1994/* Clean iclogs starting from the head. This ordering must be
1977 * maintained, so an iclog doesn't become ACTIVE beyond one that 1995 * maintained, so an iclog doesn't become ACTIVE beyond one that
1978 * is SYNCING. This is also required to maintain the notion that we use 1996 * is SYNCING. This is also required to maintain the notion that we use
 1979 * a counting semaphore to hold off would be writers to the log when every 1997 * an ordered wait queue to hold off would-be writers to the log when every
1980 * iclog is trying to sync to disk. 1998 * iclog is trying to sync to disk.
1981 * 1999 *
1982 * State Change: DIRTY -> ACTIVE 2000 * State Change: DIRTY -> ACTIVE
@@ -2240,7 +2258,7 @@ xlog_state_do_callback(
2240 xlog_state_clean_log(log); 2258 xlog_state_clean_log(log);
2241 2259
2242 /* wake up threads waiting in xfs_log_force() */ 2260 /* wake up threads waiting in xfs_log_force() */
2243 sv_broadcast(&iclog->ic_forcesema); 2261 sv_broadcast(&iclog->ic_force_wait);
2244 2262
2245 iclog = iclog->ic_next; 2263 iclog = iclog->ic_next;
2246 } while (first_iclog != iclog); 2264 } while (first_iclog != iclog);
@@ -2302,8 +2320,7 @@ xlog_state_do_callback(
2302 * the second completion goes through. 2320 * the second completion goes through.
2303 * 2321 *
2304 * Callbacks could take time, so they are done outside the scope of the 2322 * Callbacks could take time, so they are done outside the scope of the
2305 * global state machine log lock. Assume that the calls to cvsema won't 2323 * global state machine log lock.
2306 * take a long time. At least we know it won't sleep.
2307 */ 2324 */
2308STATIC void 2325STATIC void
2309xlog_state_done_syncing( 2326xlog_state_done_syncing(
@@ -2339,7 +2356,7 @@ xlog_state_done_syncing(
2339 * iclog buffer, we wake them all, one will get to do the 2356 * iclog buffer, we wake them all, one will get to do the
2340 * I/O, the others get to wait for the result. 2357 * I/O, the others get to wait for the result.
2341 */ 2358 */
2342 sv_broadcast(&iclog->ic_writesema); 2359 sv_broadcast(&iclog->ic_write_wait);
2343 spin_unlock(&log->l_icloglock); 2360 spin_unlock(&log->l_icloglock);
2344 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2361 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2345} /* xlog_state_done_syncing */ 2362} /* xlog_state_done_syncing */
@@ -2347,11 +2364,9 @@ xlog_state_done_syncing(
2347 2364
2348/* 2365/*
2349 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2366 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2350 * sleep. The flush semaphore is set to the number of in-core buffers and 2367 * sleep. We wait on the flush queue on the head iclog as that should be
2351 * decremented around disk syncing. Therefore, if all buffers are syncing, 2368 * the first iclog to complete flushing. Hence if all iclogs are syncing,
2352 * this semaphore will cause new writes to sleep until a sync completes. 2369 * we will wait here and all new writes will sleep until a sync completes.
2353 * Otherwise, this code just does p() followed by v(). This approximates
2354 * a sleep/wakeup except we can't race.
2355 * 2370 *
2356 * The in-core logs are used in a circular fashion. They are not used 2371 * The in-core logs are used in a circular fashion. They are not used
2357 * out-of-order even when an iclog past the head is free. 2372 * out-of-order even when an iclog past the head is free.
@@ -2508,7 +2523,7 @@ xlog_grant_log_space(xlog_t *log,
2508 goto error_return; 2523 goto error_return;
2509 2524
2510 XFS_STATS_INC(xs_sleep_logspace); 2525 XFS_STATS_INC(xs_sleep_logspace);
2511 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2526 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2512 /* 2527 /*
2513 * If we got an error, and the filesystem is shutting down, 2528 * If we got an error, and the filesystem is shutting down,
2514 * we'll catch it down below. So just continue... 2529 * we'll catch it down below. So just continue...
@@ -2534,7 +2549,7 @@ redo:
2534 xlog_trace_loggrant(log, tic, 2549 xlog_trace_loggrant(log, tic,
2535 "xlog_grant_log_space: sleep 2"); 2550 "xlog_grant_log_space: sleep 2");
2536 XFS_STATS_INC(xs_sleep_logspace); 2551 XFS_STATS_INC(xs_sleep_logspace);
2537 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2552 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2538 2553
2539 if (XLOG_FORCED_SHUTDOWN(log)) { 2554 if (XLOG_FORCED_SHUTDOWN(log)) {
2540 spin_lock(&log->l_grant_lock); 2555 spin_lock(&log->l_grant_lock);
@@ -2633,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2633 if (free_bytes < ntic->t_unit_res) 2648 if (free_bytes < ntic->t_unit_res)
2634 break; 2649 break;
2635 free_bytes -= ntic->t_unit_res; 2650 free_bytes -= ntic->t_unit_res;
2636 sv_signal(&ntic->t_sema); 2651 sv_signal(&ntic->t_wait);
2637 ntic = ntic->t_next; 2652 ntic = ntic->t_next;
2638 } while (ntic != log->l_write_headq); 2653 } while (ntic != log->l_write_headq);
2639 2654
@@ -2644,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2644 xlog_trace_loggrant(log, tic, 2659 xlog_trace_loggrant(log, tic,
2645 "xlog_regrant_write_log_space: sleep 1"); 2660 "xlog_regrant_write_log_space: sleep 1");
2646 XFS_STATS_INC(xs_sleep_logspace); 2661 XFS_STATS_INC(xs_sleep_logspace);
2647 sv_wait(&tic->t_sema, PINOD|PLTWAIT, 2662 sv_wait(&tic->t_wait, PINOD|PLTWAIT,
2648 &log->l_grant_lock, s); 2663 &log->l_grant_lock, s);
2649 2664
2650 /* If we're shutting down, this tic is already 2665 /* If we're shutting down, this tic is already
@@ -2673,7 +2688,7 @@ redo:
2673 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2688 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2674 xlog_ins_ticketq(&log->l_write_headq, tic); 2689 xlog_ins_ticketq(&log->l_write_headq, tic);
2675 XFS_STATS_INC(xs_sleep_logspace); 2690 XFS_STATS_INC(xs_sleep_logspace);
2676 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2691 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2677 2692
2678 /* If we're shutting down, this tic is already off the queue */ 2693 /* If we're shutting down, this tic is already off the queue */
2679 if (XLOG_FORCED_SHUTDOWN(log)) { 2694 if (XLOG_FORCED_SHUTDOWN(log)) {
@@ -2916,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t *log,
 2916 * 2. the current iclog is dirty, and the previous iclog is in the 2931 * 2. the current iclog is dirty, and the previous iclog is in the
2917 * active or dirty state. 2932 * active or dirty state.
2918 * 2933 *
2919 * We may sleep (call psema) if: 2934 * We may sleep if:
2920 * 2935 *
2921 * 1. the current iclog is not in the active nor dirty state. 2936 * 1. the current iclog is not in the active nor dirty state.
 2922 * 2. the current iclog is dirty, and the previous iclog is not in the 2937 * 2. the current iclog is dirty, and the previous iclog is not in the
@@ -3013,7 +3028,7 @@ maybe_sleep:
3013 return XFS_ERROR(EIO); 3028 return XFS_ERROR(EIO);
3014 } 3029 }
3015 XFS_STATS_INC(xs_log_force_sleep); 3030 XFS_STATS_INC(xs_log_force_sleep);
3016 sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); 3031 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
3017 /* 3032 /*
3018 * No need to grab the log lock here since we're 3033 * No need to grab the log lock here since we're
3019 * only deciding whether or not to return EIO 3034 * only deciding whether or not to return EIO
@@ -3096,7 +3111,7 @@ try_again:
3096 XLOG_STATE_SYNCING))) { 3111 XLOG_STATE_SYNCING))) {
3097 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3112 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3098 XFS_STATS_INC(xs_log_force_sleep); 3113 XFS_STATS_INC(xs_log_force_sleep);
3099 sv_wait(&iclog->ic_prev->ic_writesema, PSWP, 3114 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
3100 &log->l_icloglock, s); 3115 &log->l_icloglock, s);
3101 *log_flushed = 1; 3116 *log_flushed = 1;
3102 already_slept = 1; 3117 already_slept = 1;
@@ -3116,7 +3131,7 @@ try_again:
3116 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3131 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3117 3132
3118 /* 3133 /*
3119 * Don't wait on the forcesema if we know that we've 3134 * Don't wait on completion if we know that we've
3120 * gotten a log write error. 3135 * gotten a log write error.
3121 */ 3136 */
3122 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3137 if (iclog->ic_state & XLOG_STATE_IOERROR) {
@@ -3124,7 +3139,7 @@ try_again:
3124 return XFS_ERROR(EIO); 3139 return XFS_ERROR(EIO);
3125 } 3140 }
3126 XFS_STATS_INC(xs_log_force_sleep); 3141 XFS_STATS_INC(xs_log_force_sleep);
3127 sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); 3142 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3128 /* 3143 /*
3129 * No need to grab the log lock here since we're 3144 * No need to grab the log lock here since we're
3130 * only deciding whether or not to return EIO 3145 * only deciding whether or not to return EIO
@@ -3180,7 +3195,7 @@ STATIC void
3180xlog_ticket_put(xlog_t *log, 3195xlog_ticket_put(xlog_t *log,
3181 xlog_ticket_t *ticket) 3196 xlog_ticket_t *ticket)
3182{ 3197{
3183 sv_destroy(&ticket->t_sema); 3198 sv_destroy(&ticket->t_wait);
3184 kmem_zone_free(xfs_log_ticket_zone, ticket); 3199 kmem_zone_free(xfs_log_ticket_zone, ticket);
3185} /* xlog_ticket_put */ 3200} /* xlog_ticket_put */
3186 3201
@@ -3270,7 +3285,7 @@ xlog_ticket_get(xlog_t *log,
3270 tic->t_trans_type = 0; 3285 tic->t_trans_type = 0;
3271 if (xflags & XFS_LOG_PERM_RESERV) 3286 if (xflags & XFS_LOG_PERM_RESERV)
3272 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3287 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3273 sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); 3288 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
3274 3289
3275 xlog_tic_reset_res(tic); 3290 xlog_tic_reset_res(tic);
3276 3291
@@ -3557,14 +3572,14 @@ xfs_log_force_umount(
3557 */ 3572 */
3558 if ((tic = log->l_reserve_headq)) { 3573 if ((tic = log->l_reserve_headq)) {
3559 do { 3574 do {
3560 sv_signal(&tic->t_sema); 3575 sv_signal(&tic->t_wait);
3561 tic = tic->t_next; 3576 tic = tic->t_next;
3562 } while (tic != log->l_reserve_headq); 3577 } while (tic != log->l_reserve_headq);
3563 } 3578 }
3564 3579
3565 if ((tic = log->l_write_headq)) { 3580 if ((tic = log->l_write_headq)) {
3566 do { 3581 do {
3567 sv_signal(&tic->t_sema); 3582 sv_signal(&tic->t_wait);
3568 tic = tic->t_next; 3583 tic = tic->t_next;
3569 } while (tic != log->l_write_headq); 3584 } while (tic != log->l_write_headq);
3570 } 3585 }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d1d678ecb63e..d47b91f10822 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -149,7 +149,7 @@ int xfs_log_mount(struct xfs_mount *mp,
149 struct xfs_buftarg *log_target, 149 struct xfs_buftarg *log_target,
150 xfs_daddr_t start_block, 150 xfs_daddr_t start_block,
151 int num_bblocks); 151 int num_bblocks);
152int xfs_log_mount_finish(struct xfs_mount *mp, int); 152int xfs_log_mount_finish(struct xfs_mount *mp);
153void xfs_log_move_tail(struct xfs_mount *mp, 153void xfs_log_move_tail(struct xfs_mount *mp,
154 xfs_lsn_t tail_lsn); 154 xfs_lsn_t tail_lsn);
155int xfs_log_notify(struct xfs_mount *mp, 155int xfs_log_notify(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 6245913196b4..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -241,7 +241,7 @@ typedef struct xlog_res {
241} xlog_res_t; 241} xlog_res_t;
242 242
243typedef struct xlog_ticket { 243typedef struct xlog_ticket {
244 sv_t t_sema; /* sleep on this semaphore : 20 */ 244 sv_t t_wait; /* ticket wait queue : 20 */
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
@@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header {
314 * xlog_rec_header_t into the reserved space. 314 * xlog_rec_header_t into the reserved space.
315 * - ic_data follows, so a write to disk can start at the beginning of 315 * - ic_data follows, so a write to disk can start at the beginning of
316 * the iclog. 316 * the iclog.
 317 * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. 317 * - ic_force_wait is used to implement synchronous forcing of the iclog to disk.
318 * - ic_next is the pointer to the next iclog in the ring. 318 * - ic_next is the pointer to the next iclog in the ring.
319 * - ic_bp is a pointer to the buffer used to write this incore log to disk. 319 * - ic_bp is a pointer to the buffer used to write this incore log to disk.
320 * - ic_log is a pointer back to the global log structure. 320 * - ic_log is a pointer back to the global log structure.
@@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header {
339 * and move everything else out to subsequent cachelines. 339 * and move everything else out to subsequent cachelines.
340 */ 340 */
341typedef struct xlog_iclog_fields { 341typedef struct xlog_iclog_fields {
342 sv_t ic_forcesema; 342 sv_t ic_force_wait;
343 sv_t ic_writesema; 343 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 344 struct xlog_in_core *ic_next;
345 struct xlog_in_core *ic_prev; 345 struct xlog_in_core *ic_prev;
346 struct xfs_buf *ic_bp; 346 struct xfs_buf *ic_bp;
@@ -377,8 +377,8 @@ typedef struct xlog_in_core {
377/* 377/*
378 * Defines to save our code from this glop. 378 * Defines to save our code from this glop.
379 */ 379 */
380#define ic_forcesema hic_fields.ic_forcesema 380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_writesema hic_fields.ic_writesema 381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next 382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev 383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp 384#define ic_bp hic_fields.ic_bp
@@ -448,7 +448,6 @@ typedef struct log {
448 int l_grant_write_bytes; 448 int l_grant_write_bytes;
449 449
450#ifdef XFS_LOG_TRACE 450#ifdef XFS_LOG_TRACE
451 struct ktrace *l_trace;
452 struct ktrace *l_grant_trace; 451 struct ktrace *l_grant_trace;
453#endif 452#endif
454 453
@@ -468,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log,
468 xfs_daddr_t *head_blk, 467 xfs_daddr_t *head_blk,
469 xfs_daddr_t *tail_blk); 468 xfs_daddr_t *tail_blk);
470extern int xlog_recover(xlog_t *log); 469extern int xlog_recover(xlog_t *log);
471extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); 470extern int xlog_recover_finish(xlog_t *log);
472extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 471extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
473extern void xlog_recover_process_iunlinks(xlog_t *log); 472extern void xlog_recover_process_iunlinks(xlog_t *log);
474 473
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9eb722ec744e..82d46ce69d5f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3940,8 +3940,7 @@ xlog_recover(
3940 */ 3940 */
3941int 3941int
3942xlog_recover_finish( 3942xlog_recover_finish(
3943 xlog_t *log, 3943 xlog_t *log)
3944 int mfsi_flags)
3945{ 3944{
3946 /* 3945 /*
3947 * Now we're ready to do the transactions needed for the 3946 * Now we're ready to do the transactions needed for the
@@ -3969,9 +3968,7 @@ xlog_recover_finish(
3969 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3968 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3970 (XFS_LOG_FORCE | XFS_LOG_SYNC)); 3969 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3971 3970
3972 if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { 3971 xlog_recover_process_iunlinks(log);
3973 xlog_recover_process_iunlinks(log);
3974 }
3975 3972
3976 xlog_recover_check_summary(log); 3973 xlog_recover_check_summary(log);
3977 3974
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6c5d1325e7f6..a4503f5e9497 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -128,7 +128,7 @@ static const struct {
128 * initialized. 128 * initialized.
129 */ 129 */
130STATIC void 130STATIC void
131xfs_mount_free( 131xfs_free_perag(
132 xfs_mount_t *mp) 132 xfs_mount_t *mp)
133{ 133{
134 if (mp->m_perag) { 134 if (mp->m_perag) {
@@ -139,20 +139,6 @@ xfs_mount_free(
139 kmem_free(mp->m_perag[agno].pagb_list); 139 kmem_free(mp->m_perag[agno].pagb_list);
140 kmem_free(mp->m_perag); 140 kmem_free(mp->m_perag);
141 } 141 }
142
143 spinlock_destroy(&mp->m_ail_lock);
144 spinlock_destroy(&mp->m_sb_lock);
145 mutex_destroy(&mp->m_ilock);
146 mutex_destroy(&mp->m_growlock);
147 if (mp->m_quotainfo)
148 XFS_QM_DONE(mp);
149
150 if (mp->m_fsname != NULL)
151 kmem_free(mp->m_fsname);
152 if (mp->m_rtname != NULL)
153 kmem_free(mp->m_rtname);
154 if (mp->m_logname != NULL)
155 kmem_free(mp->m_logname);
156} 142}
157 143
158/* 144/*
@@ -704,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
704 * Update alignment values based on mount options and sb values 690 * Update alignment values based on mount options and sb values
705 */ 691 */
706STATIC int 692STATIC int
707xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags) 693xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
708{ 694{
709 xfs_sb_t *sbp = &(mp->m_sb); 695 xfs_sb_t *sbp = &(mp->m_sb);
710 696
711 if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { 697 if (mp->m_dalign) {
712 /* 698 /*
713 * If stripe unit and stripe width are not multiples 699 * If stripe unit and stripe width are not multiples
714 * of the fs blocksize turn off alignment. 700 * of the fs blocksize turn off alignment.
@@ -864,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
864 * Check that the data (and log if separate) are an ok size. 850 * Check that the data (and log if separate) are an ok size.
865 */ 851 */
866STATIC int 852STATIC int
867xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) 853xfs_check_sizes(xfs_mount_t *mp)
868{ 854{
869 xfs_buf_t *bp; 855 xfs_buf_t *bp;
870 xfs_daddr_t d; 856 xfs_daddr_t d;
@@ -887,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
887 return error; 873 return error;
888 } 874 }
889 875
890 if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && 876 if (mp->m_logdev_targp != mp->m_ddev_targp) {
891 mp->m_logdev_targp != mp->m_ddev_targp) {
892 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 877 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
893 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 878 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
894 cmn_err(CE_WARN, "XFS: size check 3 failed"); 879 cmn_err(CE_WARN, "XFS: size check 3 failed");
@@ -923,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
923 */ 908 */
924int 909int
925xfs_mountfs( 910xfs_mountfs(
926 xfs_mount_t *mp, 911 xfs_mount_t *mp)
927 int mfsi_flags)
928{ 912{
929 xfs_sb_t *sbp = &(mp->m_sb); 913 xfs_sb_t *sbp = &(mp->m_sb);
930 xfs_inode_t *rip; 914 xfs_inode_t *rip;
931 __uint64_t resblks; 915 __uint64_t resblks;
932 __int64_t update_flags = 0LL; 916 __int64_t update_flags = 0LL;
933 uint quotamount, quotaflags; 917 uint quotamount, quotaflags;
934 int agno;
935 int uuid_mounted = 0; 918 int uuid_mounted = 0;
936 int error = 0; 919 int error = 0;
937 920
@@ -985,7 +968,7 @@ xfs_mountfs(
985 * allocator alignment is within an ag, therefore ag has 968 * allocator alignment is within an ag, therefore ag has
986 * to be aligned at stripe boundary. 969 * to be aligned at stripe boundary.
987 */ 970 */
988 error = xfs_update_alignment(mp, mfsi_flags, &update_flags); 971 error = xfs_update_alignment(mp, &update_flags);
989 if (error) 972 if (error)
990 goto error1; 973 goto error1;
991 974
@@ -1004,8 +987,7 @@ xfs_mountfs(
1004 * since a single partition filesystem is identical to a single 987 * since a single partition filesystem is identical to a single
1005 * partition volume/filesystem. 988 * partition volume/filesystem.
1006 */ 989 */
1007 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && 990 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
1008 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
1009 if (xfs_uuid_mount(mp)) { 991 if (xfs_uuid_mount(mp)) {
1010 error = XFS_ERROR(EINVAL); 992 error = XFS_ERROR(EINVAL);
1011 goto error1; 993 goto error1;
@@ -1033,7 +1015,7 @@ xfs_mountfs(
1033 /* 1015 /*
1034 * Check that the data (and log if separate) are an ok size. 1016 * Check that the data (and log if separate) are an ok size.
1035 */ 1017 */
1036 error = xfs_check_sizes(mp, mfsi_flags); 1018 error = xfs_check_sizes(mp);
1037 if (error) 1019 if (error)
1038 goto error1; 1020 goto error1;
1039 1021
@@ -1047,13 +1029,6 @@ xfs_mountfs(
1047 } 1029 }
1048 1030
1049 /* 1031 /*
1050 * For client case we are done now
1051 */
1052 if (mfsi_flags & XFS_MFSI_CLIENT) {
1053 return 0;
1054 }
1055
1056 /*
1057 * Copies the low order bits of the timestamp and the randomly 1032 * Copies the low order bits of the timestamp and the randomly
1058 * set "sequence" number out of a UUID. 1033 * set "sequence" number out of a UUID.
1059 */ 1034 */
@@ -1077,8 +1052,10 @@ xfs_mountfs(
1077 * Allocate and initialize the per-ag data. 1052 * Allocate and initialize the per-ag data.
1078 */ 1053 */
1079 init_rwsem(&mp->m_peraglock); 1054 init_rwsem(&mp->m_peraglock);
1080 mp->m_perag = 1055 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
1081 kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); 1056 KM_MAYFAIL);
1057 if (!mp->m_perag)
1058 goto error1;
1082 1059
1083 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); 1060 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1084 1061
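The per-ag table allocation above also switches from KM_SLEEP to
KM_MAYFAIL. A corrupt or hostile superblock can advertise a huge
sb_agcount; with KM_SLEEP the allocator keeps retrying and the mount
wedges, while KM_MAYFAIL returns NULL so the mount can fail cleanly.
A hedged sketch of the pattern (the KM_SLEEP retry-forever semantics
are an assumption about the xfs kmem wrappers, not shown in this patch):

	xfs_perag_t	*tbl;

	tbl = kmem_zalloc(count * sizeof(xfs_perag_t), KM_MAYFAIL);
	if (!tbl)
		goto error1;	/* fail the mount instead of looping forever */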
@@ -1190,7 +1167,7 @@ xfs_mountfs(
1190 * delayed until after the root and real-time bitmap inodes 1167 * delayed until after the root and real-time bitmap inodes
1191 * were consistently read in. 1168 * were consistently read in.
1192 */ 1169 */
1193 error = xfs_log_mount_finish(mp, mfsi_flags); 1170 error = xfs_log_mount_finish(mp);
1194 if (error) { 1171 if (error) {
1195 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1172 cmn_err(CE_WARN, "XFS: log mount finish failed");
1196 goto error4; 1173 goto error4;
@@ -1199,7 +1176,7 @@ xfs_mountfs(
1199 /* 1176 /*
1200 * Complete the quota initialisation, post-log-replay component. 1177 * Complete the quota initialisation, post-log-replay component.
1201 */ 1178 */
1202 error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags); 1179 error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
1203 if (error) 1180 if (error)
1204 goto error4; 1181 goto error4;
1205 1182
@@ -1233,12 +1210,7 @@ xfs_mountfs(
1233 error3: 1210 error3:
1234 xfs_log_unmount_dealloc(mp); 1211 xfs_log_unmount_dealloc(mp);
1235 error2: 1212 error2:
1236 for (agno = 0; agno < sbp->sb_agcount; agno++) 1213 xfs_free_perag(mp);
1237 if (mp->m_perag[agno].pagb_list)
1238 kmem_free(mp->m_perag[agno].pagb_list);
1239 kmem_free(mp->m_perag);
1240 mp->m_perag = NULL;
1241 /* FALLTHROUGH */
1242 error1: 1214 error1:
1243 if (uuid_mounted) 1215 if (uuid_mounted)
1244 uuid_table_remove(&mp->m_sb.sb_uuid); 1216 uuid_table_remove(&mp->m_sb.sb_uuid);
@@ -1246,16 +1218,17 @@ xfs_mountfs(
1246} 1218}
1247 1219
1248/* 1220/*
1249 * xfs_unmountfs
1250 *
1251 * This flushes out the inodes, dquots and the superblock, unmounts the 1221 * This flushes out the inodes, dquots and the superblock, unmounts the
1252 * log and makes sure that incore structures are freed. 1222 * log and makes sure that incore structures are freed.
1253 */ 1223 */
1254int 1224void
1255xfs_unmountfs(xfs_mount_t *mp) 1225xfs_unmountfs(
1226 struct xfs_mount *mp)
1256{ 1227{
1257 __uint64_t resblks; 1228 __uint64_t resblks;
1258 int error = 0; 1229 int error;
1230
1231 IRELE(mp->m_rootip);
1259 1232
1260 /* 1233 /*
1261 * We can potentially deadlock here if we have an inode cluster 1234 * We can potentially deadlock here if we have an inode cluster
@@ -1312,8 +1285,6 @@ xfs_unmountfs(xfs_mount_t *mp)
1312 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1285 xfs_unmountfs_wait(mp); /* wait for async bufs */
1313 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1286 xfs_log_unmount(mp); /* Done! No more fs ops. */
1314 1287
1315 xfs_freesb(mp);
1316
1317 /* 1288 /*
1318 * All inodes from this mount point should be freed. 1289 * All inodes from this mount point should be freed.
1319 */ 1290 */
@@ -1322,11 +1293,12 @@ xfs_unmountfs(xfs_mount_t *mp)
1322 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1323 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1324 1295
1325#if defined(DEBUG) || defined(INDUCE_IO_ERROR) 1296#if defined(DEBUG)
1326 xfs_errortag_clearall(mp, 0); 1297 xfs_errortag_clearall(mp, 0);
1327#endif 1298#endif
1328 xfs_mount_free(mp); 1299 xfs_free_perag(mp);
1329 return 0; 1300 if (mp->m_quotainfo)
1301 XFS_QM_DONE(mp);
1330} 1302}
1331 1303
1332STATIC void 1304STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5269bd6e3df0..f3c1024b1241 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -114,7 +114,7 @@ struct xfs_dqtrxops;
114struct xfs_quotainfo; 114struct xfs_quotainfo;
115 115
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); 117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 118typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 119typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 120typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
@@ -158,8 +158,8 @@ typedef struct xfs_qmops {
158 158
159#define XFS_QM_INIT(mp, mnt, fl) \ 159#define XFS_QM_INIT(mp, mnt, fl) \
160 (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) 160 (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
161#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ 161#define XFS_QM_MOUNT(mp, mnt, fl) \
162 (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags) 162 (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
163#define XFS_QM_UNMOUNT(mp) \ 163#define XFS_QM_UNMOUNT(mp) \
164 (*(mp)->m_qm_ops->xfs_qmunmount)(mp) 164 (*(mp)->m_qm_ops->xfs_qmunmount)(mp)
165#define XFS_QM_DONE(mp) \ 165#define XFS_QM_DONE(mp) \
@@ -442,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
442/* 442/*
443 * Flags for xfs_mountfs 443 * Flags for xfs_mountfs
444 */ 444 */
445#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */
446#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */
447/* XFS_MFSI_RRINODES */
448#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */
449 /* log recovery */
450#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */
451/* XFS_MFSI_CONVERT_SUNIT */
452#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ 445#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
453 446
454#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) 447#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
@@ -517,10 +510,10 @@ typedef struct xfs_mod_sb {
517 510
518extern void xfs_mod_sb(xfs_trans_t *, __int64_t); 511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
519extern int xfs_log_sbcount(xfs_mount_t *, uint); 512extern int xfs_log_sbcount(xfs_mount_t *, uint);
520extern int xfs_mountfs(xfs_mount_t *mp, int); 513extern int xfs_mountfs(xfs_mount_t *mp);
521extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
522 515
523extern int xfs_unmountfs(xfs_mount_t *); 516extern void xfs_unmountfs(xfs_mount_t *);
524extern int xfs_unmountfs_writesb(xfs_mount_t *); 517extern int xfs_unmountfs_writesb(xfs_mount_t *);
525extern int xfs_unmount_flush(xfs_mount_t *, int); 518extern int xfs_unmount_flush(xfs_mount_t *, int);
526extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index bf87a5913504..e2f68de16159 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
74 */ 74 */
75 75
76/* 76/*
77 * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
78 */
79STATIC int
80xfs_lowbit32(
81 __uint32_t v)
82{
83 if (v)
84 return ffs(v) - 1;
85 return -1;
86}
87
88/*
89 * Allocate space to the bitmap or summary file, and zero it, for growfs. 77 * Allocate space to the bitmap or summary file, and zero it, for growfs.
90 */ 78 */
91STATIC int /* error */ 79STATIC int /* error */
@@ -450,6 +438,7 @@ xfs_rtallocate_extent_near(
450 } 438 }
451 bbno = XFS_BITTOBLOCK(mp, bno); 439 bbno = XFS_BITTOBLOCK(mp, bno);
452 i = 0; 440 i = 0;
441 ASSERT(minlen != 0);
453 log2len = xfs_highbit32(minlen); 442 log2len = xfs_highbit32(minlen);
454 /* 443 /*
455 * Loop over all bitmap blocks (bbno + i is current block). 444 * Loop over all bitmap blocks (bbno + i is current block).
@@ -618,6 +607,8 @@ xfs_rtallocate_extent_size(
618 xfs_suminfo_t sum; /* summary information for extents */ 607 xfs_suminfo_t sum; /* summary information for extents */
619 608
620 ASSERT(minlen % prod == 0 && maxlen % prod == 0); 609 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
610 ASSERT(maxlen != 0);
611
621 /* 612 /*
622 * Loop over all the levels starting with maxlen. 613 * Loop over all the levels starting with maxlen.
623 * At each level, look at all the bitmap blocks, to see if there 614 * At each level, look at all the bitmap blocks, to see if there
@@ -675,6 +666,9 @@ xfs_rtallocate_extent_size(
675 *rtblock = NULLRTBLOCK; 666 *rtblock = NULLRTBLOCK;
676 return 0; 667 return 0;
677 } 668 }
669 ASSERT(minlen != 0);
670 ASSERT(maxlen != 0);
671
678 /* 672 /*
679 * Loop over sizes, from maxlen down to minlen. 673 * Loop over sizes, from maxlen down to minlen.
680 * This time, when we do the allocations, allow smaller ones 674 * This time, when we do the allocations, allow smaller ones
@@ -1961,6 +1955,7 @@ xfs_growfs_rt(
1961 nsbp->sb_blocksize * nsbp->sb_rextsize); 1955 nsbp->sb_blocksize * nsbp->sb_rextsize);
1962 nsbp->sb_rextents = nsbp->sb_rblocks; 1956 nsbp->sb_rextents = nsbp->sb_rblocks;
1963 do_div(nsbp->sb_rextents, nsbp->sb_rextsize); 1957 do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
1958 ASSERT(nsbp->sb_rextents != 0);
1964 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); 1959 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
1965 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; 1960 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
1966 nrsumsize = 1961 nrsumsize =
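Every ASSERT added in this file guards a call to xfs_highbit32(), which
-- like the deleted xfs_lowbit32() helper above -- returns -1 when no
bit is set. A zero minlen/maxlen/sb_rextents would turn the log2
computations negative and corrupt the summary-level arithmetic, so the
asserts document the nonzero precondition. Illustration (a sketch of
the invariant, not a new hunk):

	ASSERT(minlen != 0);			/* 0 would give log2len == -1 */
	log2len = xfs_highbit32(minlen);	/* index of the highest set bit */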
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index b0f31c09a76d..3a82576dde9a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -314,7 +314,7 @@ xfs_bioerror_relse(
314 * ASYNC buffers. 314 * ASYNC buffers.
315 */ 315 */
316 XFS_BUF_ERROR(bp, EIO); 316 XFS_BUF_ERROR(bp, EIO);
317 XFS_BUF_V_IODONESEMA(bp); 317 XFS_BUF_FINISH_IOWAIT(bp);
318 } else { 318 } else {
319 xfs_buf_relse(bp); 319 xfs_buf_relse(bp);
320 } 320 }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index e4ebddd3c500..4e1c22a23be5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -43,6 +43,7 @@
43#include "xfs_quota.h" 43#include "xfs_quota.h"
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h"
46 47
47 48
48STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); 49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
@@ -253,7 +254,7 @@ _xfs_trans_alloc(
253 tp->t_mountp = mp; 254 tp->t_mountp = mp;
254 tp->t_items_free = XFS_LIC_NUM_SLOTS; 255 tp->t_items_free = XFS_LIC_NUM_SLOTS;
255 tp->t_busy_free = XFS_LBC_NUM_SLOTS; 256 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
256 XFS_LIC_INIT(&(tp->t_items)); 257 xfs_lic_init(&(tp->t_items));
257 XFS_LBC_INIT(&(tp->t_busy)); 258 XFS_LBC_INIT(&(tp->t_busy));
258 return tp; 259 return tp;
259} 260}
@@ -282,7 +283,7 @@ xfs_trans_dup(
282 ntp->t_mountp = tp->t_mountp; 283 ntp->t_mountp = tp->t_mountp;
283 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 284 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
284 ntp->t_busy_free = XFS_LBC_NUM_SLOTS; 285 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
285 XFS_LIC_INIT(&(ntp->t_items)); 286 xfs_lic_init(&(ntp->t_items));
286 XFS_LBC_INIT(&(ntp->t_busy)); 287 XFS_LBC_INIT(&(ntp->t_busy));
287 288
288 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 289 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1169,7 +1170,7 @@ xfs_trans_cancel(
1169 while (licp != NULL) { 1170 while (licp != NULL) {
1170 lidp = licp->lic_descs; 1171 lidp = licp->lic_descs;
1171 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1172 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1172 if (XFS_LIC_ISFREE(licp, i)) { 1173 if (xfs_lic_isfree(licp, i)) {
1173 continue; 1174 continue;
1174 } 1175 }
1175 1176
@@ -1216,6 +1217,68 @@ xfs_trans_free(
1216 kmem_zone_free(xfs_trans_zone, tp); 1217 kmem_zone_free(xfs_trans_zone, tp);
1217} 1218}
1218 1219
1220/*
1221 * Roll from one trans in the sequence of PERMANENT transactions to
1222 * the next: permanent transactions are only flushed out when
1223 * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want to let
1224 * chunks of it go to the log as soon as possible. So we commit the
1225 * chunk we've been working on and get a new transaction to continue.
1226 */
1227int
1228xfs_trans_roll(
1229 struct xfs_trans **tpp,
1230 struct xfs_inode *dp)
1231{
1232 struct xfs_trans *trans;
1233 unsigned int logres, count;
1234 int error;
1235
1236 /*
1237 * Ensure that the inode is always logged.
1238 */
1239 trans = *tpp;
1240 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
1241
1242 /*
1243 * Copy the critical parameters from one trans to the next.
1244 */
1245 logres = trans->t_log_res;
1246 count = trans->t_log_count;
1247 *tpp = xfs_trans_dup(trans);
1248
1249 /*
1250 * Commit the current transaction.
1251 * If this commit failed, then it'd just unlock those items that
1252 * are not marked ihold. That also means that a filesystem shutdown
1253 * is in progress. The caller takes the responsibility to cancel
1254 * the duplicate transaction that gets returned.
1255 */
1256 error = xfs_trans_commit(trans, 0);
1257 if (error)
1258 return (error);
1259
1260 trans = *tpp;
1261
1262 /*
1263 * Reserve space in the log for the next transaction.
1264 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log
1266 * that we want to use. This requires that either nothing be locked
1267 * across this call, or that anything that is locked be logged in
1268 * the prior and the next transactions.
1269 */
1270 error = xfs_trans_reserve(trans, 0, logres, 0,
1271 XFS_TRANS_PERM_LOG_RES, count);
1272 /*
1273 * Ensure that the inode is in the new transaction and locked.
1274 */
1275 if (error)
1276 return error;
1277
1278 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
1279 xfs_trans_ihold(trans, dp);
1280 return 0;
1281}
1219 1282
1220/* 1283/*
1221 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). 1284 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
@@ -1253,7 +1316,7 @@ xfs_trans_committed(
1253 * Special case the chunk embedded in the transaction. 1316 * Special case the chunk embedded in the transaction.
1254 */ 1317 */
1255 licp = &(tp->t_items); 1318 licp = &(tp->t_items);
1256 if (!(XFS_LIC_ARE_ALL_FREE(licp))) { 1319 if (!(xfs_lic_are_all_free(licp))) {
1257 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); 1320 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1258 } 1321 }
1259 1322
@@ -1262,7 +1325,7 @@ xfs_trans_committed(
1262 */ 1325 */
1263 licp = licp->lic_next; 1326 licp = licp->lic_next;
1264 while (licp != NULL) { 1327 while (licp != NULL) {
1265 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 1328 ASSERT(!xfs_lic_are_all_free(licp));
1266 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); 1329 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1267 next_licp = licp->lic_next; 1330 next_licp = licp->lic_next;
1268 kmem_free(licp); 1331 kmem_free(licp);
@@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed(
1325 1388
1326 lidp = licp->lic_descs; 1389 lidp = licp->lic_descs;
1327 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1390 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1328 if (XFS_LIC_ISFREE(licp, i)) { 1391 if (xfs_lic_isfree(licp, i)) {
1329 continue; 1392 continue;
1330 } 1393 }
1331 1394
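A hedged usage sketch for the new xfs_trans_roll() helper; the variable
names are illustrative and the caller shown is hypothetical. Note that
the caller owns the duplicated transaction even when the roll fails, so
the error path must still cancel it:

	error = xfs_trans_roll(&tp, ip);
	if (error)
		goto out_cancel;	/* tp is the duplicate; cancel it */
	/*
	 * tp now points at a fresh transaction carrying the same permanent
	 * log reservation, with ip rejoined and held, so a long-running
	 * loop can keep logging the inode while releasing log space.
	 */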
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0804207c7391..74c80bd2b0ec 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk {
210 * lic_unused to the right value (0 matches all free). The 210 * lic_unused to the right value (0 matches all free). The
211 * lic_descs.lid_index values are set up as each desc is allocated. 211 * lic_descs.lid_index values are set up as each desc is allocated.
212 */ 212 */
213#define XFS_LIC_INIT(cp) xfs_lic_init(cp)
214static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) 213static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
215{ 214{
216 cp->lic_free = XFS_LIC_FREEMASK; 215 cp->lic_free = XFS_LIC_FREEMASK;
217} 216}
218 217
219#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot)
220static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) 218static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
221{ 219{
222 cp->lic_descs[slot].lid_index = (unsigned char)(slot); 220 cp->lic_descs[slot].lid_index = (unsigned char)(slot);
223} 221}
224 222
225#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp)
226static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) 223static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
227{ 224{
228 return cp->lic_free & XFS_LIC_FREEMASK; 225 return cp->lic_free & XFS_LIC_FREEMASK;
229} 226}
230 227
231#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp)
232static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) 228static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
233{ 229{
234 cp->lic_free = XFS_LIC_FREEMASK; 230 cp->lic_free = XFS_LIC_FREEMASK;
235} 231}
236 232
237#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp)
238static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) 233static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
239{ 234{
240 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); 235 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
241} 236}
242 237
243#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
244static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) 238static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
245{ 239{
246 return (cp->lic_free & (1 << slot)); 240 return (cp->lic_free & (1 << slot));
247} 241}
248 242
249#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot)
250static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) 243static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
251{ 244{
252 cp->lic_free &= ~(1 << slot); 245 cp->lic_free &= ~(1 << slot);
253} 246}
254 247
255#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot)
256static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) 248static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
257{ 249{
258 cp->lic_free |= 1 << slot; 250 cp->lic_free |= 1 << slot;
259} 251}
260 252
261#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot)
262static inline xfs_log_item_desc_t * 253static inline xfs_log_item_desc_t *
263xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) 254xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
264{ 255{
265 return &(cp->lic_descs[slot]); 256 return &(cp->lic_descs[slot]);
266} 257}
267 258
268#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp)
269static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) 259static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
270{ 260{
271 return (uint)dp->lid_index; 261 return (uint)dp->lid_index;
@@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
278 * All of this yields the address of the chunk, which is 268 * All of this yields the address of the chunk, which is
279 * cast to a chunk pointer. 269 * cast to a chunk pointer.
280 */ 270 */
281#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp)
282static inline xfs_log_item_chunk_t * 271static inline xfs_log_item_chunk_t *
283xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) 272xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
284{ 273{
@@ -986,6 +975,7 @@ int _xfs_trans_commit(xfs_trans_t *,
986 int *); 975 int *);
987#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
988void xfs_trans_cancel(xfs_trans_t *, int); 977void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
989int xfs_trans_ail_init(struct xfs_mount *); 979int xfs_trans_ail_init(struct xfs_mount *);
990void xfs_trans_ail_destroy(struct xfs_mount *); 980void xfs_trans_ail_destroy(struct xfs_mount *);
991void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); 981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
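The change in this header is mechanical: each XFS_LIC_* macro was
already a one-line wrapper around the static inline of the same
lower-case name, so the rest of the series simply converts every call
site and drops the wrappers, gaining type checking at each use.
Representative call-site change (the pattern only, not a specific hunk):

	-	if (XFS_LIC_ISFREE(licp, i))	/* macro wrapper */
	+	if (xfs_lic_isfree(licp, i))	/* direct inline call */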
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index cb0c5839154b..4e855b5ced66 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match(
1021 bp = NULL; 1021 bp = NULL;
1022 len = BBTOB(len); 1022 len = BBTOB(len);
1023 licp = &tp->t_items; 1023 licp = &tp->t_items;
1024 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 1024 if (!xfs_lic_are_all_free(licp)) {
1025 for (i = 0; i < licp->lic_unused; i++) { 1025 for (i = 0; i < licp->lic_unused; i++) {
1026 /* 1026 /*
1027 * Skip unoccupied slots. 1027 * Skip unoccupied slots.
1028 */ 1028 */
1029 if (XFS_LIC_ISFREE(licp, i)) { 1029 if (xfs_lic_isfree(licp, i)) {
1030 continue; 1030 continue;
1031 } 1031 }
1032 1032
1033 lidp = XFS_LIC_SLOT(licp, i); 1033 lidp = xfs_lic_slot(licp, i);
1034 blip = (xfs_buf_log_item_t *)lidp->lid_item; 1034 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1035 if (blip->bli_item.li_type != XFS_LI_BUF) { 1035 if (blip->bli_item.li_type != XFS_LI_BUF) {
1036 continue; 1036 continue;
@@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all(
1074 bp = NULL; 1074 bp = NULL;
1075 len = BBTOB(len); 1075 len = BBTOB(len);
1076 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { 1076 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
1077 if (XFS_LIC_ARE_ALL_FREE(licp)) { 1077 if (xfs_lic_are_all_free(licp)) {
1078 ASSERT(licp == &tp->t_items); 1078 ASSERT(licp == &tp->t_items);
1079 ASSERT(licp->lic_next == NULL); 1079 ASSERT(licp->lic_next == NULL);
1080 return NULL; 1080 return NULL;
@@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all(
1083 /* 1083 /*
1084 * Skip unoccupied slots. 1084 * Skip unoccupied slots.
1085 */ 1085 */
1086 if (XFS_LIC_ISFREE(licp, i)) { 1086 if (xfs_lic_isfree(licp, i)) {
1087 continue; 1087 continue;
1088 } 1088 }
1089 1089
1090 lidp = XFS_LIC_SLOT(licp, i); 1090 lidp = xfs_lic_slot(licp, i);
1091 blip = (xfs_buf_log_item_t *)lidp->lid_item; 1091 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1092 if (blip->bli_item.li_type != XFS_LI_BUF) { 1092 if (blip->bli_item.li_type != XFS_LI_BUF) {
1093 continue; 1093 continue;
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index db5c83595526..3c666e8317f8 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
53 * Initialize the chunk, and then 53 * Initialize the chunk, and then
54 * claim the first slot in the newly allocated chunk. 54 * claim the first slot in the newly allocated chunk.
55 */ 55 */
56 XFS_LIC_INIT(licp); 56 xfs_lic_init(licp);
57 XFS_LIC_CLAIM(licp, 0); 57 xfs_lic_claim(licp, 0);
58 licp->lic_unused = 1; 58 licp->lic_unused = 1;
59 XFS_LIC_INIT_SLOT(licp, 0); 59 xfs_lic_init_slot(licp, 0);
60 lidp = XFS_LIC_SLOT(licp, 0); 60 lidp = xfs_lic_slot(licp, 0);
61 61
62 /* 62 /*
63 * Link in the new chunk and update the free count. 63 * Link in the new chunk and update the free count.
@@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
88 */ 88 */
89 licp = &tp->t_items; 89 licp = &tp->t_items;
90 while (licp != NULL) { 90 while (licp != NULL) {
91 if (XFS_LIC_VACANCY(licp)) { 91 if (xfs_lic_vacancy(licp)) {
92 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { 92 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
93 i = licp->lic_unused; 93 i = licp->lic_unused;
94 ASSERT(XFS_LIC_ISFREE(licp, i)); 94 ASSERT(xfs_lic_isfree(licp, i));
95 break; 95 break;
96 } 96 }
97 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { 97 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
98 if (XFS_LIC_ISFREE(licp, i)) 98 if (xfs_lic_isfree(licp, i))
99 break; 99 break;
100 } 100 }
101 ASSERT(i <= XFS_LIC_MAX_SLOT); 101 ASSERT(i <= XFS_LIC_MAX_SLOT);
@@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
108 * If we find a free descriptor, claim it, 108 * If we find a free descriptor, claim it,
109 * initialize it, and return it. 109 * initialize it, and return it.
110 */ 110 */
111 XFS_LIC_CLAIM(licp, i); 111 xfs_lic_claim(licp, i);
112 if (licp->lic_unused <= i) { 112 if (licp->lic_unused <= i) {
113 licp->lic_unused = i + 1; 113 licp->lic_unused = i + 1;
114 XFS_LIC_INIT_SLOT(licp, i); 114 xfs_lic_init_slot(licp, i);
115 } 115 }
116 lidp = XFS_LIC_SLOT(licp, i); 116 lidp = xfs_lic_slot(licp, i);
117 tp->t_items_free--; 117 tp->t_items_free--;
118 lidp->lid_item = lip; 118 lidp->lid_item = lip;
119 lidp->lid_flags = 0; 119 lidp->lid_flags = 0;
@@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
136 xfs_log_item_chunk_t *licp; 136 xfs_log_item_chunk_t *licp;
137 xfs_log_item_chunk_t **licpp; 137 xfs_log_item_chunk_t **licpp;
138 138
139 slot = XFS_LIC_DESC_TO_SLOT(lidp); 139 slot = xfs_lic_desc_to_slot(lidp);
140 licp = XFS_LIC_DESC_TO_CHUNK(lidp); 140 licp = xfs_lic_desc_to_chunk(lidp);
141 XFS_LIC_RELSE(licp, slot); 141 xfs_lic_relse(licp, slot);
142 lidp->lid_item->li_desc = NULL; 142 lidp->lid_item->li_desc = NULL;
143 tp->t_items_free++; 143 tp->t_items_free++;
144 144
@@ -154,7 +154,7 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
154 * Also decrement the transaction structure's count of free items 154 * Also decrement the transaction structure's count of free items
155 * by the number in a chunk since we are freeing an empty chunk. 155 * by the number in a chunk since we are freeing an empty chunk.
156 */ 156 */
157 if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { 157 if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
158 licpp = &(tp->t_items.lic_next); 158 licpp = &(tp->t_items.lic_next);
159 while (*licpp != licp) { 159 while (*licpp != licp) {
160 ASSERT(*licpp != NULL); 160 ASSERT(*licpp != NULL);
@@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp)
207 /* 207 /*
208 * If it's not in the first chunk, skip to the second. 208 * If it's not in the first chunk, skip to the second.
209 */ 209 */
210 if (XFS_LIC_ARE_ALL_FREE(licp)) { 210 if (xfs_lic_are_all_free(licp)) {
211 licp = licp->lic_next; 211 licp = licp->lic_next;
212 } 212 }
213 213
214 /* 214 /*
215 * Return the first non-free descriptor in the chunk. 215 * Return the first non-free descriptor in the chunk.
216 */ 216 */
217 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 217 ASSERT(!xfs_lic_are_all_free(licp));
218 for (i = 0; i < licp->lic_unused; i++) { 218 for (i = 0; i < licp->lic_unused; i++) {
219 if (XFS_LIC_ISFREE(licp, i)) { 219 if (xfs_lic_isfree(licp, i)) {
220 continue; 220 continue;
221 } 221 }
222 222
223 return XFS_LIC_SLOT(licp, i); 223 return xfs_lic_slot(licp, i);
224 } 224 }
225 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); 225 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
226 return NULL; 226 return NULL;
@@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
242 xfs_log_item_chunk_t *licp; 242 xfs_log_item_chunk_t *licp;
243 int i; 243 int i;
244 244
245 licp = XFS_LIC_DESC_TO_CHUNK(lidp); 245 licp = xfs_lic_desc_to_chunk(lidp);
246 246
247 /* 247 /*
248 * First search the rest of the chunk. The for loop keeps us 248 * First search the rest of the chunk. The for loop keeps us
249 * from referencing things beyond the end of the chunk. 249 * from referencing things beyond the end of the chunk.
250 */ 250 */
251 for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { 251 for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
252 if (XFS_LIC_ISFREE(licp, i)) { 252 if (xfs_lic_isfree(licp, i)) {
253 continue; 253 continue;
254 } 254 }
255 255
256 return XFS_LIC_SLOT(licp, i); 256 return xfs_lic_slot(licp, i);
257 } 257 }
258 258
259 /* 259 /*
@@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
266 } 266 }
267 267
268 licp = licp->lic_next; 268 licp = licp->lic_next;
269 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 269 ASSERT(!xfs_lic_are_all_free(licp));
270 for (i = 0; i < licp->lic_unused; i++) { 270 for (i = 0; i < licp->lic_unused; i++) {
271 if (XFS_LIC_ISFREE(licp, i)) { 271 if (xfs_lic_isfree(licp, i)) {
272 continue; 272 continue;
273 } 273 }
274 274
275 return XFS_LIC_SLOT(licp, i); 275 return xfs_lic_slot(licp, i);
276 } 276 }
277 ASSERT(0); 277 ASSERT(0);
278 /* NOTREACHED */ 278 /* NOTREACHED */
@@ -300,9 +300,9 @@ xfs_trans_free_items(
300 /* 300 /*
301 * Special case the embedded chunk so we don't free it below. 301 * Special case the embedded chunk so we don't free it below.
302 */ 302 */
303 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 303 if (!xfs_lic_are_all_free(licp)) {
304 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 304 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
305 XFS_LIC_ALL_FREE(licp); 305 xfs_lic_all_free(licp);
306 licp->lic_unused = 0; 306 licp->lic_unused = 0;
307 } 307 }
308 licp = licp->lic_next; 308 licp = licp->lic_next;
@@ -311,7 +311,7 @@ xfs_trans_free_items(
311 * Unlock each item in each chunk and free the chunks. 311 * Unlock each item in each chunk and free the chunks.
312 */ 312 */
313 while (licp != NULL) { 313 while (licp != NULL) {
314 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 314 ASSERT(!xfs_lic_are_all_free(licp));
315 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
316 next_licp = licp->lic_next; 316 next_licp = licp->lic_next;
317 kmem_free(licp); 317 kmem_free(licp);
@@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
347 /* 347 /*
348 * Special case the embedded chunk so we don't free. 348 * Special case the embedded chunk so we don't free.
349 */ 349 */
350 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 350 if (!xfs_lic_are_all_free(licp)) {
351 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); 351 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
352 } 352 }
353 licpp = &(tp->t_items.lic_next); 353 licpp = &(tp->t_items.lic_next);
@@ -358,10 +358,10 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
358 * and free empty chunks. 358 * and free empty chunks.
359 */ 359 */
360 while (licp != NULL) { 360 while (licp != NULL) {
361 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 361 ASSERT(!xfs_lic_are_all_free(licp));
362 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); 362 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
363 next_licp = licp->lic_next; 363 next_licp = licp->lic_next;
364 if (XFS_LIC_ARE_ALL_FREE(licp)) { 364 if (xfs_lic_are_all_free(licp)) {
365 *licpp = next_licp; 365 *licpp = next_licp;
366 kmem_free(licp); 366 kmem_free(licp);
367 freed -= XFS_LIC_NUM_SLOTS; 367 freed -= XFS_LIC_NUM_SLOTS;
@@ -402,7 +402,7 @@ xfs_trans_unlock_chunk(
402 freed = 0; 402 freed = 0;
403 lidp = licp->lic_descs; 403 lidp = licp->lic_descs;
404 for (i = 0; i < licp->lic_unused; i++, lidp++) { 404 for (i = 0; i < licp->lic_unused; i++, lidp++) {
405 if (XFS_LIC_ISFREE(licp, i)) { 405 if (xfs_lic_isfree(licp, i)) {
406 continue; 406 continue;
407 } 407 }
408 lip = lidp->lid_item; 408 lip = lidp->lid_item;
@@ -421,7 +421,7 @@ xfs_trans_unlock_chunk(
421 */ 421 */
422 if (!(freeing_chunk) && 422 if (!(freeing_chunk) &&
423 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { 423 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
424 XFS_LIC_RELSE(licp, i); 424 xfs_lic_relse(licp, i);
425 freed++; 425 freed++;
426 } 426 }
427 } 427 }
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 98e5f110ba5f..35d4d414bcc2 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -237,7 +237,7 @@ xfs_droplink(
237 237
238 ASSERT (ip->i_d.di_nlink > 0); 238 ASSERT (ip->i_d.di_nlink > 0);
239 ip->i_d.di_nlink--; 239 ip->i_d.di_nlink--;
240 drop_nlink(ip->i_vnode); 240 drop_nlink(VFS_I(ip));
241 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 241 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
242 242
243 error = 0; 243 error = 0;
@@ -301,7 +301,7 @@ xfs_bumplink(
301 301
302 ASSERT(ip->i_d.di_nlink > 0); 302 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 303 ip->i_d.di_nlink++;
304 inc_nlink(ip->i_vnode); 304 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 307 /*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f316cb85d8e2..ef321225d269 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,9 +18,6 @@
18#ifndef __XFS_UTILS_H__ 18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
22#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
23
24extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); 21extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
25extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 22extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
26 xfs_dev_t, cred_t *, prid_t, int, 23 xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 4a9a43315a86..439dd3939dda 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -128,7 +128,6 @@ xfs_unmount_flush(
128 xfs_inode_t *rip = mp->m_rootip; 128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip; 129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL; 130 xfs_inode_t *rsumip = NULL;
131 bhv_vnode_t *rvp = XFS_ITOV(rip);
132 int error; 131 int error;
133 132
134 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -146,7 +145,7 @@ xfs_unmount_flush(
146 if (error == EFSCORRUPTED) 145 if (error == EFSCORRUPTED)
147 goto fscorrupt_out; 146 goto fscorrupt_out;
148 147
149 ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); 148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
150 149
151 rsumip = mp->m_rsumip; 150 rsumip = mp->m_rsumip;
152 xfs_ilock(rsumip, XFS_ILOCK_EXCL); 151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
@@ -157,7 +156,7 @@ xfs_unmount_flush(
157 if (error == EFSCORRUPTED) 156 if (error == EFSCORRUPTED)
158 goto fscorrupt_out; 157 goto fscorrupt_out;
159 158
160 ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); 159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
161 } 160 }
162 161
163 /* 162 /*
@@ -167,7 +166,7 @@ xfs_unmount_flush(
167 if (error == EFSCORRUPTED) 166 if (error == EFSCORRUPTED)
168 goto fscorrupt_out2; 167 goto fscorrupt_out2;
169 168
170 if (vn_count(rvp) != 1 && !relocation) { 169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
171 xfs_iunlock(rip, XFS_ILOCK_EXCL); 170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
172 return XFS_ERROR(EBUSY); 171 return XFS_ERROR(EBUSY);
173 } 172 }
@@ -284,7 +283,7 @@ xfs_sync_inodes(
284 int *bypassed) 283 int *bypassed)
285{ 284{
286 xfs_inode_t *ip = NULL; 285 xfs_inode_t *ip = NULL;
287 bhv_vnode_t *vp = NULL; 286 struct inode *vp = NULL;
288 int error; 287 int error;
289 int last_error; 288 int last_error;
290 uint64_t fflag; 289 uint64_t fflag;
@@ -404,7 +403,7 @@ xfs_sync_inodes(
404 continue; 403 continue;
405 } 404 }
406 405
407 vp = XFS_ITOV_NULL(ip); 406 vp = VFS_I(ip);
408 407
409 /* 408 /*
410 * If the vnode is gone then this is being torn down, 409 * If the vnode is gone then this is being torn down,
@@ -479,7 +478,7 @@ xfs_sync_inodes(
479 IPOINTER_INSERT(ip, mp); 478 IPOINTER_INSERT(ip, mp);
480 xfs_ilock(ip, lock_flags); 479 xfs_ilock(ip, lock_flags);
481 480
482 ASSERT(vp == XFS_ITOV(ip)); 481 ASSERT(vp == VFS_I(ip));
483 ASSERT(ip->i_mount == mp); 482 ASSERT(ip->i_mount == mp);
484 483
485 vnode_refed = B_TRUE; 484 vnode_refed = B_TRUE;
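The bhv_vnode_t/XFS_ITOV to VFS_I() conversion here and in the next
file is likewise mechanical. As an assumption (the definition lives in
xfs_inode.h, outside this diff), the accessor at this point in the
series is roughly:

	static inline struct inode *VFS_I(struct xfs_inode *ip)
	{
		return ip->i_vnode;	/* the Linux inode backing this xfs_inode */
	}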
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 76a1166af822..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -83,7 +83,7 @@ xfs_setattr(
83 cred_t *credp) 83 cred_t *credp)
84{ 84{
85 xfs_mount_t *mp = ip->i_mount; 85 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = XFS_ITOV(ip); 86 struct inode *inode = VFS_I(ip);
87 int mask = iattr->ia_valid; 87 int mask = iattr->ia_valid;
88 xfs_trans_t *tp; 88 xfs_trans_t *tp;
89 int code; 89 int code;
@@ -182,7 +182,7 @@ xfs_setattr(
182 xfs_ilock(ip, lock_flags); 182 xfs_ilock(ip, lock_flags);
183 183
184 /* boolean: are we the file owner? */ 184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid(credp) == ip->i_d.di_uid); 185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186 186
187 /* 187 /*
188 * Change various properties of a file. 188 * Change various properties of a file.
@@ -513,7 +513,6 @@ xfs_setattr(
513 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 513 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
514 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 514 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
515 ip->i_update_core = 1; 515 ip->i_update_core = 1;
516 timeflags &= ~XFS_ICHGTIME_ACC;
517 } 516 }
518 if (mask & ATTR_MTIME) { 517 if (mask & ATTR_MTIME) {
519 inode->i_mtime = iattr->ia_mtime; 518 inode->i_mtime = iattr->ia_mtime;
@@ -714,7 +713,7 @@ xfs_fsync(
714 return XFS_ERROR(EIO); 713 return XFS_ERROR(EIO);
715 714
716 /* capture size updates in I/O completion before writing the inode. */ 715 /* capture size updates in I/O completion before writing the inode. */
717 error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); 716 error = filemap_fdatawait(VFS_I(ip)->i_mapping);
718 if (error) 717 if (error)
719 return XFS_ERROR(error); 718 return XFS_ERROR(error);
720 719
@@ -1160,7 +1159,6 @@ int
1160xfs_release( 1159xfs_release(
1161 xfs_inode_t *ip) 1160 xfs_inode_t *ip)
1162{ 1161{
1163 bhv_vnode_t *vp = XFS_ITOV(ip);
1164 xfs_mount_t *mp = ip->i_mount; 1162 xfs_mount_t *mp = ip->i_mount;
1165 int error; 1163 int error;
1166 1164
@@ -1195,13 +1193,13 @@ xfs_release(
1195 * be exposed to that problem. 1193 * be exposed to that problem.
1196 */ 1194 */
1197 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1195 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1198 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0) 1196 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1199 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 1197 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1200 } 1198 }
1201 1199
1202 if (ip->i_d.di_nlink != 0) { 1200 if (ip->i_d.di_nlink != 0) {
1203 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 1201 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1204 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || 1202 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
1205 ip->i_delayed_blks > 0)) && 1203 ip->i_delayed_blks > 0)) &&
1206 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 1204 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1207 (!(ip->i_d.di_flags & 1205 (!(ip->i_d.di_flags &
@@ -1227,7 +1225,6 @@ int
1227xfs_inactive( 1225xfs_inactive(
1228 xfs_inode_t *ip) 1226 xfs_inode_t *ip)
1229{ 1227{
1230 bhv_vnode_t *vp = XFS_ITOV(ip);
1231 xfs_bmap_free_t free_list; 1228 xfs_bmap_free_t free_list;
1232 xfs_fsblock_t first_block; 1229 xfs_fsblock_t first_block;
1233 int committed; 1230 int committed;
@@ -1242,7 +1239,7 @@ xfs_inactive(
1242 * If the inode is already free, then there can be nothing 1239 * If the inode is already free, then there can be nothing
1243 * to clean up here. 1240 * to clean up here.
1244 */ 1241 */
1245 if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { 1242 if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
1246 ASSERT(ip->i_df.if_real_bytes == 0); 1243 ASSERT(ip->i_df.if_real_bytes == 0);
1247 ASSERT(ip->i_df.if_broot_bytes == 0); 1244 ASSERT(ip->i_df.if_broot_bytes == 0);
1248 return VN_INACTIVE_CACHE; 1245 return VN_INACTIVE_CACHE;
@@ -1272,7 +1269,7 @@ xfs_inactive(
1272 1269
1273 if (ip->i_d.di_nlink != 0) { 1270 if (ip->i_d.di_nlink != 0) {
1274 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 1271 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1275 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || 1272 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
1276 ip->i_delayed_blks > 0)) && 1273 ip->i_delayed_blks > 0)) &&
1277 (ip->i_df.if_flags & XFS_IFEXTENTS) && 1274 (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1278 (!(ip->i_d.di_flags & 1275 (!(ip->i_d.di_flags &
@@ -1536,7 +1533,7 @@ xfs_create(
1536 * Make sure that we have allocated dquot(s) on disk. 1533 * Make sure that we have allocated dquot(s) on disk.
1537 */ 1534 */
1538 error = XFS_QM_DQVOPALLOC(mp, dp, 1535 error = XFS_QM_DQVOPALLOC(mp, dp,
1539 current_fsuid(credp), current_fsgid(credp), prid, 1536 current_fsuid(), current_fsgid(), prid,
1540 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); 1537 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1541 if (error) 1538 if (error)
1542 goto std_return; 1539 goto std_return;
@@ -1708,111 +1705,6 @@ std_return:
1708} 1705}
1709 1706
1710#ifdef DEBUG 1707#ifdef DEBUG
1711/*
1712 * Some counters to see if (and how often) we are hitting some deadlock
1713 * prevention code paths.
1714 */
1715
1716int xfs_rm_locks;
1717int xfs_rm_lock_delays;
1718int xfs_rm_attempts;
1719#endif
1720
1721/*
1722 * The following routine will lock the inodes associated with the
1723 * directory and the named entry in the directory. The locks are
1724 * acquired in increasing inode number.
1725 *
1726 * If the entry is "..", then only the directory is locked. The
1727 * vnode ref count will still include that from the .. entry in
1728 * this case.
1729 *
1730 * There is a deadlock we need to worry about. If the locked directory is
1731 * in the AIL, it might be blocking up the log. The next inode we lock
1732 * could already be locked by another thread waiting for log space (e.g.
1733 * a permanent log reservation with a long running transaction (see
1734 * xfs_itruncate_finish)). To solve this, we must check if the directory
1735 * is in the AIL and use lock_nowait. If we can't lock, we need to
1736 * drop the inode lock on the directory and try again. xfs_iunlock will
1737 * potentially push the tail if we were holding up the log.
1738 */
1739STATIC int
1740xfs_lock_dir_and_entry(
1741 xfs_inode_t *dp,
1742 xfs_inode_t *ip) /* inode of entry 'name' */
1743{
1744 int attempts;
1745 xfs_ino_t e_inum;
1746 xfs_inode_t *ips[2];
1747 xfs_log_item_t *lp;
1748
1749#ifdef DEBUG
1750 xfs_rm_locks++;
1751#endif
1752 attempts = 0;
1753
1754again:
1755 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1756
1757 e_inum = ip->i_ino;
1758
1759 xfs_itrace_ref(ip);
1760
1761 /*
1762 * We want to lock in increasing inum. Since we've already
1763 * acquired the lock on the directory, we may need to release
1764 * it if the inum of the entry turns out to be less.
1765 */
1766 if (e_inum > dp->i_ino) {
1767 /*
1768 * We are already in the right order, so just
1769 * lock on the inode of the entry.
1770 * We need to use nowait if dp is in the AIL.
1771 */
1772
1773 lp = (xfs_log_item_t *)dp->i_itemp;
1774 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1775 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1776 attempts++;
1777#ifdef DEBUG
1778 xfs_rm_attempts++;
1779#endif
1780
1781 /*
1782 * Unlock dp and try again.
1783 * xfs_iunlock will try to push the tail
1784 * if the inode is in the AIL.
1785 */
1786
1787 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1788
1789 if ((attempts % 5) == 0) {
1790 delay(1); /* Don't just spin the CPU */
1791#ifdef DEBUG
1792 xfs_rm_lock_delays++;
1793#endif
1794 }
1795 goto again;
1796 }
1797 } else {
1798 xfs_ilock(ip, XFS_ILOCK_EXCL);
1799 }
1800 } else if (e_inum < dp->i_ino) {
1801 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1802
1803 ips[0] = ip;
1804 ips[1] = dp;
1805 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
1806 }
1807 /* else e_inum == dp->i_ino */
1808 /* This can happen if we're asked to lock /x/..
1809 * the entry is "..", which is also the parent directory.
1810 */
1811
1812 return 0;
1813}
1814
1815#ifdef DEBUG
1816int xfs_locked_n; 1708int xfs_locked_n;
1817int xfs_small_retries; 1709int xfs_small_retries;
1818int xfs_middle_retries; 1710int xfs_middle_retries;
@@ -1946,6 +1838,53 @@ again:
1946#endif 1838#endif
1947} 1839}
1948 1840
1841/*
1842 * xfs_lock_two_inodes() can only be used to lock one type of lock
1843 * at a time - the iolock or the ilock, but not both at once. If
1844 * we lock both at once, lockdep will report false positives saying
1845 * we have violated locking orders.
1846 */
1847void
1848xfs_lock_two_inodes(
1849 xfs_inode_t *ip0,
1850 xfs_inode_t *ip1,
1851 uint lock_mode)
1852{
1853 xfs_inode_t *temp;
1854 int attempts = 0;
1855 xfs_log_item_t *lp;
1856
1857 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1858 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1859 ASSERT(ip0->i_ino != ip1->i_ino);
1860
1861 if (ip0->i_ino > ip1->i_ino) {
1862 temp = ip0;
1863 ip0 = ip1;
1864 ip1 = temp;
1865 }
1866
1867 again:
1868 xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1869
1870 /*
1871 * If the first lock we have locked is in the AIL, we must TRY to get
1872 * the second lock. If we can't get it, we must release the first one
1873 * and try again.
1874 */
1875 lp = (xfs_log_item_t *)ip0->i_itemp;
1876 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1877 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1878 xfs_iunlock(ip0, lock_mode);
1879 if ((++attempts % 5) == 0)
1880 delay(1); /* Don't just spin the CPU */
1881 goto again;
1882 }
1883 } else {
1884 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1885 }
1886}
1887
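Callers no longer order the pair themselves: the helper sorts by inode
number internally and performs the AIL try-lock/backoff dance that
xfs_lock_dir_and_entry() used to open-code. Typical call, as in the
xfs_remove() and xfs_link() hunks below:

	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);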
1949int 1888int
1950xfs_remove( 1889xfs_remove(
1951 xfs_inode_t *dp, 1890 xfs_inode_t *dp,
@@ -2018,9 +1957,7 @@ xfs_remove(
2018 goto out_trans_cancel; 1957 goto out_trans_cancel;
2019 } 1958 }
2020 1959
2021 error = xfs_lock_dir_and_entry(dp, ip); 1960 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2022 if (error)
2023 goto out_trans_cancel;
2024 1961
2025 /* 1962 /*
2026 * At this point, we've gotten both the directory and the entry 1963 * At this point, we've gotten both the directory and the entry
@@ -2047,9 +1984,6 @@ xfs_remove(
2047 } 1984 }
2048 } 1985 }
2049 1986
2050 /*
2051 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2052 */
2053 XFS_BMAP_INIT(&free_list, &first_block); 1987 XFS_BMAP_INIT(&free_list, &first_block);
2054 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 1988 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2055 &first_block, &free_list, resblks); 1989 &first_block, &free_list, resblks);
@@ -2155,7 +2089,6 @@ xfs_link(
2155{ 2089{
2156 xfs_mount_t *mp = tdp->i_mount; 2090 xfs_mount_t *mp = tdp->i_mount;
2157 xfs_trans_t *tp; 2091 xfs_trans_t *tp;
2158 xfs_inode_t *ips[2];
2159 int error; 2092 int error;
2160 xfs_bmap_free_t free_list; 2093 xfs_bmap_free_t free_list;
2161 xfs_fsblock_t first_block; 2094 xfs_fsblock_t first_block;
@@ -2203,15 +2136,7 @@ xfs_link(
2203 goto error_return; 2136 goto error_return;
2204 } 2137 }
2205 2138
2206 if (sip->i_ino < tdp->i_ino) { 2139 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
2207 ips[0] = sip;
2208 ips[1] = tdp;
2209 } else {
2210 ips[0] = tdp;
2211 ips[1] = sip;
2212 }
2213
2214 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
2215 2140
2216 /* 2141 /*
2217 * Increment vnode ref counts since xfs_trans_commit & 2142 * Increment vnode ref counts since xfs_trans_commit &
@@ -2352,7 +2277,7 @@ xfs_mkdir(
2352 * Make sure that we have allocated dquot(s) on disk. 2277 * Make sure that we have allocated dquot(s) on disk.
2353 */ 2278 */
2354 error = XFS_QM_DQVOPALLOC(mp, dp, 2279 error = XFS_QM_DQVOPALLOC(mp, dp,
2355 current_fsuid(credp), current_fsgid(credp), prid, 2280 current_fsuid(), current_fsgid(), prid,
2356 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 2281 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2357 if (error) 2282 if (error)
2358 goto std_return; 2283 goto std_return;
@@ -2578,7 +2503,7 @@ xfs_symlink(
2578 * Make sure that we have allocated dquot(s) on disk. 2503 * Make sure that we have allocated dquot(s) on disk.
2579 */ 2504 */
2580 error = XFS_QM_DQVOPALLOC(mp, dp, 2505 error = XFS_QM_DQVOPALLOC(mp, dp,
2581 current_fsuid(credp), current_fsgid(credp), prid, 2506 current_fsuid(), current_fsgid(), prid,
2582 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 2507 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2583 if (error) 2508 if (error)
2584 goto std_return; 2509 goto std_return;
@@ -2873,14 +2798,13 @@ int
2873xfs_reclaim( 2798xfs_reclaim(
2874 xfs_inode_t *ip) 2799 xfs_inode_t *ip)
2875{ 2800{
2876 bhv_vnode_t *vp = XFS_ITOV(ip);
2877 2801
2878 xfs_itrace_entry(ip); 2802 xfs_itrace_entry(ip);
2879 2803
2880 ASSERT(!VN_MAPPED(vp)); 2804 ASSERT(!VN_MAPPED(VFS_I(ip)));
2881 2805
2882 /* bad inode, get out here ASAP */ 2806 /* bad inode, get out here ASAP */
2883 if (VN_BAD(vp)) { 2807 if (VN_BAD(VFS_I(ip))) {
2884 xfs_ireclaim(ip); 2808 xfs_ireclaim(ip);
2885 return 0; 2809 return 0;
2886 } 2810 }
@@ -2917,7 +2841,7 @@ xfs_reclaim(
2917 XFS_MOUNT_ILOCK(mp); 2841 XFS_MOUNT_ILOCK(mp);
2918 spin_lock(&ip->i_flags_lock); 2842 spin_lock(&ip->i_flags_lock);
2919 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2920 vn_to_inode(vp)->i_private = NULL; 2844 VFS_I(ip)->i_private = NULL;
2921 ip->i_vnode = NULL; 2845 ip->i_vnode = NULL;
2922 spin_unlock(&ip->i_flags_lock); 2846 spin_unlock(&ip->i_flags_lock);
2923 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
@@ -2933,7 +2857,7 @@ xfs_finish_reclaim(
2933 int sync_mode) 2857 int sync_mode)
2934{ 2858{
2935 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); 2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2936 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 2860 struct inode *vp = VFS_I(ip);
2937 2861
2938 if (vp && VN_BAD(vp)) 2862 if (vp && VN_BAD(vp))
2939 goto reclaim; 2863 goto reclaim;
@@ -3236,6 +3160,13 @@ error1: /* Just cancel transaction */
3236/* 3160/*
3237 * Zero file bytes between startoff and endoff inclusive. 3161 * Zero file bytes between startoff and endoff inclusive.
3238 * The iolock is held exclusive and no blocks are buffered. 3162 * The iolock is held exclusive and no blocks are buffered.
3163 *
3164 * This function is used by xfs_free_file_space() to zero
3165 * partial blocks when the range to free is not block aligned.
3166 * When unreserving space with boundaries that are not block
3167 * aligned we round up the start and round down the end
3168 * boundaries and then use this function to zero the parts of
3169 * the blocks that got dropped during the rounding.
3239 */ 3170 */
3240STATIC int 3171STATIC int
3241xfs_zero_remaining_bytes( 3172xfs_zero_remaining_bytes(
@@ -3252,6 +3183,17 @@ xfs_zero_remaining_bytes(
3252 int nimap; 3183 int nimap;
3253 int error = 0; 3184 int error = 0;
3254 3185
3186 /*
3187 * Avoid doing I/O beyond eof - it's not necessary
3188 * since nothing can read beyond eof. The space will
3189 * be zeroed when the file is extended anyway.
3190 */
3191 if (startoff >= ip->i_size)
3192 return 0;
3193
3194 if (endoff > ip->i_size)
3195 endoff = ip->i_size;
3196
3255 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3256 XFS_IS_REALTIME_INODE(ip) ? 3198 XFS_IS_REALTIME_INODE(ip) ?
3257 mp->m_rtdev_targp : mp->m_ddev_targp); 3199 mp->m_rtdev_targp : mp->m_ddev_targp);
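A hedged worked example of the rounding the new comment describes,
assuming a 4096-byte block size and illustrative values: freeing bytes
[1000, 9000) rounds the start up to 4096 and the end down to 8192, so
only blocks [4096, 8192) are punched out; this function then zeroes the
dropped edges [1000, 4095] and [8192, 8999]. The eof clamp added above
skips the work entirely when the range lies past i_size, since nothing
can read beyond eof and extending the file re-zeroes anyway.

	xfs_off_t	start = 1000, end = 9000;	/* requested range */
	xfs_off_t	bsize = 4096;			/* assumed block size */
	xfs_off_t	astart = ((start + bsize - 1) / bsize) * bsize;	/* 4096 */
	xfs_off_t	aend = (end / bsize) * bsize;			/* 8192 */
	/* punch [astart, aend); zero [start, astart) and [aend, end) */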
@@ -3321,7 +3263,6 @@ xfs_free_file_space(
3321 xfs_off_t len, 3263 xfs_off_t len,
3322 int attr_flags) 3264 int attr_flags)
3323{ 3265{
3324 bhv_vnode_t *vp;
3325 int committed; 3266 int committed;
3326 int done; 3267 int done;
3327 xfs_off_t end_dmi_offset; 3268 xfs_off_t end_dmi_offset;
@@ -3341,7 +3282,6 @@ xfs_free_file_space(
3341 xfs_trans_t *tp; 3282 xfs_trans_t *tp;
3342 int need_iolock = 1; 3283 int need_iolock = 1;
3343 3284
3344 vp = XFS_ITOV(ip);
3345 mp = ip->i_mount; 3285 mp = ip->i_mount;
3346 3286
3347 xfs_itrace_entry(ip); 3287 xfs_itrace_entry(ip);
@@ -3378,7 +3318,7 @@ xfs_free_file_space(
3378 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3379 ioffset = offset & ~(rounding - 1); 3319 ioffset = offset & ~(rounding - 1);
3380 3320
3381 if (VN_CACHED(vp) != 0) { 3321 if (VN_CACHED(VFS_I(ip)) != 0) {
3382 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); 3322 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3383 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 3323 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3384 if (error) 3324 if (error)