aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_inode.c3
-rw-r--r--fs/Kconfig167
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/Makefile8
-rw-r--r--fs/adfs/dir.c1
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/dir.c1
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/internal.h8
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c131
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/Makefile2
-rw-r--r--fs/autofs4/autofs_i.h44
-rw-r--r--fs/autofs4/dev-ioctl.c863
-rw-r--r--fs/autofs4/expire.c18
-rw-r--r--fs/autofs4/init.c11
-rw-r--r--fs/autofs4/inode.c10
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/autofs4/waitq.c42
-rw-r--r--fs/befs/befs_fs_types.h4
-rw-r--r--fs/befs/linuxvfs.c7
-rw-r--r--fs/befs/super.c6
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/binfmt_elf.c25
-rw-r--r--fs/binfmt_elf_fdpic.c85
-rw-r--r--fs/binfmt_em86.c2
-rw-r--r--fs/binfmt_flat.c6
-rw-r--r--fs/binfmt_misc.c4
-rw-r--r--fs/binfmt_script.c5
-rw-r--r--fs/binfmt_som.c2
-rw-r--r--fs/bio-integrity.c29
-rw-r--r--fs/bio.c307
-rw-r--r--fs/block_dev.c180
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/cifs/CHANGES10
-rw-r--r--fs/cifs/README44
-rw-r--r--fs/cifs/asn1.c11
-rw-r--r--fs/cifs/cifs_spnego.c39
-rw-r--r--fs/cifs/cifs_spnego.h2
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h4
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c84
-rw-r--r--fs/cifs/connect.c33
-rw-r--r--fs/cifs/dns_resolve.c77
-rw-r--r--fs/cifs/file.c134
-rw-r--r--fs/cifs/inode.c647
-rw-r--r--fs/cifs/misc.c8
-rw-r--r--fs/cifs/readdir.c128
-rw-r--r--fs/cifs/sess.c17
-rw-r--r--fs/cifs/transport.c3
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/compat.c49
-rw-r--r--fs/configfs/dir.c17
-rw-r--r--fs/dcache.c12
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c68
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/dlm/config.c77
-rw-r--r--fs/dlm/dlm_internal.h7
-rw-r--r--fs/dlm/lockspace.c158
-rw-r--r--fs/dlm/lockspace.h1
-rw-r--r--fs/dlm/user.c124
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/dquot.c8
-rw-r--r--fs/ecryptfs/Makefile2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h28
-rw-r--r--fs/ecryptfs/file.c17
-rw-r--r--fs/ecryptfs/keystore.c32
-rw-r--r--fs/ecryptfs/main.c21
-rw-r--r--fs/ecryptfs/messaging.c118
-rw-r--r--fs/ecryptfs/mmap.c81
-rw-r--r--fs/ecryptfs/netlink.c249
-rw-r--r--fs/efs/namei.c3
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/eventpoll.c9
-rw-r--r--fs/exec.c17
-rw-r--r--fs/ext2/balloc.c3
-rw-r--r--fs/ext2/dir.c60
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext3/file.c1
-rw-r--r--fs/ext3/inode.c8
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/acl.h12
-rw-r--r--fs/ext4/balloc.c1470
-rw-r--r--fs/ext4/bitmap.c6
-rw-r--r--fs/ext4/dir.c84
-rw-r--r--fs/ext4/ext4.h138
-rw-r--r--fs/ext4/ext4_extents.h19
-rw-r--r--fs/ext4/ext4_i.h39
-rw-r--r--fs/ext4/ext4_jbd2.h8
-rw-r--r--fs/ext4/ext4_sb.h28
-rw-r--r--fs/ext4/extents.c392
-rw-r--r--fs/ext4/file.c10
-rw-r--r--fs/ext4/fsync.c7
-rw-r--r--fs/ext4/hash.c8
-rw-r--r--fs/ext4/ialloc.c73
-rw-r--r--fs/ext4/inode.c1139
-rw-r--r--fs/ext4/ioctl.c96
-rw-r--r--fs/ext4/mballoc.c536
-rw-r--r--fs/ext4/mballoc.h32
-rw-r--r--fs/ext4/migrate.c13
-rw-r--r--fs/ext4/namei.c402
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c438
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/ext4/xattr.c14
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/fat/fatent.c14
-rw-r--r--fs/fat/inode.c6
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/glock.c15
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c159
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking/dlm/mount.c3
-rw-r--r--fs/gfs2/log.c21
-rw-r--r--fs/gfs2/mount.c9
-rw-r--r--fs/gfs2/ops_address.c18
-rw-r--r--fs/gfs2/ops_file.c16
-rw-r--r--fs/gfs2/ops_fstype.c578
-rw-r--r--fs/gfs2/ops_inode.c127
-rw-r--r--fs/gfs2/ops_super.c108
-rw-r--r--fs/gfs2/super.c340
-rw-r--r--fs/gfs2/super.h6
-rw-r--r--fs/gfs2/sys.c11
-rw-r--r--fs/hfs/catalog.c4
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/bitmap.c12
-rw-r--r--fs/hfsplus/catalog.c5
-rw-r--r--fs/hfsplus/options.c2
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inotify_user.c27
-rw-r--r--fs/ioctl.c277
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jbd2/checkpoint.c71
-rw-r--r--fs/jbd2/commit.c35
-rw-r--r--fs/jbd2/journal.c103
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c1
-rw-r--r--fs/jffs2/jffs2_fs_i.h1
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/lockd/Makefile2
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/grace.c59
-rw-r--r--fs/lockd/host.c350
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c88
-rw-r--r--fs/lockd/svc4proc.c31
-rw-r--r--fs/lockd/svclock.c18
-rw-r--r--fs/lockd/svcproc.c31
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/lockd/xdr4.c2
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--fs/nfs/client.c5
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/file.c18
-rw-r--r--fs/nfs/inode.c183
-rw-r--r--fs/nfs/internal.h25
-rw-r--r--fs/nfs/mount_clnt.c3
-rw-r--r--fs/nfs/namespace.c7
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c20
-rw-r--r--fs/nfs/nfs4namespace.c105
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/proc.c10
-rw-r--r--fs/nfs/super.c138
-rw-r--r--fs/nfs/unlink.c5
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs3proc.c8
-rw-r--r--fs/nfsd/nfs4acl.c2
-rw-r--r--fs/nfsd/nfs4callback.c7
-rw-r--r--fs/nfsd/nfs4proc.c20
-rw-r--r--fs/nfsd/nfs4state.c34
-rw-r--r--fs/nfsd/nfs4xdr.c171
-rw-r--r--fs/nfsd/nfsctl.c5
-rw-r--r--fs/nfsd/nfsfh.c30
-rw-r--r--fs/nfsd/nfsproc.c6
-rw-r--r--fs/nfsd/nfssvc.c20
-rw-r--r--fs/nfsd/vfs.c63
-rw-r--r--fs/nls/nls_base.c21
-rw-r--r--fs/ntfs/namei.c89
-rw-r--r--fs/ntfs/usnjrnl.h4
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c922
-rw-r--r--fs/ocfs2/alloc.h95
-rw-r--r--fs/ocfs2/aops.c62
-rw-r--r--fs/ocfs2/buffer_head_io.c134
-rw-r--r--fs/ocfs2/buffer_head_io.h23
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/netdebug.c26
-rw-r--r--fs/ocfs2/cluster/tcp.c44
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h32
-rw-r--r--fs/ocfs2/dir.c120
-rw-r--r--fs/ocfs2/dlmglue.c9
-rw-r--r--fs/ocfs2/extent_map.c386
-rw-r--r--fs/ocfs2/extent_map.h7
-rw-r--r--fs/ocfs2/file.c334
-rw-r--r--fs/ocfs2/file.h32
-rw-r--r--fs/ocfs2/inode.c87
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/ioctl.c3
-rw-r--r--fs/ocfs2/journal.c112
-rw-r--r--fs/ocfs2/journal.h52
-rw-r--r--fs/ocfs2/localalloc.c384
-rw-r--r--fs/ocfs2/localalloc.h4
-rw-r--r--fs/ocfs2/locks.c15
-rw-r--r--fs/ocfs2/locks.h1
-rw-r--r--fs/ocfs2/namei.c101
-rw-r--r--fs/ocfs2/ocfs2.h56
-rw-r--r--fs/ocfs2/ocfs2_fs.h220
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/resize.c11
-rw-r--r--fs/ocfs2/slot_map.c7
-rw-r--r--fs/ocfs2/stack_user.c33
-rw-r--r--fs/ocfs2/stackglue.c27
-rw-r--r--fs/ocfs2/stackglue.h19
-rw-r--r--fs/ocfs2/suballoc.c248
-rw-r--r--fs/ocfs2/suballoc.h26
-rw-r--r--fs/ocfs2/super.c64
-rw-r--r--fs/ocfs2/symlink.c18
-rw-r--r--fs/ocfs2/uptodate.c38
-rw-r--r--fs/ocfs2/uptodate.h3
-rw-r--r--fs/ocfs2/xattr.c4832
-rw-r--r--fs/ocfs2/xattr.h68
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/open.c3
-rw-r--r--fs/partitions/acorn.c10
-rw-r--r--fs/partitions/check.c297
-rw-r--r--fs/partitions/check.h4
-rw-r--r--fs/proc/Kconfig10
-rw-r--r--fs/proc/array.c74
-rw-r--r--fs/proc/base.c21
-rw-r--r--fs/proc/generic.c5
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/proc_misc.c38
-rw-r--r--fs/proc/proc_sysctl.c6
-rw-r--r--fs/proc/task_mmu.c16
-rw-r--r--fs/proc/task_nommu.c5
-rw-r--r--fs/proc/vmcore.c6
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/reiserfs/procfs.c3
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/seq_file.c11
-rw-r--r--fs/splice.c3
-rw-r--r--fs/sysfs/bin.c42
-rw-r--r--fs/sysfs/dir.c24
-rw-r--r--fs/sysfs/file.c46
-rw-r--r--fs/sysfs/mount.c15
-rw-r--r--fs/sysfs/sysfs.h6
-rw-r--r--fs/ubifs/budget.c114
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/dir.c3
-rw-r--r--fs/ubifs/file.c20
-rw-r--r--fs/ubifs/find.c19
-rw-r--r--fs/ubifs/gc.c20
-rw-r--r--fs/ubifs/misc.h49
-rw-r--r--fs/ubifs/super.c27
-rw-r--r--fs/ubifs/tnc.c116
-rw-r--r--fs/ubifs/ubifs-media.h2
-rw-r--r--fs/ubifs/ubifs.h14
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/ialloc.c44
-rw-r--r--fs/udf/super.c2
-rw-r--r--fs/ufs/super.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c22
-rw-r--r--fs/xfs/xfs_buf_item.c44
-rw-r--r--fs/xfs/xfs_dfrag.c9
-rw-r--r--fs/xfs/xfs_dmapi.h1
-rw-r--r--fs/xfs/xfs_inode.c94
-rw-r--r--fs/xfs/xfs_log.c67
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_vnodeops.c26
298 files changed, 15600 insertions, 7690 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 047c791427aa..c061c3f18e7c 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -55,7 +55,7 @@ enum {
55 Opt_err 55 Opt_err
56}; 56};
57 57
58static match_table_t tokens = { 58static const match_table_t tokens = {
59 {Opt_debug, "debug=%x"}, 59 {Opt_debug, "debug=%x"},
60 {Opt_dfltuid, "dfltuid=%u"}, 60 {Opt_dfltuid, "dfltuid=%u"},
61 {Opt_dfltgid, "dfltgid=%u"}, 61 {Opt_dfltgid, "dfltgid=%u"},
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea9..e298fe194093 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
119 119
120const struct file_operations v9fs_dir_operations = { 120const struct file_operations v9fs_dir_operations = {
121 .read = generic_read_dir, 121 .read = generic_read_dir,
122 .llseek = generic_file_llseek,
122 .readdir = v9fs_dir_readdir, 123 .readdir = v9fs_dir_readdir,
123 .open = v9fs_file_open, 124 .open = v9fs_file_open,
124 .release = v9fs_dir_release, 125 .release = v9fs_dir_release,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..e83aa5ebe861 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
626 return NULL; 626 return NULL;
627 627
628error: 628error:
629 if (fid) 629 p9_client_clunk(fid);
630 p9_client_clunk(fid);
631 630
632 return ERR_PTR(result); 631 return ERR_PTR(result);
633} 632}
diff --git a/fs/Kconfig b/fs/Kconfig
index d3873583360b..d0a1174fb516 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
136 If you are not using a security module that requires using 136 If you are not using a security module that requires using
137 extended attributes for file security labels, say N. 137 extended attributes for file security labels, say N.
138 138
139config EXT4DEV_FS 139config EXT4_FS
140 tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" 140 tristate "The Extended 4 (ext4) filesystem"
141 depends on EXPERIMENTAL
142 select JBD2 141 select JBD2
143 select CRC16 142 select CRC16
144 help 143 help
145 Ext4dev is a predecessor filesystem of the next generation 144 This is the next generation of the ext3 filesystem.
146 extended fs ext4, based on ext3 filesystem code. It will be
147 renamed ext4 fs later, once ext4dev is mature and stabilized.
148 145
149 Unlike the change from ext2 filesystem to ext3 filesystem, 146 Unlike the change from ext2 filesystem to ext3 filesystem,
150 the on-disk format of ext4dev is not the same as ext3 any more: 147 the on-disk format of ext4 is not forwards compatible with
151 it is based on extent maps and it supports 48-bit physical block 148 ext3; it is based on extent maps and it supports 48-bit
152 numbers. These combined on-disk format changes will allow 149 physical block numbers. The ext4 filesystem also supports delayed
153 ext4dev/ext4 to handle more than 16 TB filesystem volumes -- 150 allocation, persistent preallocation, high resolution time stamps,
154 a hard limit that ext3 cannot overcome without changing the 151 and a number of other features to improve performance and speed
155 on-disk format. 152 up fsck time. For more information, please see the web pages at
156 153 http://ext4.wiki.kernel.org.
157 Other than extent maps and 48-bit block numbers, ext4dev also is 154
158 likely to have other new features such as persistent preallocation, 155 The ext4 filesystem will support mounting an ext3
159 high resolution time stamps, and larger file support etc. These 156 filesystem; while there will be some performance gains from
160 features will be added to ext4dev gradually. 157 the delayed allocation and inode table readahead, the best
158 performance gains will require enabling ext4 features in the
159 filesystem, or formatting a new filesystem as an ext4
160 filesystem initially.
161 161
162 To compile this file system support as a module, choose M here. The 162 To compile this file system support as a module, choose M here. The
163 module will be called ext4dev. 163 module will be called ext4.
164 164
165 If unsure, say N. 165 If unsure, say N.
166 166
167config EXT4DEV_FS_XATTR 167config EXT4DEV_COMPAT
168 bool "Ext4dev extended attributes" 168 bool "Enable ext4dev compatibility"
169 depends on EXT4DEV_FS 169 depends on EXT4_FS
170 help
171 Starting with 2.6.28, the name of the ext4 filesystem was
172 renamed from ext4dev to ext4. Unfortunately there are some
173 legacy userspace programs (such as klibc's fstype) that have
174 "ext4dev" hardcoded.
175
176 To enable backwards compatibility so that systems that are
177 still expecting to mount ext4 filesystems using ext4dev,
178 choose Y here. This feature will go away by 2.6.31, so
179 please arrange to get your userspace programs fixed!
180
181config EXT4_FS_XATTR
182 bool "Ext4 extended attributes"
183 depends on EXT4_FS
170 default y 184 default y
171 help 185 help
172 Extended attributes are name:value pairs associated with inodes by 186 Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
175 189
176 If unsure, say N. 190 If unsure, say N.
177 191
178 You need this for POSIX ACL support on ext4dev/ext4. 192 You need this for POSIX ACL support on ext4.
179 193
180config EXT4DEV_FS_POSIX_ACL 194config EXT4_FS_POSIX_ACL
181 bool "Ext4dev POSIX Access Control Lists" 195 bool "Ext4 POSIX Access Control Lists"
182 depends on EXT4DEV_FS_XATTR 196 depends on EXT4_FS_XATTR
183 select FS_POSIX_ACL 197 select FS_POSIX_ACL
184 help 198 help
185 POSIX Access Control Lists (ACLs) support permissions for users and 199 POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
190 204
191 If you don't know what Access Control Lists are, say N 205 If you don't know what Access Control Lists are, say N
192 206
193config EXT4DEV_FS_SECURITY 207config EXT4_FS_SECURITY
194 bool "Ext4dev Security Labels" 208 bool "Ext4 Security Labels"
195 depends on EXT4DEV_FS_XATTR 209 depends on EXT4_FS_XATTR
196 help 210 help
197 Security labels support alternative access control models 211 Security labels support alternative access control models
198 implemented by security modules like SELinux. This option 212 implemented by security modules like SELinux. This option
199 enables an extended attribute handler for file security 213 enables an extended attribute handler for file security
200 labels in the ext4dev/ext4 filesystem. 214 labels in the ext4 filesystem.
201 215
202 If you are not using a security module that requires using 216 If you are not using a security module that requires using
203 extended attributes for file security labels, say N. 217 extended attributes for file security labels, say N.
@@ -206,17 +220,16 @@ config JBD
206 tristate 220 tristate
207 help 221 help
208 This is a generic journalling layer for block devices. It is 222 This is a generic journalling layer for block devices. It is
209 currently used by the ext3 and OCFS2 file systems, but it could 223 currently used by the ext3 file system, but it could also be
210 also be used to add journal support to other file systems or block 224 used to add journal support to other file systems or block
211 devices such as RAID or LVM. 225 devices such as RAID or LVM.
212 226
213 If you are using the ext3 or OCFS2 file systems, you need to 227 If you are using the ext3 file system, you need to say Y here.
214 say Y here. If you are not using ext3 OCFS2 then you will probably 228 If you are not using ext3 then you will probably want to say N.
215 want to say N.
216 229
217 To compile this device as a module, choose M here: the module will be 230 To compile this device as a module, choose M here: the module will be
218 called jbd. If you are compiling ext3 or OCFS2 into the kernel, 231 called jbd. If you are compiling ext3 into the kernel, you
219 you cannot compile this code as a module. 232 cannot compile this code as a module.
220 233
221config JBD_DEBUG 234config JBD_DEBUG
222 bool "JBD (ext3) debugging support" 235 bool "JBD (ext3) debugging support"
@@ -240,22 +253,23 @@ config JBD2
240 help 253 help
241 This is a generic journaling layer for block devices that support 254 This is a generic journaling layer for block devices that support
242 both 32-bit and 64-bit block numbers. It is currently used by 255 both 32-bit and 64-bit block numbers. It is currently used by
243 the ext4dev/ext4 filesystem, but it could also be used to add 256 the ext4 and OCFS2 filesystems, but it could also be used to add
244 journal support to other file systems or block devices such 257 journal support to other file systems or block devices such
245 as RAID or LVM. 258 as RAID or LVM.
246 259
247 If you are using ext4dev/ext4, you need to say Y here. If you are not 260 If you are using ext4 or OCFS2, you need to say Y here.
248 using ext4dev/ext4 then you will probably want to say N. 261 If you are not using ext4 or OCFS2 then you will
262 probably want to say N.
249 263
250 To compile this device as a module, choose M here. The module will be 264 To compile this device as a module, choose M here. The module will be
251 called jbd2. If you are compiling ext4dev/ext4 into the kernel, 265 called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
252 you cannot compile this code as a module. 266 you cannot compile this code as a module.
253 267
254config JBD2_DEBUG 268config JBD2_DEBUG
255 bool "JBD2 (ext4dev/ext4) debugging support" 269 bool "JBD2 (ext4) debugging support"
256 depends on JBD2 && DEBUG_FS 270 depends on JBD2 && DEBUG_FS
257 help 271 help
258 If you are using the ext4dev/ext4 journaled file system (or 272 If you are using the ext4 journaled file system (or
259 potentially any other filesystem/device using JBD2), this option 273 potentially any other filesystem/device using JBD2), this option
260 allows you to enable debugging output while the system is running, 274 allows you to enable debugging output while the system is running,
261 in order to help track down any problems you are having. 275 in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
270config FS_MBCACHE 284config FS_MBCACHE
271# Meta block cache for Extended Attributes (ext2/ext3/ext4) 285# Meta block cache for Extended Attributes (ext2/ext3/ext4)
272 tristate 286 tristate
273 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR 287 depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
274 default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y 288 default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
275 default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m 289 default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
276 290
277config REISERFS_FS 291config REISERFS_FS
278 tristate "Reiserfs support" 292 tristate "Reiserfs support"
@@ -419,6 +433,14 @@ config FS_POSIX_ACL
419 bool 433 bool
420 default n 434 default n
421 435
436config FILE_LOCKING
437 bool "Enable POSIX file locking API" if EMBEDDED
438 default y
439 help
440 This option enables standard file locking support, required
441 for filesystems like NFS and for the flock() system
442 call. Disabling this option saves about 11k.
443
422source "fs/xfs/Kconfig" 444source "fs/xfs/Kconfig"
423source "fs/gfs2/Kconfig" 445source "fs/gfs2/Kconfig"
424 446
@@ -426,7 +448,7 @@ config OCFS2_FS
426 tristate "OCFS2 file system support" 448 tristate "OCFS2 file system support"
427 depends on NET && SYSFS 449 depends on NET && SYSFS
428 select CONFIGFS_FS 450 select CONFIGFS_FS
429 select JBD 451 select JBD2
430 select CRC32 452 select CRC32
431 help 453 help
432 OCFS2 is a general purpose extent based shared disk cluster file 454 OCFS2 is a general purpose extent based shared disk cluster file
@@ -497,6 +519,16 @@ config OCFS2_DEBUG_FS
497 this option for debugging only as it is likely to decrease 519 this option for debugging only as it is likely to decrease
498 performance of the filesystem. 520 performance of the filesystem.
499 521
522config OCFS2_COMPAT_JBD
523 bool "Use JBD for compatibility"
524 depends on OCFS2_FS
525 default n
526 select JBD
527 help
528 The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
529 is backwards compatible with JBD. It is safe to say N here.
530 However, if you really want to use the original JBD, say Y here.
531
500endif # BLOCK 532endif # BLOCK
501 533
502config DNOTIFY 534config DNOTIFY
@@ -1765,6 +1797,28 @@ config SUNRPC_XPRT_RDMA
1765 1797
1766 If unsure, say N. 1798 If unsure, say N.
1767 1799
1800config SUNRPC_REGISTER_V4
1801 bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
1802 depends on SUNRPC && EXPERIMENTAL
1803 default n
1804 help
1805 Sun added support for registering RPC services at an IPv6
1806 address by creating two new versions of the rpcbind protocol
1807 (RFC 1833).
1808
1809 This option enables support in the kernel RPC server for
1810 registering kernel RPC services via version 4 of the rpcbind
1811 protocol. If you enable this option, you must run a portmapper
1812 daemon that supports rpcbind protocol version 4.
1813
1814 Serving NFS over IPv6 from knfsd (the kernel's NFS server)
1815 requires that you enable this option and use a portmapper that
1816 supports rpcbind version 4.
1817
1818 If unsure, say N to get traditional behavior (register kernel
1819 RPC services using only rpcbind version 2). Distributions
1820 using the legacy Linux portmapper daemon must say N here.
1821
1768config RPCSEC_GSS_KRB5 1822config RPCSEC_GSS_KRB5
1769 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" 1823 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
1770 depends on SUNRPC && EXPERIMENTAL 1824 depends on SUNRPC && EXPERIMENTAL
@@ -1930,6 +1984,16 @@ config CIFS_WEAK_PW_HASH
1930 1984
1931 If unsure, say N. 1985 If unsure, say N.
1932 1986
1987config CIFS_UPCALL
1988 bool "Kerberos/SPNEGO advanced session setup"
1989 depends on CIFS && KEYS
1990 help
1991 Enables an upcall mechanism for CIFS which accesses
1992 userspace helper utilities to provide SPNEGO packaged (RFC 4178)
1993 Kerberos tickets which are needed to mount to certain secure servers
1994 (for which more secure Kerberos authentication is required). If
1995 unsure, say N.
1996
1933config CIFS_XATTR 1997config CIFS_XATTR
1934 bool "CIFS extended attributes" 1998 bool "CIFS extended attributes"
1935 depends on CIFS 1999 depends on CIFS
@@ -1982,17 +2046,6 @@ config CIFS_EXPERIMENTAL
1982 (which is disabled by default). See the file fs/cifs/README 2046 (which is disabled by default). See the file fs/cifs/README
1983 for more details. If unsure, say N. 2047 for more details. If unsure, say N.
1984 2048
1985config CIFS_UPCALL
1986 bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
1987 depends on CIFS_EXPERIMENTAL
1988 depends on KEYS
1989 help
1990 Enables an upcall mechanism for CIFS which accesses
1991 userspace helper utilities to provide SPNEGO packaged (RFC 4178)
1992 Kerberos tickets which are needed to mount to certain secure servers
1993 (for which more secure Kerberos authentication is required). If
1994 unsure, say N.
1995
1996config CIFS_DFS_UPCALL 2049config CIFS_DFS_UPCALL
1997 bool "DFS feature support (EXPERIMENTAL)" 2050 bool "DFS feature support (EXPERIMENTAL)"
1998 depends on CIFS_EXPERIMENTAL 2051 depends on CIFS_EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4a551af6f3fc..801db1341811 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -25,7 +25,7 @@ config BINFMT_ELF
25 25
26config COMPAT_BINFMT_ELF 26config COMPAT_BINFMT_ELF
27 bool 27 bool
28 depends on COMPAT && MMU 28 depends on COMPAT && BINFMT_ELF
29 29
30config BINFMT_ELF_FDPIC 30config BINFMT_ELF_FDPIC
31 bool "Kernel support for FDPIC ELF binaries" 31 bool "Kernel support for FDPIC ELF binaries"
@@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT
59 help 59 help
60 Support FLAT shared libraries 60 Support FLAT shared libraries
61 61
62config HAVE_AOUT
63 def_bool n
64
62config BINFMT_AOUT 65config BINFMT_AOUT
63 tristate "Kernel support for a.out and ECOFF binaries" 66 tristate "Kernel support for a.out and ECOFF binaries"
64 depends on ARCH_SUPPORTS_AOUT && \ 67 depends on HAVE_AOUT
65 (X86_32 || ALPHA || ARM || M68K)
66 ---help--- 68 ---help---
67 A.out (Assembler.OUTput) is a set of formats for libraries and 69 A.out (Assembler.OUTput) is a set of formats for libraries and
68 executables used in the earliest versions of UNIX. Linux used 70 executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff15..2168c902d5ca 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,8 +7,8 @@
7 7
8obj-y := open.o read_write.o file_table.o super.o \ 8obj-y := open.o read_write.o file_table.o super.o \
9 char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ 9 char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ 10 ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o drop_caches.o splice.o sync.o utimes.o \ 13 pnode.o drop_caches.o splice.o sync.o utimes.o \
14 stack.o 14 stack.o
@@ -27,6 +27,8 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
27obj-$(CONFIG_SIGNALFD) += signalfd.o 27obj-$(CONFIG_SIGNALFD) += signalfd.o
28obj-$(CONFIG_TIMERFD) += timerfd.o 28obj-$(CONFIG_TIMERFD) += timerfd.o
29obj-$(CONFIG_EVENTFD) += eventfd.o 29obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_AIO) += aio.o
31obj-$(CONFIG_FILE_LOCKING) += locks.o
30obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 32obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
31 33
32nfsd-$(CONFIG_NFSD) := nfsctl.o 34nfsd-$(CONFIG_NFSD) := nfsctl.o
@@ -69,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/
69# Do not add any filesystems before this line 71# Do not add any filesystems before this line
70obj-$(CONFIG_REISERFS_FS) += reiserfs/ 72obj-$(CONFIG_REISERFS_FS) += reiserfs/
71obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 73obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
72obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev 74obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4
73obj-$(CONFIG_JBD) += jbd/ 75obj-$(CONFIG_JBD) += jbd/
74obj-$(CONFIG_JBD2) += jbd2/ 76obj-$(CONFIG_JBD2) += jbd2/
75obj-$(CONFIG_EXT2_FS) += ext2/ 77obj-$(CONFIG_EXT2_FS) += ext2/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d78..85a30e929800 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
197 197
198const struct file_operations adfs_dir_operations = { 198const struct file_operations adfs_dir_operations = {
199 .read = generic_read_dir, 199 .read = generic_read_dir,
200 .llseek = generic_file_llseek,
200 .readdir = adfs_readdir, 201 .readdir = adfs_readdir,
201 .fsync = file_fsync, 202 .fsync = file_fsync,
202}; 203};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 26f3b43726bb..7f83a46f2b7e 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
157 157
158enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; 158enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
159 159
160static match_table_t tokens = { 160static const match_table_t tokens = {
161 {Opt_uid, "uid=%u"}, 161 {Opt_uid, "uid=%u"},
162 {Opt_gid, "gid=%u"}, 162 {Opt_gid, "gid=%u"},
163 {Opt_ownmask, "ownmask=%o"}, 163 {Opt_ownmask, "ownmask=%o"},
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b0..7b36904dbeac 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
19 19
20const struct file_operations affs_dir_operations = { 20const struct file_operations affs_dir_operations = {
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .llseek = generic_file_llseek,
22 .readdir = affs_readdir, 23 .readdir = affs_readdir,
23 .fsync = file_fsync, 24 .fsync = file_fsync,
24}; 25};
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3a89094f93d0..8989c93193ed 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -135,7 +135,7 @@ enum {
135 Opt_verbose, Opt_volume, Opt_ignore, Opt_err, 135 Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
136}; 136};
137 137
138static match_table_t tokens = { 138static const match_table_t tokens = {
139 {Opt_bs, "bs=%u"}, 139 {Opt_bs, "bs=%u"},
140 {Opt_mode, "mode=%o"}, 140 {Opt_mode, "mode=%o"},
141 {Opt_mufs, "mufs"}, 141 {Opt_mufs, "mufs"},
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 525f7c56e068..a3901769a96c 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = {
50 .launder_page = afs_launder_page, 50 .launder_page = afs_launder_page,
51 .releasepage = afs_releasepage, 51 .releasepage = afs_releasepage,
52 .invalidatepage = afs_invalidatepage, 52 .invalidatepage = afs_invalidatepage,
53 .prepare_write = afs_prepare_write, 53 .write_begin = afs_write_begin,
54 .commit_write = afs_commit_write, 54 .write_end = afs_write_end,
55 .writepage = afs_writepage, 55 .writepage = afs_writepage,
56 .writepages = afs_writepages, 56 .writepages = afs_writepages,
57}; 57};
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 3cb6920ff30b..67f259d99cd6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -728,8 +728,12 @@ extern int afs_volume_release_fileserver(struct afs_vnode *,
728 */ 728 */
729extern int afs_set_page_dirty(struct page *); 729extern int afs_set_page_dirty(struct page *);
730extern void afs_put_writeback(struct afs_writeback *); 730extern void afs_put_writeback(struct afs_writeback *);
731extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned); 731extern int afs_write_begin(struct file *file, struct address_space *mapping,
732extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned); 732 loff_t pos, unsigned len, unsigned flags,
733 struct page **pagep, void **fsdata);
734extern int afs_write_end(struct file *file, struct address_space *mapping,
735 loff_t pos, unsigned len, unsigned copied,
736 struct page *page, void *fsdata);
733extern int afs_writepage(struct page *, struct writeback_control *); 737extern int afs_writepage(struct page *, struct writeback_control *);
734extern int afs_writepages(struct address_space *, struct writeback_control *); 738extern int afs_writepages(struct address_space *, struct writeback_control *);
735extern int afs_write_inode(struct inode *, int); 739extern int afs_write_inode(struct inode *, int);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 250d8c4d66e4..aee239a048cb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -64,7 +64,7 @@ enum {
64 afs_opt_vol, 64 afs_opt_vol,
65}; 65};
66 66
67static match_table_t afs_options_list = { 67static const match_table_t afs_options_list = {
68 { afs_opt_cell, "cell=%s" }, 68 { afs_opt_cell, "cell=%s" },
69 { afs_opt_rwpath, "rwpath" }, 69 { afs_opt_rwpath, "rwpath" },
70 { afs_opt_vol, "vol=%s" }, 70 { afs_opt_vol, "vol=%s" },
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 065b4e10681a..d6b85dab35fc 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb)
84 * partly or wholly fill a page that's under preparation for writing 84 * partly or wholly fill a page that's under preparation for writing
85 */ 85 */
86static int afs_fill_page(struct afs_vnode *vnode, struct key *key, 86static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
87 unsigned start, unsigned len, struct page *page) 87 loff_t pos, unsigned len, struct page *page)
88{ 88{
89 loff_t i_size;
90 unsigned eof;
89 int ret; 91 int ret;
90 92
91 _enter(",,%u,%u", start, len); 93 _enter(",,%llu,%u", (unsigned long long)pos, len);
92 94
93 ASSERTCMP(start + len, <=, PAGE_SIZE); 95 ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
94 96
95 ret = afs_vnode_fetch_data(vnode, key, start, len, page); 97 i_size = i_size_read(&vnode->vfs_inode);
98 if (pos + len > i_size)
99 eof = i_size;
100 else
101 eof = PAGE_CACHE_SIZE;
102
103 ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
96 if (ret < 0) { 104 if (ret < 0) {
97 if (ret == -ENOENT) { 105 if (ret == -ENOENT) {
98 _debug("got NOENT from server" 106 _debug("got NOENT from server"
@@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
107} 115}
108 116
109/* 117/*
110 * prepare a page for being written to
111 */
112static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
113 struct key *key, unsigned offset, unsigned to)
114{
115 unsigned eof, tail, start, stop, len;
116 loff_t i_size, pos;
117 void *p;
118 int ret;
119
120 _enter("");
121
122 if (offset == 0 && to == PAGE_SIZE)
123 return 0;
124
125 p = kmap_atomic(page, KM_USER0);
126
127 i_size = i_size_read(&vnode->vfs_inode);
128 pos = (loff_t) page->index << PAGE_SHIFT;
129 if (pos >= i_size) {
130 /* partial write, page beyond EOF */
131 _debug("beyond");
132 if (offset > 0)
133 memset(p, 0, offset);
134 if (to < PAGE_SIZE)
135 memset(p + to, 0, PAGE_SIZE - to);
136 kunmap_atomic(p, KM_USER0);
137 return 0;
138 }
139
140 if (i_size - pos >= PAGE_SIZE) {
141 /* partial write, page entirely before EOF */
142 _debug("before");
143 tail = eof = PAGE_SIZE;
144 } else {
145 /* partial write, page overlaps EOF */
146 eof = i_size - pos;
147 _debug("overlap %u", eof);
148 tail = max(eof, to);
149 if (tail < PAGE_SIZE)
150 memset(p + tail, 0, PAGE_SIZE - tail);
151 if (offset > eof)
152 memset(p + eof, 0, PAGE_SIZE - eof);
153 }
154
155 kunmap_atomic(p, KM_USER0);
156
157 ret = 0;
158 if (offset > 0 || eof > to) {
159 /* need to fill one or two bits that aren't going to be written
160 * (cover both fillers in one read if there are two) */
161 start = (offset > 0) ? 0 : to;
162 stop = (eof > to) ? eof : offset;
163 len = stop - start;
164 _debug("wr=%u-%u av=0-%u rd=%u@%u",
165 offset, to, eof, start, len);
166 ret = afs_fill_page(vnode, key, start, len, page);
167 }
168
169 _leave(" = %d", ret);
170 return ret;
171}
172
173/*
174 * prepare to perform part of a write to a page 118 * prepare to perform part of a write to a page
175 * - the caller holds the page locked, preventing it from being written out or
176 * modified by anyone else
177 */ 119 */
178int afs_prepare_write(struct file *file, struct page *page, 120int afs_write_begin(struct file *file, struct address_space *mapping,
179 unsigned offset, unsigned to) 121 loff_t pos, unsigned len, unsigned flags,
122 struct page **pagep, void **fsdata)
180{ 123{
181 struct afs_writeback *candidate, *wb; 124 struct afs_writeback *candidate, *wb;
182 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 125 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
126 struct page *page;
183 struct key *key = file->private_data; 127 struct key *key = file->private_data;
184 pgoff_t index; 128 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
129 unsigned to = from + len;
130 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
185 int ret; 131 int ret;
186 132
187 _enter("{%x:%u},{%lx},%u,%u", 133 _enter("{%x:%u},{%lx},%u,%u",
188 vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); 134 vnode->fid.vid, vnode->fid.vnode, index, from, to);
189 135
190 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); 136 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
191 if (!candidate) 137 if (!candidate)
192 return -ENOMEM; 138 return -ENOMEM;
193 candidate->vnode = vnode; 139 candidate->vnode = vnode;
194 candidate->first = candidate->last = page->index; 140 candidate->first = candidate->last = index;
195 candidate->offset_first = offset; 141 candidate->offset_first = from;
196 candidate->to_last = to; 142 candidate->to_last = to;
197 candidate->usage = 1; 143 candidate->usage = 1;
198 candidate->state = AFS_WBACK_PENDING; 144 candidate->state = AFS_WBACK_PENDING;
199 init_waitqueue_head(&candidate->waitq); 145 init_waitqueue_head(&candidate->waitq);
200 146
147 page = __grab_cache_page(mapping, index);
148 if (!page) {
149 kfree(candidate);
150 return -ENOMEM;
151 }
152 *pagep = page;
153 /* page won't leak in error case: it eventually gets cleaned off LRU */
154
201 if (!PageUptodate(page)) { 155 if (!PageUptodate(page)) {
202 _debug("not up to date"); 156 _debug("not up to date");
203 ret = afs_prepare_page(vnode, page, key, offset, to); 157 ret = afs_fill_page(vnode, key, pos, len, page);
204 if (ret < 0) { 158 if (ret < 0) {
205 kfree(candidate); 159 kfree(candidate);
206 _leave(" = %d [prep]", ret); 160 _leave(" = %d [prep]", ret);
207 return ret; 161 return ret;
208 } 162 }
163 SetPageUptodate(page);
209 } 164 }
210 165
211try_again: 166try_again:
212 index = page->index;
213 spin_lock(&vnode->writeback_lock); 167 spin_lock(&vnode->writeback_lock);
214 168
215 /* see if this page is already pending a writeback under a suitable key 169 /* see if this page is already pending a writeback under a suitable key
@@ -242,8 +196,8 @@ try_again:
242subsume_in_current_wb: 196subsume_in_current_wb:
243 _debug("subsume"); 197 _debug("subsume");
244 ASSERTRANGE(wb->first, <=, index, <=, wb->last); 198 ASSERTRANGE(wb->first, <=, index, <=, wb->last);
245 if (index == wb->first && offset < wb->offset_first) 199 if (index == wb->first && from < wb->offset_first)
246 wb->offset_first = offset; 200 wb->offset_first = from;
247 if (index == wb->last && to > wb->to_last) 201 if (index == wb->last && to > wb->to_last)
248 wb->to_last = to; 202 wb->to_last = to;
249 spin_unlock(&vnode->writeback_lock); 203 spin_unlock(&vnode->writeback_lock);
@@ -289,17 +243,17 @@ flush_conflicting_wb:
289/* 243/*
290 * finalise part of a write to a page 244 * finalise part of a write to a page
291 */ 245 */
292int afs_commit_write(struct file *file, struct page *page, 246int afs_write_end(struct file *file, struct address_space *mapping,
293 unsigned offset, unsigned to) 247 loff_t pos, unsigned len, unsigned copied,
248 struct page *page, void *fsdata)
294{ 249{
295 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 250 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
296 loff_t i_size, maybe_i_size; 251 loff_t i_size, maybe_i_size;
297 252
298 _enter("{%x:%u},{%lx},%u,%u", 253 _enter("{%x:%u},{%lx}",
299 vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); 254 vnode->fid.vid, vnode->fid.vnode, page->index);
300 255
301 maybe_i_size = (loff_t) page->index << PAGE_SHIFT; 256 maybe_i_size = pos + copied;
302 maybe_i_size += to;
303 257
304 i_size = i_size_read(&vnode->vfs_inode); 258 i_size = i_size_read(&vnode->vfs_inode);
305 if (maybe_i_size > i_size) { 259 if (maybe_i_size > i_size) {
@@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page,
310 spin_unlock(&vnode->writeback_lock); 264 spin_unlock(&vnode->writeback_lock);
311 } 265 }
312 266
313 SetPageUptodate(page);
314 set_page_dirty(page); 267 set_page_dirty(page);
315 if (PageDirty(page)) 268 if (PageDirty(page))
316 _debug("dirtied"); 269 _debug("dirtied");
270 unlock_page(page);
271 page_cache_release(page);
317 272
318 return 0; 273 return copied;
319} 274}
320 275
321/* 276/*
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index dda510d31f84..b70eea1e8c59 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = {
59 59
60enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; 60enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
61 61
62static match_table_t autofs_tokens = { 62static const match_table_t autofs_tokens = {
63 {Opt_fd, "fd=%u"}, 63 {Opt_fd, "fd=%u"},
64 {Opt_uid, "uid=%u"}, 64 {Opt_uid, "uid=%u"},
65 {Opt_gid, "gid=%u"}, 65 {Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile
index f2c3b79e94d2..a811c1f7d9ab 100644
--- a/fs/autofs4/Makefile
+++ b/fs/autofs4/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_AUTOFS4_FS) += autofs4.o 5obj-$(CONFIG_AUTOFS4_FS) += autofs4.o
6 6
7autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o 7autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 69a2f5c92319..e0f16da00e54 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -14,6 +14,7 @@
14/* Internal header file for autofs */ 14/* Internal header file for autofs */
15 15
16#include <linux/auto_fs4.h> 16#include <linux/auto_fs4.h>
17#include <linux/auto_dev-ioctl.h>
17#include <linux/mutex.h> 18#include <linux/mutex.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
@@ -21,6 +22,11 @@
21#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY 22#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
22#define AUTOFS_IOC_COUNT 32 23#define AUTOFS_IOC_COUNT 32
23 24
25#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
26#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
27
28#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
29
24#include <linux/kernel.h> 30#include <linux/kernel.h>
25#include <linux/slab.h> 31#include <linux/slab.h>
26#include <linux/time.h> 32#include <linux/time.h>
@@ -35,11 +41,27 @@
35/* #define DEBUG */ 41/* #define DEBUG */
36 42
37#ifdef DEBUG 43#ifdef DEBUG
38#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) 44#define DPRINTK(fmt, args...) \
45do { \
46 printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \
47 current->pid, __func__, ##args); \
48} while (0)
39#else 49#else
40#define DPRINTK(fmt,args...) do {} while(0) 50#define DPRINTK(fmt, args...) do {} while (0)
41#endif 51#endif
42 52
53#define AUTOFS_WARN(fmt, args...) \
54do { \
55 printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
56 current->pid, __func__, ##args); \
57} while (0)
58
59#define AUTOFS_ERROR(fmt, args...) \
60do { \
61 printk(KERN_ERR "pid %d: %s: " fmt "\n", \
62 current->pid, __func__, ##args); \
63} while (0)
64
43/* Unified info structure. This is pointed to by both the dentry and 65/* Unified info structure. This is pointed to by both the dentry and
44 inode structures. Each file in the filesystem has an instance of this 66 inode structures. Each file in the filesystem has an instance of this
45 structure. It holds a reference to the dentry, so dentries are never 67 structure. It holds a reference to the dentry, so dentries are never
@@ -61,6 +83,9 @@ struct autofs_info {
61 unsigned long last_used; 83 unsigned long last_used;
62 atomic_t count; 84 atomic_t count;
63 85
86 uid_t uid;
87 gid_t gid;
88
64 mode_t mode; 89 mode_t mode;
65 size_t size; 90 size_t size;
66 91
@@ -92,10 +117,6 @@ struct autofs_wait_queue {
92 117
93#define AUTOFS_SBI_MAGIC 0x6d4a556d 118#define AUTOFS_SBI_MAGIC 0x6d4a556d
94 119
95#define AUTOFS_TYPE_INDIRECT 0x0001
96#define AUTOFS_TYPE_DIRECT 0x0002
97#define AUTOFS_TYPE_OFFSET 0x0004
98
99struct autofs_sb_info { 120struct autofs_sb_info {
100 u32 magic; 121 u32 magic;
101 int pipefd; 122 int pipefd;
@@ -169,6 +190,17 @@ int autofs4_expire_run(struct super_block *, struct vfsmount *,
169 struct autofs_packet_expire __user *); 190 struct autofs_packet_expire __user *);
170int autofs4_expire_multi(struct super_block *, struct vfsmount *, 191int autofs4_expire_multi(struct super_block *, struct vfsmount *,
171 struct autofs_sb_info *, int __user *); 192 struct autofs_sb_info *, int __user *);
193struct dentry *autofs4_expire_direct(struct super_block *sb,
194 struct vfsmount *mnt,
195 struct autofs_sb_info *sbi, int how);
196struct dentry *autofs4_expire_indirect(struct super_block *sb,
197 struct vfsmount *mnt,
198 struct autofs_sb_info *sbi, int how);
199
200/* Device node initialization */
201
202int autofs_dev_ioctl_init(void);
203void autofs_dev_ioctl_exit(void);
172 204
173/* Operations structures */ 205/* Operations structures */
174 206
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
new file mode 100644
index 000000000000..625abf5422e2
--- /dev/null
+++ b/fs/autofs4/dev-ioctl.c
@@ -0,0 +1,863 @@
1/*
2 * Copyright 2008 Red Hat, Inc. All rights reserved.
3 * Copyright 2008 Ian Kent <raven@themaw.net>
4 *
5 * This file is part of the Linux kernel and is made available under
6 * the terms of the GNU General Public License, version 2, or at your
7 * option, any later version, incorporated herein by reference.
8 */
9
10#include <linux/module.h>
11#include <linux/vmalloc.h>
12#include <linux/miscdevice.h>
13#include <linux/init.h>
14#include <linux/wait.h>
15#include <linux/namei.h>
16#include <linux/fcntl.h>
17#include <linux/file.h>
18#include <linux/fdtable.h>
19#include <linux/sched.h>
20#include <linux/compat.h>
21#include <linux/syscalls.h>
22#include <linux/smp_lock.h>
23#include <linux/magic.h>
24#include <linux/dcache.h>
25#include <linux/uaccess.h>
26
27#include "autofs_i.h"
28
29/*
30 * This module implements an interface for routing autofs ioctl control
31 * commands via a miscellaneous device file.
32 *
33 * The alternate interface is needed because we need to be able to open
34 * an ioctl file descriptor on an autofs mount that may be covered by
35 * another mount. This situation arises when starting automount(8)
36 * or other user space daemon which uses direct mounts or offset
37 * mounts (used for autofs lazy mount/umount of nested mount trees),
38 * which have been left busy at service shutdown.
39 */
40
41#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
42
43typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
44 struct autofs_dev_ioctl *);
45
/*
 * Minimal sanity check of a path name: it has to contain at least
 * one '/' character.
 *
 * Returns 0 if a '/' is present, -EINVAL otherwise.
 */
static int check_name(const char *name)
{
	return strchr(name, '/') ? 0 : -EINVAL;
}
52
/*
 * Check a string doesn't overrun the chunk of
 * memory we copied from user land.
 *
 * @str: first byte of the string to check.
 * @end: exclusive upper bound, i.e. the address of the first byte
 *       past the copied chunk (validate_dev_ioctl() passes the start
 *       of the buffer plus its size).
 *
 * Returns 0 if a NUL terminator is found before @end, -EINVAL if the
 * string is not terminated inside the chunk.
 */
static int invalid_str(char *str, void *end)
{
	/*
	 * The byte at @end is outside the buffer and must not be read,
	 * so the scan has to stop strictly before it.
	 */
	while ((void *) str < end)
		if (!*str++)
			return 0;
	return -EINVAL;
}
64
65/*
66 * Check that the user compiled against correct version of autofs
67 * misc device code.
68 *
69 * As well as checking the version compatibility this always copies
70 * the kernel interface version out.
71 */
72static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
73{
74 int err = 0;
75
76 if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
77 (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
78 AUTOFS_WARN("ioctl control interface version mismatch: "
79 "kernel(%u.%u), user(%u.%u), cmd(%d)",
80 AUTOFS_DEV_IOCTL_VERSION_MAJOR,
81 AUTOFS_DEV_IOCTL_VERSION_MINOR,
82 param->ver_major, param->ver_minor, cmd);
83 err = -EINVAL;
84 }
85
86 /* Fill in the kernel version. */
87 param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
88 param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
89
90 return err;
91}
92
93/*
94 * Copy parameter control struct, including a possible path allocated
95 * at the end of the struct.
96 */
97static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
98{
99 struct autofs_dev_ioctl tmp, *ads;
100
101 if (copy_from_user(&tmp, in, sizeof(tmp)))
102 return ERR_PTR(-EFAULT);
103
104 if (tmp.size < sizeof(tmp))
105 return ERR_PTR(-EINVAL);
106
107 ads = kmalloc(tmp.size, GFP_KERNEL);
108 if (!ads)
109 return ERR_PTR(-ENOMEM);
110
111 if (copy_from_user(ads, in, tmp.size)) {
112 kfree(ads);
113 return ERR_PTR(-EFAULT);
114 }
115
116 return ads;
117}
118
/* Release a parameter block obtained from copy_dev_ioctl(). */
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
	kfree(param);
}
124
125/*
126 * Check sanity of parameter control fields and if a path is present
127 * check that it has a "/" and is terminated.
128 */
129static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
130{
131 int err = -EINVAL;
132
133 if (check_dev_ioctl_version(cmd, param)) {
134 AUTOFS_WARN("invalid device control module version "
135 "supplied for cmd(0x%08x)", cmd);
136 goto out;
137 }
138
139 if (param->size > sizeof(*param)) {
140 err = check_name(param->path);
141 if (err) {
142 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
143 cmd);
144 goto out;
145 }
146
147 err = invalid_str(param->path,
148 (void *) ((size_t) param + param->size));
149 if (err) {
150 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
151 cmd);
152 goto out;
153 }
154 }
155
156 err = 0;
157out:
158 return err;
159}
160
161/*
162 * Get the autofs super block info struct from the file opened on
163 * the autofs mount point.
164 */
165static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
166{
167 struct autofs_sb_info *sbi = NULL;
168 struct inode *inode;
169
170 if (f) {
171 inode = f->f_path.dentry->d_inode;
172 sbi = autofs4_sbi(inode->i_sb);
173 }
174 return sbi;
175}
176
177/* Return autofs module protocol version */
178static int autofs_dev_ioctl_protover(struct file *fp,
179 struct autofs_sb_info *sbi,
180 struct autofs_dev_ioctl *param)
181{
182 param->arg1 = sbi->version;
183 return 0;
184}
185
186/* Return autofs module protocol sub version */
187static int autofs_dev_ioctl_protosubver(struct file *fp,
188 struct autofs_sb_info *sbi,
189 struct autofs_dev_ioctl *param)
190{
191 param->arg1 = sbi->sub_version;
192 return 0;
193}
194
195/*
196 * Walk down the mount stack looking for an autofs mount that
197 * has the requested device number (aka. new_encode_dev(sb->s_dev).
198 */
199static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
200{
201 struct dentry *dentry;
202 struct inode *inode;
203 struct super_block *sb;
204 dev_t s_dev;
205 unsigned int err;
206
207 err = -ENOENT;
208
209 /* Lookup the dentry name at the base of our mount point */
210 dentry = d_lookup(nd->path.dentry, &nd->last);
211 if (!dentry)
212 goto out;
213
214 dput(nd->path.dentry);
215 nd->path.dentry = dentry;
216
217 /* And follow the mount stack looking for our autofs mount */
218 while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
219 inode = nd->path.dentry->d_inode;
220 if (!inode)
221 break;
222
223 sb = inode->i_sb;
224 s_dev = new_encode_dev(sb->s_dev);
225 if (devno == s_dev) {
226 if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
227 err = 0;
228 break;
229 }
230 }
231 }
232out:
233 return err;
234}
235
236/*
237 * Walk down the mount stack looking for an autofs mount that
238 * has the requested mount type (ie. indirect, direct or offset).
239 */
240static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
241{
242 struct dentry *dentry;
243 struct autofs_info *ino;
244 unsigned int err;
245
246 err = -ENOENT;
247
248 /* Lookup the dentry name at the base of our mount point */
249 dentry = d_lookup(nd->path.dentry, &nd->last);
250 if (!dentry)
251 goto out;
252
253 dput(nd->path.dentry);
254 nd->path.dentry = dentry;
255
256 /* And follow the mount stack looking for our autofs mount */
257 while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
258 ino = autofs4_dentry_ino(nd->path.dentry);
259 if (ino && ino->sbi->type & type) {
260 err = 0;
261 break;
262 }
263 }
264out:
265 return err;
266}
267
268static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
269{
270 struct files_struct *files = current->files;
271 struct fdtable *fdt;
272
273 spin_lock(&files->file_lock);
274 fdt = files_fdtable(files);
275 BUG_ON(fdt->fd[fd] != NULL);
276 rcu_assign_pointer(fdt->fd[fd], file);
277 FD_SET(fd, fdt->close_on_exec);
278 spin_unlock(&files->file_lock);
279}
280
281
282/*
283 * Open a file descriptor on the autofs mount point corresponding
284 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
285 */
286static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
287{
288 struct file *filp;
289 struct nameidata nd;
290 int err, fd;
291
292 fd = get_unused_fd();
293 if (likely(fd >= 0)) {
294 /* Get nameidata of the parent directory */
295 err = path_lookup(path, LOOKUP_PARENT, &nd);
296 if (err)
297 goto out;
298
299 /*
300 * Search down, within the parent, looking for an
301 * autofs super block that has the device number
302 * corresponding to the autofs fs we want to open.
303 */
304 err = autofs_dev_ioctl_find_super(&nd, devid);
305 if (err) {
306 path_put(&nd.path);
307 goto out;
308 }
309
310 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
311 if (IS_ERR(filp)) {
312 err = PTR_ERR(filp);
313 goto out;
314 }
315
316 autofs_dev_ioctl_fd_install(fd, filp);
317 }
318
319 return fd;
320
321out:
322 put_unused_fd(fd);
323 return err;
324}
325
326/* Open a file descriptor on an autofs mount point */
327static int autofs_dev_ioctl_openmount(struct file *fp,
328 struct autofs_sb_info *sbi,
329 struct autofs_dev_ioctl *param)
330{
331 const char *path;
332 dev_t devid;
333 int err, fd;
334
335 /* param->path has already been checked */
336 if (!param->arg1)
337 return -EINVAL;
338
339 param->ioctlfd = -1;
340
341 path = param->path;
342 devid = param->arg1;
343
344 err = 0;
345 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
346 if (unlikely(fd < 0)) {
347 err = fd;
348 goto out;
349 }
350
351 param->ioctlfd = fd;
352out:
353 return err;
354}
355
356/* Close file descriptor allocated above (user can also use close(2)). */
357static int autofs_dev_ioctl_closemount(struct file *fp,
358 struct autofs_sb_info *sbi,
359 struct autofs_dev_ioctl *param)
360{
361 return sys_close(param->ioctlfd);
362}
363
364/*
365 * Send "ready" status for an existing wait (either a mount or an expire
366 * request).
367 */
368static int autofs_dev_ioctl_ready(struct file *fp,
369 struct autofs_sb_info *sbi,
370 struct autofs_dev_ioctl *param)
371{
372 autofs_wqt_t token;
373
374 token = (autofs_wqt_t) param->arg1;
375 return autofs4_wait_release(sbi, token, 0);
376}
377
378/*
379 * Send "fail" status for an existing wait (either a mount or an expire
380 * request).
381 */
382static int autofs_dev_ioctl_fail(struct file *fp,
383 struct autofs_sb_info *sbi,
384 struct autofs_dev_ioctl *param)
385{
386 autofs_wqt_t token;
387 int status;
388
389 token = (autofs_wqt_t) param->arg1;
390 status = param->arg2 ? param->arg2 : -ENOENT;
391 return autofs4_wait_release(sbi, token, status);
392}
393
394/*
395 * Set the pipe fd for kernel communication to the daemon.
396 *
397 * Normally this is set at mount using an option but if we
398 * are reconnecting to a busy mount then we need to use this
399 * to tell the autofs mount about the new kernel pipe fd. In
400 * order to protect mounts against incorrectly setting the
401 * pipefd we also require that the autofs mount be catatonic.
402 *
403 * This also sets the process group id used to identify the
404 * controlling process (eg. the owning automount(8) daemon).
405 */
406static int autofs_dev_ioctl_setpipefd(struct file *fp,
407 struct autofs_sb_info *sbi,
408 struct autofs_dev_ioctl *param)
409{
410 int pipefd;
411 int err = 0;
412
413 if (param->arg1 == -1)
414 return -EINVAL;
415
416 pipefd = param->arg1;
417
418 mutex_lock(&sbi->wq_mutex);
419 if (!sbi->catatonic) {
420 mutex_unlock(&sbi->wq_mutex);
421 return -EBUSY;
422 } else {
423 struct file *pipe = fget(pipefd);
424 if (!pipe->f_op || !pipe->f_op->write) {
425 err = -EPIPE;
426 fput(pipe);
427 goto out;
428 }
429 sbi->oz_pgrp = task_pgrp_nr(current);
430 sbi->pipefd = pipefd;
431 sbi->pipe = pipe;
432 sbi->catatonic = 0;
433 }
434out:
435 mutex_unlock(&sbi->wq_mutex);
436 return err;
437}
438
/*
 * Make the autofs mount point catatonic, no longer responsive to
 * mount requests. Also closes the kernel pipe file descriptor.
 *
 * The work is done by autofs4_catatonic_mode(); this always returns 0.
 */
static int autofs_dev_ioctl_catatonic(struct file *fp,
				      struct autofs_sb_info *sbi,
				      struct autofs_dev_ioctl *param)
{
	autofs4_catatonic_mode(sbi);

	return 0;
}
450
451/* Set the autofs mount timeout */
452static int autofs_dev_ioctl_timeout(struct file *fp,
453 struct autofs_sb_info *sbi,
454 struct autofs_dev_ioctl *param)
455{
456 unsigned long timeout;
457
458 timeout = param->arg1;
459 param->arg1 = sbi->exp_timeout / HZ;
460 sbi->exp_timeout = timeout * HZ;
461 return 0;
462}
463
464/*
465 * Return the uid and gid of the last request for the mount
466 *
467 * When reconstructing an autofs mount tree with active mounts
468 * we need to re-connect to mounts that may have used the original
469 * process uid and gid (or string variations of them) for mount
470 * lookups within the map entry.
471 */
472static int autofs_dev_ioctl_requester(struct file *fp,
473 struct autofs_sb_info *sbi,
474 struct autofs_dev_ioctl *param)
475{
476 struct autofs_info *ino;
477 struct nameidata nd;
478 const char *path;
479 dev_t devid;
480 int err = -ENOENT;
481
482 if (param->size <= sizeof(*param)) {
483 err = -EINVAL;
484 goto out;
485 }
486
487 path = param->path;
488 devid = sbi->sb->s_dev;
489
490 param->arg1 = param->arg2 = -1;
491
492 /* Get nameidata of the parent directory */
493 err = path_lookup(path, LOOKUP_PARENT, &nd);
494 if (err)
495 goto out;
496
497 err = autofs_dev_ioctl_find_super(&nd, devid);
498 if (err)
499 goto out_release;
500
501 ino = autofs4_dentry_ino(nd.path.dentry);
502 if (ino) {
503 err = 0;
504 autofs4_expire_wait(nd.path.dentry);
505 spin_lock(&sbi->fs_lock);
506 param->arg1 = ino->uid;
507 param->arg2 = ino->gid;
508 spin_unlock(&sbi->fs_lock);
509 }
510
511out_release:
512 path_put(&nd.path);
513out:
514 return err;
515}
516
517/*
518 * Call repeatedly until it returns -EAGAIN, meaning there's nothing
519 * more that can be done.
520 */
521static int autofs_dev_ioctl_expire(struct file *fp,
522 struct autofs_sb_info *sbi,
523 struct autofs_dev_ioctl *param)
524{
525 struct dentry *dentry;
526 struct vfsmount *mnt;
527 int err = -EAGAIN;
528 int how;
529
530 how = param->arg1;
531 mnt = fp->f_path.mnt;
532
533 if (sbi->type & AUTOFS_TYPE_TRIGGER)
534 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
535 else
536 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
537
538 if (dentry) {
539 struct autofs_info *ino = autofs4_dentry_ino(dentry);
540
541 /*
542 * This is synchronous because it makes the daemon a
543 * little easier
544 */
545 err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
546
547 spin_lock(&sbi->fs_lock);
548 if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
549 ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
550 sbi->sb->s_root->d_mounted++;
551 }
552 ino->flags &= ~AUTOFS_INF_EXPIRING;
553 complete_all(&ino->expire_complete);
554 spin_unlock(&sbi->fs_lock);
555 dput(dentry);
556 }
557
558 return err;
559}
560
561/* Check if autofs mount point is in use */
562static int autofs_dev_ioctl_askumount(struct file *fp,
563 struct autofs_sb_info *sbi,
564 struct autofs_dev_ioctl *param)
565{
566 param->arg1 = 0;
567 if (may_umount(fp->f_path.mnt))
568 param->arg1 = 1;
569 return 0;
570}
571
572/*
573 * Check if the given path is a mountpoint.
574 *
575 * If we are supplied with the file descriptor of an autofs
576 * mount we're looking for a specific mount. In this case
577 * the path is considered a mountpoint if it is itself a
578 * mountpoint or contains a mount, such as a multi-mount
579 * without a root mount. In this case we return 1 if the
580 * path is a mount point and the super magic of the covering
581 * mount if there is one or 0 if it isn't a mountpoint.
582 *
583 * If we aren't supplied with a file descriptor then we
584 * lookup the nameidata of the path and check if it is the
585 * root of a mount. If a type is given we are looking for
586 * a particular autofs mount and if we don't find a match
587 * we return fail. If the located nameidata path is the
588 * root of a mount we return 1 along with the super magic
589 * of the mount or 0 otherwise.
590 *
591 * In both cases the the device number (as returned by
592 * new_encode_dev()) is also returned.
593 */
594static int autofs_dev_ioctl_ismountpoint(struct file *fp,
595 struct autofs_sb_info *sbi,
596 struct autofs_dev_ioctl *param)
597{
598 struct nameidata nd;
599 const char *path;
600 unsigned int type;
601 int err = -ENOENT;
602
603 if (param->size <= sizeof(*param)) {
604 err = -EINVAL;
605 goto out;
606 }
607
608 path = param->path;
609 type = param->arg1;
610
611 param->arg1 = 0;
612 param->arg2 = 0;
613
614 if (!fp || param->ioctlfd == -1) {
615 if (type == AUTOFS_TYPE_ANY) {
616 struct super_block *sb;
617
618 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
619 if (err)
620 goto out;
621
622 sb = nd.path.dentry->d_sb;
623 param->arg1 = new_encode_dev(sb->s_dev);
624 } else {
625 struct autofs_info *ino;
626
627 err = path_lookup(path, LOOKUP_PARENT, &nd);
628 if (err)
629 goto out;
630
631 err = autofs_dev_ioctl_find_sbi_type(&nd, type);
632 if (err)
633 goto out_release;
634
635 ino = autofs4_dentry_ino(nd.path.dentry);
636 param->arg1 = autofs4_get_dev(ino->sbi);
637 }
638
639 err = 0;
640 if (nd.path.dentry->d_inode &&
641 nd.path.mnt->mnt_root == nd.path.dentry) {
642 err = 1;
643 param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
644 }
645 } else {
646 dev_t devid = new_encode_dev(sbi->sb->s_dev);
647
648 err = path_lookup(path, LOOKUP_PARENT, &nd);
649 if (err)
650 goto out;
651
652 err = autofs_dev_ioctl_find_super(&nd, devid);
653 if (err)
654 goto out_release;
655
656 param->arg1 = autofs4_get_dev(sbi);
657
658 err = have_submounts(nd.path.dentry);
659
660 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
661 if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
662 struct inode *inode = nd.path.dentry->d_inode;
663 param->arg2 = inode->i_sb->s_magic;
664 }
665 }
666 }
667
668out_release:
669 path_put(&nd.path);
670out:
671 return err;
672}
673
674/*
675 * Our range of ioctl numbers isn't 0 based so we need to shift
676 * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table
677 * lookup.
678 */
679#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST))
680
681static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
682{
683 static struct {
684 int cmd;
685 ioctl_fn fn;
686 } _ioctls[] = {
687 {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
688 {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
689 autofs_dev_ioctl_protover},
690 {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
691 autofs_dev_ioctl_protosubver},
692 {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
693 autofs_dev_ioctl_openmount},
694 {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
695 autofs_dev_ioctl_closemount},
696 {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
697 autofs_dev_ioctl_ready},
698 {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
699 autofs_dev_ioctl_fail},
700 {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
701 autofs_dev_ioctl_setpipefd},
702 {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
703 autofs_dev_ioctl_catatonic},
704 {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
705 autofs_dev_ioctl_timeout},
706 {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
707 autofs_dev_ioctl_requester},
708 {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
709 autofs_dev_ioctl_expire},
710 {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
711 autofs_dev_ioctl_askumount},
712 {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
713 autofs_dev_ioctl_ismountpoint}
714 };
715 unsigned int idx = cmd_idx(cmd);
716
717 return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
718}
719
720/* ioctl dispatcher */
721static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
722{
723 struct autofs_dev_ioctl *param;
724 struct file *fp;
725 struct autofs_sb_info *sbi;
726 unsigned int cmd_first, cmd;
727 ioctl_fn fn = NULL;
728 int err = 0;
729
730 /* only root can play with this */
731 if (!capable(CAP_SYS_ADMIN))
732 return -EPERM;
733
734 cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
735 cmd = _IOC_NR(command);
736
737 if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
738 cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
739 return -ENOTTY;
740 }
741
742 /* Copy the parameters into kernel space. */
743 param = copy_dev_ioctl(user);
744 if (IS_ERR(param))
745 return PTR_ERR(param);
746
747 err = validate_dev_ioctl(command, param);
748 if (err)
749 goto out;
750
751 /* The validate routine above always sets the version */
752 if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
753 goto done;
754
755 fn = lookup_dev_ioctl(cmd);
756 if (!fn) {
757 AUTOFS_WARN("unknown command 0x%08x", command);
758 return -ENOTTY;
759 }
760
761 fp = NULL;
762 sbi = NULL;
763
764 /*
765 * For obvious reasons the openmount can't have a file
766 * descriptor yet. We don't take a reference to the
767 * file during close to allow for immediate release.
768 */
769 if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
770 cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
771 fp = fget(param->ioctlfd);
772 if (!fp) {
773 if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
774 goto cont;
775 err = -EBADF;
776 goto out;
777 }
778
779 if (!fp->f_op) {
780 err = -ENOTTY;
781 fput(fp);
782 goto out;
783 }
784
785 sbi = autofs_dev_ioctl_sbi(fp);
786 if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
787 err = -EINVAL;
788 fput(fp);
789 goto out;
790 }
791
792 /*
793 * Admin needs to be able to set the mount catatonic in
794 * order to be able to perform the re-open.
795 */
796 if (!autofs4_oz_mode(sbi) &&
797 cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) {
798 err = -EACCES;
799 fput(fp);
800 goto out;
801 }
802 }
803cont:
804 err = fn(fp, sbi, param);
805
806 if (fp)
807 fput(fp);
808done:
809 if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
810 err = -EFAULT;
811out:
812 free_dev_ioctl(param);
813 return err;
814}
815
816static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
817{
818 int err;
819 err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
820 return (long) err;
821}
822
#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point: convert the argument with compat_ptr() and
 * forward to the native handler.
 */
static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
{
	ulong native_arg = (ulong) compat_ptr(u);

	return autofs_dev_ioctl(file, command, native_arg);
}
#else
/* Without CONFIG_COMPAT there is no compat handler to install. */
#define autofs_dev_ioctl_compat NULL
#endif
831
832static const struct file_operations _dev_ioctl_fops = {
833 .unlocked_ioctl = autofs_dev_ioctl,
834 .compat_ioctl = autofs_dev_ioctl_compat,
835 .owner = THIS_MODULE,
836};
837
838static struct miscdevice _autofs_dev_ioctl_misc = {
839 .minor = MISC_DYNAMIC_MINOR,
840 .name = AUTOFS_DEVICE_NAME,
841 .fops = &_dev_ioctl_fops
842};
843
844/* Register/deregister misc character device */
845int autofs_dev_ioctl_init(void)
846{
847 int r;
848
849 r = misc_register(&_autofs_dev_ioctl_misc);
850 if (r) {
851 AUTOFS_ERROR("misc_register failed for control device");
852 return r;
853 }
854
855 return 0;
856}
857
858void autofs_dev_ioctl_exit(void)
859{
860 misc_deregister(&_autofs_dev_ioctl_misc);
861 return;
862}
863
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cdabb796ff01..cde2f8e8935a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -244,10 +244,10 @@ cont:
244} 244}
245 245
246/* Check if we can expire a direct mount (possibly a tree) */ 246/* Check if we can expire a direct mount (possibly a tree) */
247static struct dentry *autofs4_expire_direct(struct super_block *sb, 247struct dentry *autofs4_expire_direct(struct super_block *sb,
248 struct vfsmount *mnt, 248 struct vfsmount *mnt,
249 struct autofs_sb_info *sbi, 249 struct autofs_sb_info *sbi,
250 int how) 250 int how)
251{ 251{
252 unsigned long timeout; 252 unsigned long timeout;
253 struct dentry *root = dget(sb->s_root); 253 struct dentry *root = dget(sb->s_root);
@@ -283,10 +283,10 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
283 * - it is unused by any user process 283 * - it is unused by any user process
284 * - it has been unused for exp_timeout time 284 * - it has been unused for exp_timeout time
285 */ 285 */
286static struct dentry *autofs4_expire_indirect(struct super_block *sb, 286struct dentry *autofs4_expire_indirect(struct super_block *sb,
287 struct vfsmount *mnt, 287 struct vfsmount *mnt,
288 struct autofs_sb_info *sbi, 288 struct autofs_sb_info *sbi,
289 int how) 289 int how)
290{ 290{
291 unsigned long timeout; 291 unsigned long timeout;
292 struct dentry *root = sb->s_root; 292 struct dentry *root = sb->s_root;
@@ -479,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
479 if (arg && get_user(do_now, arg)) 479 if (arg && get_user(do_now, arg))
480 return -EFAULT; 480 return -EFAULT;
481 481
482 if (sbi->type & AUTOFS_TYPE_DIRECT) 482 if (sbi->type & AUTOFS_TYPE_TRIGGER)
483 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); 483 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
484 else 484 else
485 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); 485 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 723a1c5e361b..9722e4bd8957 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = {
29 29
30static int __init init_autofs4_fs(void) 30static int __init init_autofs4_fs(void)
31{ 31{
32 return register_filesystem(&autofs_fs_type); 32 int err;
33
34 err = register_filesystem(&autofs_fs_type);
35 if (err)
36 return err;
37
38 autofs_dev_ioctl_init();
39
40 return err;
33} 41}
34 42
35static void __exit exit_autofs4_fs(void) 43static void __exit exit_autofs4_fs(void)
36{ 44{
45 autofs_dev_ioctl_exit();
37 unregister_filesystem(&autofs_fs_type); 46 unregister_filesystem(&autofs_fs_type);
38} 47}
39 48
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7bb3e5ba0537..c7e65bb30ba0 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -53,6 +53,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
53 atomic_set(&ino->count, 0); 53 atomic_set(&ino->count, 0);
54 } 54 }
55 55
56 ino->uid = 0;
57 ino->gid = 0;
56 ino->mode = mode; 58 ino->mode = mode;
57 ino->last_used = jiffies; 59 ino->last_used = jiffies;
58 60
@@ -213,7 +215,7 @@ static const struct super_operations autofs4_sops = {
213enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, 215enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
214 Opt_indirect, Opt_direct, Opt_offset}; 216 Opt_indirect, Opt_direct, Opt_offset};
215 217
216static match_table_t tokens = { 218static const match_table_t tokens = {
217 {Opt_fd, "fd=%u"}, 219 {Opt_fd, "fd=%u"},
218 {Opt_uid, "uid=%u"}, 220 {Opt_uid, "uid=%u"},
219 {Opt_gid, "gid=%u"}, 221 {Opt_gid, "gid=%u"},
@@ -288,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
288 *type = AUTOFS_TYPE_DIRECT; 290 *type = AUTOFS_TYPE_DIRECT;
289 break; 291 break;
290 case Opt_offset: 292 case Opt_offset:
291 *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET; 293 *type = AUTOFS_TYPE_OFFSET;
292 break; 294 break;
293 default: 295 default:
294 return 1; 296 return 1;
@@ -336,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
336 sbi->sb = s; 338 sbi->sb = s;
337 sbi->version = 0; 339 sbi->version = 0;
338 sbi->sub_version = 0; 340 sbi->sub_version = 0;
339 sbi->type = 0; 341 sbi->type = AUTOFS_TYPE_INDIRECT;
340 sbi->min_proto = 0; 342 sbi->min_proto = 0;
341 sbi->max_proto = 0; 343 sbi->max_proto = 0;
342 mutex_init(&sbi->wq_mutex); 344 mutex_init(&sbi->wq_mutex);
@@ -378,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
378 } 380 }
379 381
380 root_inode->i_fop = &autofs4_root_operations; 382 root_inode->i_fop = &autofs4_root_operations;
381 root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ? 383 root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
382 &autofs4_direct_root_inode_operations : 384 &autofs4_direct_root_inode_operations :
383 &autofs4_indirect_root_inode_operations; 385 &autofs4_indirect_root_inode_operations;
384 386
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index bcfb2dc0a61b..2a41c2a7fc52 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = {
36 .release = dcache_dir_close, 36 .release = dcache_dir_close,
37 .read = generic_read_dir, 37 .read = generic_read_dir,
38 .readdir = dcache_readdir, 38 .readdir = dcache_readdir,
39 .llseek = dcache_dir_lseek,
39 .ioctl = autofs4_root_ioctl, 40 .ioctl = autofs4_root_ioctl,
40}; 41};
41 42
@@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = {
44 .release = dcache_dir_close, 45 .release = dcache_dir_close,
45 .read = generic_read_dir, 46 .read = generic_read_dir,
46 .readdir = dcache_readdir, 47 .readdir = dcache_readdir,
48 .llseek = dcache_dir_lseek,
47}; 49};
48 50
49const struct inode_operations autofs4_indirect_root_inode_operations = { 51const struct inode_operations autofs4_indirect_root_inode_operations = {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35216d18d8b5..4b67c2a2d77c 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
337 * is very similar for indirect mounts except only dentrys 337 * is very similar for indirect mounts except only dentrys
338 * in the root of the autofs file system may be negative. 338 * in the root of the autofs file system may be negative.
339 */ 339 */
340 if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) 340 if (sbi->type & AUTOFS_TYPE_TRIGGER)
341 return -ENOENT; 341 return -ENOENT;
342 else if (!IS_ROOT(dentry->d_parent)) 342 else if (!IS_ROOT(dentry->d_parent))
343 return -ENOENT; 343 return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
348 return -ENOMEM; 348 return -ENOMEM;
349 349
350 /* If this is a direct mount request create a dummy name */ 350 /* If this is a direct mount request create a dummy name */
351 if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) 351 if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
352 qstr.len = sprintf(name, "%p", dentry); 352 qstr.len = sprintf(name, "%p", dentry);
353 else { 353 else {
354 qstr.len = autofs4_getpath(sbi, dentry, &name); 354 qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
406 type = autofs_ptype_expire_multi; 406 type = autofs_ptype_expire_multi;
407 } else { 407 } else {
408 if (notify == NFY_MOUNT) 408 if (notify == NFY_MOUNT)
409 type = (sbi->type & AUTOFS_TYPE_DIRECT) ? 409 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
410 autofs_ptype_missing_direct : 410 autofs_ptype_missing_direct :
411 autofs_ptype_missing_indirect; 411 autofs_ptype_missing_indirect;
412 else 412 else
413 type = (sbi->type & AUTOFS_TYPE_DIRECT) ? 413 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
414 autofs_ptype_expire_direct : 414 autofs_ptype_expire_direct :
415 autofs_ptype_expire_indirect; 415 autofs_ptype_expire_indirect;
416 } 416 }
@@ -457,6 +457,40 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
457 457
458 status = wq->status; 458 status = wq->status;
459 459
460 /*
461 * For direct and offset mounts we need to track the requester's
462 * uid and gid in the dentry info struct. This is so it can be
463 * supplied, on request, by the misc device ioctl interface.
464 * This is needed during daemon restart when reconnecting
465 * to existing, active, autofs mounts. The uid and gid (and
466 * related string values) may be used for macro substitution
467 * in autofs mount maps.
468 */
469 if (!status) {
470 struct autofs_info *ino;
471 struct dentry *de = NULL;
472
473 /* direct mount or browsable map */
474 ino = autofs4_dentry_ino(dentry);
475 if (!ino) {
476 /* If not lookup actual dentry used */
477 de = d_lookup(dentry->d_parent, &dentry->d_name);
478 if (de)
479 ino = autofs4_dentry_ino(de);
480 }
481
482 /* Set mount requester */
483 if (ino) {
484 spin_lock(&sbi->fs_lock);
485 ino->uid = wq->uid;
486 ino->gid = wq->gid;
487 spin_unlock(&sbi->fs_lock);
488 }
489
490 if (de)
491 dput(de);
492 }
493
460 /* Are we the last process to need status? */ 494 /* Are we the last process to need status? */
461 mutex_lock(&sbi->wq_mutex); 495 mutex_lock(&sbi->wq_mutex);
462 if (!--wq->wait_ctr) 496 if (!--wq->wait_ctr)
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index e2595c2c403a..7893eaa1e58c 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -55,8 +55,12 @@ enum super_flags {
55}; 55};
56 56
57#define BEFS_BYTEORDER_NATIVE 0x42494745 57#define BEFS_BYTEORDER_NATIVE 0x42494745
58#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
59#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
58 60
59#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 61#define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
62#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
63#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
60 64
61/* 65/*
62 * Flags of inode 66 * Flags of inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 02c6e62b72f8..b6dfee37c7b7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
66static const struct file_operations befs_dir_operations = { 66static const struct file_operations befs_dir_operations = {
67 .read = generic_read_dir, 67 .read = generic_read_dir,
68 .readdir = befs_readdir, 68 .readdir = befs_readdir,
69 .llseek = generic_file_llseek,
69}; 70};
70 71
71static const struct inode_operations befs_dir_inode_operations = { 72static const struct inode_operations befs_dir_inode_operations = {
@@ -649,7 +650,7 @@ enum {
649 Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, 650 Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
650}; 651};
651 652
652static match_table_t befs_tokens = { 653static const match_table_t befs_tokens = {
653 {Opt_uid, "uid=%d"}, 654 {Opt_uid, "uid=%d"},
654 {Opt_gid, "gid=%d"}, 655 {Opt_gid, "gid=%d"},
655 {Opt_charset, "iocharset=%s"}, 656 {Opt_charset, "iocharset=%s"},
@@ -808,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
808 809
809 /* account for offset of super block on x86 */ 810 /* account for offset of super block on x86 */
810 disk_sb = (befs_super_block *) bh->b_data; 811 disk_sb = (befs_super_block *) bh->b_data;
811 if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) || 812 if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) ||
812 (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) { 813 (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) {
813 befs_debug(sb, "Using PPC superblock location"); 814 befs_debug(sb, "Using PPC superblock location");
814 } else { 815 } else {
815 befs_debug(sb, "Using x86 superblock location"); 816 befs_debug(sb, "Using x86 superblock location");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 8c3401ff6d6a..41f2b4d0093e 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
26 befs_sb_info *befs_sb = BEFS_SB(sb); 26 befs_sb_info *befs_sb = BEFS_SB(sb);
27 27
28 /* Check the byte order of the filesystem */ 28 /* Check the byte order of the filesystem */
29 if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) 29 if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
30 befs_sb->byte_order = BEFS_BYTESEX_LE; 30 befs_sb->byte_order = BEFS_BYTESEX_LE;
31 else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) 31 else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE)
32 befs_sb->byte_order = BEFS_BYTESEX_BE; 32 befs_sb->byte_order = BEFS_BYTESEX_BE;
33 33
34 befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); 34 befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1);
35 befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); 35 befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2);
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee348..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
125 inode->i_ino); 125 inode->i_ino);
126 if (err) { 126 if (err) {
127 inode_dec_link_count(inode); 127 inode_dec_link_count(inode);
128 iput(inode);
129 mutex_unlock(&info->bfs_lock); 128 mutex_unlock(&info->bfs_lock);
129 iput(inode);
130 return err; 130 return err;
131 } 131 }
132 mutex_unlock(&info->bfs_lock); 132 mutex_unlock(&info->bfs_lock);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 655ed8d30a86..83d72006e29d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
683 * switch really is going to happen - do this in 683 * switch really is going to happen - do this in
684 * flush_thread(). - akpm 684 * flush_thread(). - akpm
685 */ 685 */
686 SET_PERSONALITY(loc->elf_ex, 0); 686 SET_PERSONALITY(loc->elf_ex);
687 687
688 interpreter = open_exec(elf_interpreter); 688 interpreter = open_exec(elf_interpreter);
689 retval = PTR_ERR(interpreter); 689 retval = PTR_ERR(interpreter);
@@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
734 goto out_free_dentry; 734 goto out_free_dentry;
735 } else { 735 } else {
736 /* Executables without an interpreter also need a personality */ 736 /* Executables without an interpreter also need a personality */
737 SET_PERSONALITY(loc->elf_ex, 0); 737 SET_PERSONALITY(loc->elf_ex);
738 } 738 }
739 739
740 /* Flush all traces of the currently running executable */ 740 /* Flush all traces of the currently running executable */
@@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
748 748
749 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 749 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
750 may depend on the personality. */ 750 may depend on the personality. */
751 SET_PERSONALITY(loc->elf_ex, 0); 751 SET_PERSONALITY(loc->elf_ex);
752 if (elf_read_implies_exec(loc->elf_ex, executable_stack)) 752 if (elf_read_implies_exec(loc->elf_ex, executable_stack))
753 current->personality |= READ_IMPLIES_EXEC; 753 current->personality |= READ_IMPLIES_EXEC;
754 754
@@ -1333,20 +1333,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1333 prstatus->pr_pgrp = task_pgrp_vnr(p); 1333 prstatus->pr_pgrp = task_pgrp_vnr(p);
1334 prstatus->pr_sid = task_session_vnr(p); 1334 prstatus->pr_sid = task_session_vnr(p);
1335 if (thread_group_leader(p)) { 1335 if (thread_group_leader(p)) {
1336 struct task_cputime cputime;
1337
1336 /* 1338 /*
1337 * This is the record for the group leader. Add in the 1339 * This is the record for the group leader. It shows the
1338 * cumulative times of previous dead threads. This total 1340 * group-wide total, not its individual thread total.
1339 * won't include the time of each live thread whose state
1340 * is included in the core dump. The final total reported
1341 * to our parent process when it calls wait4 will include
1342 * those sums as well as the little bit more time it takes
1343 * this and each other thread to finish dying after the
1344 * core dump synchronization phase.
1345 */ 1341 */
1346 cputime_to_timeval(cputime_add(p->utime, p->signal->utime), 1342 thread_group_cputime(p, &cputime);
1347 &prstatus->pr_utime); 1343 cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
1348 cputime_to_timeval(cputime_add(p->stime, p->signal->stime), 1344 cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
1349 &prstatus->pr_stime);
1350 } else { 1345 } else {
1351 cputime_to_timeval(p->utime, &prstatus->pr_utime); 1346 cputime_to_timeval(p->utime, &prstatus->pr_utime);
1352 cputime_to_timeval(p->stime, &prstatus->pr_stime); 1347 cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 80c1f952ef78..0e8367c54624 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -25,6 +25,7 @@
25#include <linux/fcntl.h> 25#include <linux/fcntl.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/security.h>
28#include <linux/highmem.h> 29#include <linux/highmem.h>
29#include <linux/highuid.h> 30#include <linux/highuid.h>
30#include <linux/personality.h> 31#include <linux/personality.h>
@@ -455,8 +456,19 @@ error_kill:
455} 456}
456 457
457/*****************************************************************************/ 458/*****************************************************************************/
459
460#ifndef ELF_BASE_PLATFORM
458/* 461/*
459 * present useful information to the program 462 * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
463 * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
464 * will be copied to the user stack in the same manner as AT_PLATFORM.
465 */
466#define ELF_BASE_PLATFORM NULL
467#endif
468
469/*
470 * present useful information to the program by shovelling it onto the new
471 * process's stack
460 */ 472 */
461static int create_elf_fdpic_tables(struct linux_binprm *bprm, 473static int create_elf_fdpic_tables(struct linux_binprm *bprm,
462 struct mm_struct *mm, 474 struct mm_struct *mm,
@@ -466,15 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
466 unsigned long sp, csp, nitems; 478 unsigned long sp, csp, nitems;
467 elf_caddr_t __user *argv, *envp; 479 elf_caddr_t __user *argv, *envp;
468 size_t platform_len = 0, len; 480 size_t platform_len = 0, len;
469 char *k_platform; 481 char *k_platform, *k_base_platform;
470 char __user *u_platform, *p; 482 char __user *u_platform, *u_base_platform, *p;
471 long hwcap; 483 long hwcap;
472 int loop; 484 int loop;
473 int nr; /* reset for each csp adjustment */ 485 int nr; /* reset for each csp adjustment */
474 486
475 /* we're going to shovel a whole load of stuff onto the stack */
476#ifdef CONFIG_MMU 487#ifdef CONFIG_MMU
477 sp = bprm->p; 488 /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
489 * by the processes running on the same package. One thing we can do is
490 * to shuffle the initial stack for them, so we give the architecture
491 * an opportunity to do so here.
492 */
493 sp = arch_align_stack(bprm->p);
478#else 494#else
479 sp = mm->start_stack; 495 sp = mm->start_stack;
480 496
@@ -483,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
483 return -EFAULT; 499 return -EFAULT;
484#endif 500#endif
485 501
486 /* get hold of platform and hardware capabilities masks for the machine
487 * we are running on. In some cases (Sparc), this info is impossible
488 * to get, in others (i386) it is merely difficult.
489 */
490 hwcap = ELF_HWCAP; 502 hwcap = ELF_HWCAP;
503
504 /*
505 * If this architecture has a platform capability string, copy it
506 * to userspace. In some cases (Sparc), this info is impossible
507 * for userspace to get any other way, in others (i386) it is
508 * merely difficult.
509 */
491 k_platform = ELF_PLATFORM; 510 k_platform = ELF_PLATFORM;
492 u_platform = NULL; 511 u_platform = NULL;
493 512
@@ -499,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
499 return -EFAULT; 518 return -EFAULT;
500 } 519 }
501 520
502#if defined(__i386__) && defined(CONFIG_SMP) 521 /*
503 /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions 522 * If this architecture has a "base" platform capability
504 * by the processes running on the same package. One thing we can do is 523 * string, copy it to userspace.
505 * to shuffle the initial stack for them.
506 *
507 * the conditionals here are unneeded, but kept in to make the code
508 * behaviour the same as pre change unless we have hyperthreaded
509 * processors. This keeps Mr Marcelo Person happier but should be
510 * removed for 2.5
511 */ 524 */
512 if (smp_num_siblings > 1) 525 k_base_platform = ELF_BASE_PLATFORM;
513 sp = sp - ((current->pid % 64) << 7); 526 u_base_platform = NULL;
514#endif 527
528 if (k_base_platform) {
529 platform_len = strlen(k_base_platform) + 1;
530 sp -= platform_len;
531 u_base_platform = (char __user *) sp;
532 if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
533 return -EFAULT;
534 }
515 535
516 sp &= ~7UL; 536 sp &= ~7UL;
517 537
@@ -541,9 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
541 } 561 }
542 562
543 /* force 16 byte _final_ alignment here for generality */ 563 /* force 16 byte _final_ alignment here for generality */
544#define DLINFO_ITEMS 13 564#define DLINFO_ITEMS 15
565
566 nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) +
567 (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
545 568
546 nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; 569 if (bprm->interp_flags & BINPRM_FLAGS_EXECFD)
570 nitems++;
547 571
548 csp = sp; 572 csp = sp;
549 sp -= nitems * 2 * sizeof(unsigned long); 573 sp -= nitems * 2 * sizeof(unsigned long);
@@ -575,6 +599,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
575 (elf_addr_t) (unsigned long) u_platform); 599 (elf_addr_t) (unsigned long) u_platform);
576 } 600 }
577 601
602 if (k_base_platform) {
603 nr = 0;
604 csp -= 2 * sizeof(unsigned long);
605 NEW_AUX_ENT(AT_BASE_PLATFORM,
606 (elf_addr_t) (unsigned long) u_base_platform);
607 }
608
609 if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
610 nr = 0;
611 csp -= 2 * sizeof(unsigned long);
612 NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
613 }
614
578 nr = 0; 615 nr = 0;
579 csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); 616 csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
580 NEW_AUX_ENT(AT_HWCAP, hwcap); 617 NEW_AUX_ENT(AT_HWCAP, hwcap);
@@ -590,6 +627,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
590 NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); 627 NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid);
591 NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); 628 NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid);
592 NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); 629 NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid);
630 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
631 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
593 632
594#ifdef ARCH_DLINFO 633#ifdef ARCH_DLINFO
595 nr = 0; 634 nr = 0;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f9c88d0c8ced..32fb00b52cd0 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
43 return -ENOEXEC; 43 return -ENOEXEC;
44 } 44 }
45 45
46 bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ 46 bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
47 allow_write_access(bprm->file); 47 allow_write_access(bprm->file);
48 fput(bprm->file); 48 fput(bprm->file);
49 bprm->file = NULL; 49 bprm->file = NULL;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index dfc0197905ca..ccb781a6a804 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -229,13 +229,13 @@ static int decompress_exec(
229 ret = 10; 229 ret = 10;
230 if (buf[3] & EXTRA_FIELD) { 230 if (buf[3] & EXTRA_FIELD) {
231 ret += 2 + buf[10] + (buf[11] << 8); 231 ret += 2 + buf[10] + (buf[11] << 8);
232 if (unlikely(LBUFSIZE == ret)) { 232 if (unlikely(LBUFSIZE <= ret)) {
233 DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); 233 DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
234 goto out_free_buf; 234 goto out_free_buf;
235 } 235 }
236 } 236 }
237 if (buf[3] & ORIG_NAME) { 237 if (buf[3] & ORIG_NAME) {
238 for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) 238 while (ret < LBUFSIZE && buf[ret++] != 0)
239 ; 239 ;
240 if (unlikely(LBUFSIZE == ret)) { 240 if (unlikely(LBUFSIZE == ret)) {
241 DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); 241 DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
@@ -243,7 +243,7 @@ static int decompress_exec(
243 } 243 }
244 } 244 }
245 if (buf[3] & COMMENT) { 245 if (buf[3] & COMMENT) {
246 for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) 246 while (ret < LBUFSIZE && buf[ret++] != 0)
247 ; 247 ;
248 if (unlikely(LBUFSIZE == ret)) { 248 if (unlikely(LBUFSIZE == ret)) {
249 DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); 249 DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 8d7e88e02e0f..f2744ab4e5b3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -117,7 +117,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
117 goto _ret; 117 goto _ret;
118 118
119 retval = -ENOEXEC; 119 retval = -ENOEXEC;
120 if (bprm->misc_bang) 120 if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
121 goto _ret; 121 goto _ret;
122 122
123 /* to keep locking time low, we copy the interpreter string */ 123 /* to keep locking time low, we copy the interpreter string */
@@ -197,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
197 if (retval < 0) 197 if (retval < 0)
198 goto _error; 198 goto _error;
199 199
200 bprm->misc_bang = 1; 200 bprm->recursion_depth++;
201 201
202 retval = search_binary_handler (bprm, regs); 202 retval = search_binary_handler (bprm, regs);
203 if (retval < 0) 203 if (retval < 0)
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 9e3963f7ebf1..08343505e184 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
22 char interp[BINPRM_BUF_SIZE]; 22 char interp[BINPRM_BUF_SIZE];
23 int retval; 23 int retval;
24 24
25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang)) 25 if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
26 (bprm->recursion_depth > BINPRM_MAX_RECURSION))
26 return -ENOEXEC; 27 return -ENOEXEC;
27 /* 28 /*
28 * This section does the #! interpretation. 29 * This section does the #! interpretation.
29 * Sorta complicated, but hopefully it will work. -TYT 30 * Sorta complicated, but hopefully it will work. -TYT
30 */ 31 */
31 32
32 bprm->sh_bang = 1; 33 bprm->recursion_depth++;
33 allow_write_access(bprm->file); 34 allow_write_access(bprm->file);
34 fput(bprm->file); 35 fput(bprm->file);
35 bprm->file = NULL; 36 bprm->file = NULL;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 68be580ba289..74e587a52796 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -306,3 +306,5 @@ static void __exit exit_som_binfmt(void)
306 306
307core_initcall(init_som_binfmt); 307core_initcall(init_som_binfmt);
308module_exit(exit_som_binfmt); 308module_exit(exit_som_binfmt);
309
310MODULE_LICENSE("GPL");
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
107 BUG_ON(bip == NULL); 107 BUG_ON(bip == NULL);
108 108
109 /* A cloned bio doesn't own the integrity metadata */ 109 /* A cloned bio doesn't own the integrity metadata */
110 if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) 110 if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
111 && bip->bip_buf != NULL)
111 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
112 113
113 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
150} 151}
151EXPORT_SYMBOL(bio_integrity_add_page); 152EXPORT_SYMBOL(bio_integrity_add_page);
152 153
154static int bdev_integrity_enabled(struct block_device *bdev, int rw)
155{
156 struct blk_integrity *bi = bdev_get_integrity(bdev);
157
158 if (bi == NULL)
159 return 0;
160
161 if (rw == READ && bi->verify_fn != NULL &&
162 (bi->flags & INTEGRITY_FLAG_READ))
163 return 1;
164
165 if (rw == WRITE && bi->generate_fn != NULL &&
166 (bi->flags & INTEGRITY_FLAG_WRITE))
167 return 1;
168
169 return 0;
170}
171
153/** 172/**
154 * bio_integrity_enabled - Check whether integrity can be passed 173 * bio_integrity_enabled - Check whether integrity can be passed
155 * @bio: bio to check 174 * @bio: bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
313 } 332 }
314} 333}
315 334
335static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
336{
337 if (bi)
338 return bi->tuple_size;
339
340 return 0;
341}
342
316/** 343/**
317 * bio_integrity_prep - Prepare bio for integrity I/O 344 * bio_integrity_prep - Prepare bio for integrity I/O
318 * @bio: bio to prepare 345 * @bio: bio to prepare
diff --git a/fs/bio.c b/fs/bio.c
index 8000e2fa16cb..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
30 30
31static struct kmem_cache *bio_slab __read_mostly; 31static struct kmem_cache *bio_slab __read_mostly;
32 32
33mempool_t *bio_split_pool __read_mostly; 33static mempool_t *bio_split_pool __read_mostly;
34 34
35/* 35/*
36 * if you change this list, also change bvec_alloc or things will 36 * if you change this list, also change bvec_alloc or things will
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
60 struct bio_vec *bvl; 60 struct bio_vec *bvl;
61 61
62 /* 62 /*
63 * see comment near bvec_array define! 63 * If 'bs' is given, lookup the pool and do the mempool alloc.
64 * If not, this is a bio_kmalloc() allocation and just do a
65 * kzalloc() for the exact number of vecs right away.
64 */ 66 */
65 switch (nr) { 67 if (bs) {
66 case 1 : *idx = 0; break; 68 /*
67 case 2 ... 4: *idx = 1; break; 69 * see comment near bvec_array define!
68 case 5 ... 16: *idx = 2; break; 70 */
69 case 17 ... 64: *idx = 3; break; 71 switch (nr) {
70 case 65 ... 128: *idx = 4; break; 72 case 1:
71 case 129 ... BIO_MAX_PAGES: *idx = 5; break; 73 *idx = 0;
74 break;
75 case 2 ... 4:
76 *idx = 1;
77 break;
78 case 5 ... 16:
79 *idx = 2;
80 break;
81 case 17 ... 64:
82 *idx = 3;
83 break;
84 case 65 ... 128:
85 *idx = 4;
86 break;
87 case 129 ... BIO_MAX_PAGES:
88 *idx = 5;
89 break;
72 default: 90 default:
73 return NULL; 91 return NULL;
74 } 92 }
75 /*
76 * idx now points to the pool we want to allocate from
77 */
78 93
79 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 94 /*
80 if (bvl) 95 * idx now points to the pool we want to allocate from
81 memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 96 */
97 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
98 if (bvl)
99 memset(bvl, 0,
100 bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
101 } else
102 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
82 103
83 return bvl; 104 return bvl;
84} 105}
@@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
107 bio_free(bio, fs_bio_set); 128 bio_free(bio, fs_bio_set);
108} 129}
109 130
131static void bio_kmalloc_destructor(struct bio *bio)
132{
133 kfree(bio->bi_io_vec);
134 kfree(bio);
135}
136
110void bio_init(struct bio *bio) 137void bio_init(struct bio *bio)
111{ 138{
112 memset(bio, 0, sizeof(*bio)); 139 memset(bio, 0, sizeof(*bio));
113 bio->bi_flags = 1 << BIO_UPTODATE; 140 bio->bi_flags = 1 << BIO_UPTODATE;
141 bio->bi_comp_cpu = -1;
114 atomic_set(&bio->bi_cnt, 1); 142 atomic_set(&bio->bi_cnt, 1);
115} 143}
116 144
@@ -118,19 +146,25 @@ void bio_init(struct bio *bio)
118 * bio_alloc_bioset - allocate a bio for I/O 146 * bio_alloc_bioset - allocate a bio for I/O
119 * @gfp_mask: the GFP_ mask given to the slab allocator 147 * @gfp_mask: the GFP_ mask given to the slab allocator
120 * @nr_iovecs: number of iovecs to pre-allocate 148 * @nr_iovecs: number of iovecs to pre-allocate
121 * @bs: the bio_set to allocate from 149 * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
122 * 150 *
123 * Description: 151 * Description:
124 * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. 152 * bio_alloc_bioset will first try its own mempool to satisfy the allocation.
125 * If %__GFP_WAIT is set then we will block on the internal pool waiting 153 * If %__GFP_WAIT is set then we will block on the internal pool waiting
126 * for a &struct bio to become free. 154 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
155 * fall back to just using @kmalloc to allocate the required memory.
127 * 156 *
128 * allocate bio and iovecs from the memory pools specified by the 157 * allocate bio and iovecs from the memory pools specified by the
129 * bio_set structure. 158 * bio_set structure, or @kmalloc if none given.
130 **/ 159 **/
131struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 160struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
132{ 161{
133 struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); 162 struct bio *bio;
163
164 if (bs)
165 bio = mempool_alloc(bs->bio_pool, gfp_mask);
166 else
167 bio = kmalloc(sizeof(*bio), gfp_mask);
134 168
135 if (likely(bio)) { 169 if (likely(bio)) {
136 struct bio_vec *bvl = NULL; 170 struct bio_vec *bvl = NULL;
@@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
141 175
142 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 176 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
143 if (unlikely(!bvl)) { 177 if (unlikely(!bvl)) {
144 mempool_free(bio, bs->bio_pool); 178 if (bs)
179 mempool_free(bio, bs->bio_pool);
180 else
181 kfree(bio);
145 bio = NULL; 182 bio = NULL;
146 goto out; 183 goto out;
147 } 184 }
@@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
164 return bio; 201 return bio;
165} 202}
166 203
204/*
205 * Like bio_alloc(), but doesn't use a mempool backing. This means that
206 * it CAN fail, but while bio_alloc() can only be used for allocations
207 * that have a short (finite) life span, bio_kmalloc() should be used
208 * for more permanent bio allocations (like allocating some bio's for
209 * initalization or setup purposes).
210 */
211struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
212{
213 struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
214
215 if (bio)
216 bio->bi_destructor = bio_kmalloc_destructor;
217
218 return bio;
219}
220
167void zero_fill_bio(struct bio *bio) 221void zero_fill_bio(struct bio *bio)
168{ 222{
169 unsigned long flags; 223 unsigned long flags;
@@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
208 return bio->bi_phys_segments; 262 return bio->bi_phys_segments;
209} 263}
210 264
211inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
212{
213 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
214 blk_recount_segments(q, bio);
215
216 return bio->bi_hw_segments;
217}
218
219/** 265/**
220 * __bio_clone - clone a bio 266 * __bio_clone - clone a bio
221 * @bio: destination bio 267 * @bio: destination bio
@@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
350 */ 396 */
351 397
352 while (bio->bi_phys_segments >= q->max_phys_segments 398 while (bio->bi_phys_segments >= q->max_phys_segments
353 || bio->bi_hw_segments >= q->max_hw_segments 399 || bio->bi_phys_segments >= q->max_hw_segments) {
354 || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
355 400
356 if (retried_segments) 401 if (retried_segments)
357 return 0; 402 return 0;
@@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
395 } 440 }
396 441
397 /* If we may be able to merge these biovecs, force a recount */ 442 /* If we may be able to merge these biovecs, force a recount */
398 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || 443 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
399 BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
400 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 444 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
401 445
402 bio->bi_vcnt++; 446 bio->bi_vcnt++;
403 bio->bi_phys_segments++; 447 bio->bi_phys_segments++;
404 bio->bi_hw_segments++;
405 done: 448 done:
406 bio->bi_size += len; 449 bio->bi_size += len;
407 return len; 450 return len;
@@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
449 492
450struct bio_map_data { 493struct bio_map_data {
451 struct bio_vec *iovecs; 494 struct bio_vec *iovecs;
452 int nr_sgvecs;
453 struct sg_iovec *sgvecs; 495 struct sg_iovec *sgvecs;
496 int nr_sgvecs;
497 int is_our_pages;
454}; 498};
455 499
456static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 500static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
457 struct sg_iovec *iov, int iov_count) 501 struct sg_iovec *iov, int iov_count,
502 int is_our_pages)
458{ 503{
459 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); 504 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
460 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 505 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
461 bmd->nr_sgvecs = iov_count; 506 bmd->nr_sgvecs = iov_count;
507 bmd->is_our_pages = is_our_pages;
462 bio->bi_private = bmd; 508 bio->bi_private = bmd;
463} 509}
464 510
@@ -469,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)
469 kfree(bmd); 515 kfree(bmd);
470} 516}
471 517
472static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) 518static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
519 gfp_t gfp_mask)
473{ 520{
474 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); 521 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
475 522
476 if (!bmd) 523 if (!bmd)
477 return NULL; 524 return NULL;
478 525
479 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); 526 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
480 if (!bmd->iovecs) { 527 if (!bmd->iovecs) {
481 kfree(bmd); 528 kfree(bmd);
482 return NULL; 529 return NULL;
483 } 530 }
484 531
485 bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); 532 bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
486 if (bmd->sgvecs) 533 if (bmd->sgvecs)
487 return bmd; 534 return bmd;
488 535
@@ -491,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
491 return NULL; 538 return NULL;
492} 539}
493 540
494static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, 541static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
495 int uncopy) 542 struct sg_iovec *iov, int iov_count, int uncopy,
543 int do_free_page)
496{ 544{
497 int ret = 0, i; 545 int ret = 0, i;
498 struct bio_vec *bvec; 546 struct bio_vec *bvec;
@@ -502,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
502 550
503 __bio_for_each_segment(bvec, bio, i, 0) { 551 __bio_for_each_segment(bvec, bio, i, 0) {
504 char *bv_addr = page_address(bvec->bv_page); 552 char *bv_addr = page_address(bvec->bv_page);
505 unsigned int bv_len = bvec->bv_len; 553 unsigned int bv_len = iovecs[i].bv_len;
506 554
507 while (bv_len && iov_idx < iov_count) { 555 while (bv_len && iov_idx < iov_count) {
508 unsigned int bytes; 556 unsigned int bytes;
@@ -535,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
535 } 583 }
536 } 584 }
537 585
538 if (uncopy) 586 if (do_free_page)
539 __free_page(bvec->bv_page); 587 __free_page(bvec->bv_page);
540 } 588 }
541 589
@@ -552,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
552int bio_uncopy_user(struct bio *bio) 600int bio_uncopy_user(struct bio *bio)
553{ 601{
554 struct bio_map_data *bmd = bio->bi_private; 602 struct bio_map_data *bmd = bio->bi_private;
555 int ret; 603 int ret = 0;
556
557 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
558 604
605 if (!bio_flagged(bio, BIO_NULL_MAPPED))
606 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
607 bmd->nr_sgvecs, 1, bmd->is_our_pages);
559 bio_free_map_data(bmd); 608 bio_free_map_data(bmd);
560 bio_put(bio); 609 bio_put(bio);
561 return ret; 610 return ret;
@@ -564,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
564/** 613/**
565 * bio_copy_user_iov - copy user data to bio 614 * bio_copy_user_iov - copy user data to bio
566 * @q: destination block queue 615 * @q: destination block queue
616 * @map_data: pointer to the rq_map_data holding pages (if necessary)
567 * @iov: the iovec. 617 * @iov: the iovec.
568 * @iov_count: number of elements in the iovec 618 * @iov_count: number of elements in the iovec
569 * @write_to_vm: bool indicating writing to pages or not 619 * @write_to_vm: bool indicating writing to pages or not
620 * @gfp_mask: memory allocation flags
570 * 621 *
571 * Prepares and returns a bio for indirect user io, bouncing data 622 * Prepares and returns a bio for indirect user io, bouncing data
572 * to/from kernel pages as necessary. Must be paired with 623 * to/from kernel pages as necessary. Must be paired with
573 * call bio_uncopy_user() on io completion. 624 * call bio_uncopy_user() on io completion.
574 */ 625 */
575struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, 626struct bio *bio_copy_user_iov(struct request_queue *q,
576 int iov_count, int write_to_vm) 627 struct rq_map_data *map_data,
628 struct sg_iovec *iov, int iov_count,
629 int write_to_vm, gfp_t gfp_mask)
577{ 630{
578 struct bio_map_data *bmd; 631 struct bio_map_data *bmd;
579 struct bio_vec *bvec; 632 struct bio_vec *bvec;
@@ -596,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
596 len += iov[i].iov_len; 649 len += iov[i].iov_len;
597 } 650 }
598 651
599 bmd = bio_alloc_map_data(nr_pages, iov_count); 652 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
600 if (!bmd) 653 if (!bmd)
601 return ERR_PTR(-ENOMEM); 654 return ERR_PTR(-ENOMEM);
602 655
603 ret = -ENOMEM; 656 ret = -ENOMEM;
604 bio = bio_alloc(GFP_KERNEL, nr_pages); 657 bio = bio_alloc(gfp_mask, nr_pages);
605 if (!bio) 658 if (!bio)
606 goto out_bmd; 659 goto out_bmd;
607 660
608 bio->bi_rw |= (!write_to_vm << BIO_RW); 661 bio->bi_rw |= (!write_to_vm << BIO_RW);
609 662
610 ret = 0; 663 ret = 0;
664 i = 0;
611 while (len) { 665 while (len) {
612 unsigned int bytes = PAGE_SIZE; 666 unsigned int bytes;
667
668 if (map_data)
669 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
670 else
671 bytes = PAGE_SIZE;
613 672
614 if (bytes > len) 673 if (bytes > len)
615 bytes = len; 674 bytes = len;
616 675
617 page = alloc_page(q->bounce_gfp | GFP_KERNEL); 676 if (map_data) {
677 if (i == map_data->nr_entries) {
678 ret = -ENOMEM;
679 break;
680 }
681 page = map_data->pages[i++];
682 } else
683 page = alloc_page(q->bounce_gfp | gfp_mask);
618 if (!page) { 684 if (!page) {
619 ret = -ENOMEM; 685 ret = -ENOMEM;
620 break; 686 break;
@@ -633,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
633 * success 699 * success
634 */ 700 */
635 if (!write_to_vm) { 701 if (!write_to_vm) {
636 ret = __bio_copy_iov(bio, iov, iov_count, 0); 702 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
637 if (ret) 703 if (ret)
638 goto cleanup; 704 goto cleanup;
639 } 705 }
640 706
641 bio_set_map_data(bmd, bio, iov, iov_count); 707 bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
642 return bio; 708 return bio;
643cleanup: 709cleanup:
644 bio_for_each_segment(bvec, bio, i) 710 if (!map_data)
645 __free_page(bvec->bv_page); 711 bio_for_each_segment(bvec, bio, i)
712 __free_page(bvec->bv_page);
646 713
647 bio_put(bio); 714 bio_put(bio);
648out_bmd: 715out_bmd:
@@ -653,29 +720,32 @@ out_bmd:
653/** 720/**
654 * bio_copy_user - copy user data to bio 721 * bio_copy_user - copy user data to bio
655 * @q: destination block queue 722 * @q: destination block queue
723 * @map_data: pointer to the rq_map_data holding pages (if necessary)
656 * @uaddr: start of user address 724 * @uaddr: start of user address
657 * @len: length in bytes 725 * @len: length in bytes
658 * @write_to_vm: bool indicating writing to pages or not 726 * @write_to_vm: bool indicating writing to pages or not
727 * @gfp_mask: memory allocation flags
659 * 728 *
660 * Prepares and returns a bio for indirect user io, bouncing data 729 * Prepares and returns a bio for indirect user io, bouncing data
661 * to/from kernel pages as necessary. Must be paired with 730 * to/from kernel pages as necessary. Must be paired with
662 * call bio_uncopy_user() on io completion. 731 * call bio_uncopy_user() on io completion.
663 */ 732 */
664struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, 733struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
665 unsigned int len, int write_to_vm) 734 unsigned long uaddr, unsigned int len,
735 int write_to_vm, gfp_t gfp_mask)
666{ 736{
667 struct sg_iovec iov; 737 struct sg_iovec iov;
668 738
669 iov.iov_base = (void __user *)uaddr; 739 iov.iov_base = (void __user *)uaddr;
670 iov.iov_len = len; 740 iov.iov_len = len;
671 741
672 return bio_copy_user_iov(q, &iov, 1, write_to_vm); 742 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
673} 743}
674 744
675static struct bio *__bio_map_user_iov(struct request_queue *q, 745static struct bio *__bio_map_user_iov(struct request_queue *q,
676 struct block_device *bdev, 746 struct block_device *bdev,
677 struct sg_iovec *iov, int iov_count, 747 struct sg_iovec *iov, int iov_count,
678 int write_to_vm) 748 int write_to_vm, gfp_t gfp_mask)
679{ 749{
680 int i, j; 750 int i, j;
681 int nr_pages = 0; 751 int nr_pages = 0;
@@ -701,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
701 if (!nr_pages) 771 if (!nr_pages)
702 return ERR_PTR(-EINVAL); 772 return ERR_PTR(-EINVAL);
703 773
704 bio = bio_alloc(GFP_KERNEL, nr_pages); 774 bio = bio_alloc(gfp_mask, nr_pages);
705 if (!bio) 775 if (!bio)
706 return ERR_PTR(-ENOMEM); 776 return ERR_PTR(-ENOMEM);
707 777
708 ret = -ENOMEM; 778 ret = -ENOMEM;
709 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 779 pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
710 if (!pages) 780 if (!pages)
711 goto out; 781 goto out;
712 782
@@ -785,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
785 * @uaddr: start of user address 855 * @uaddr: start of user address
786 * @len: length in bytes 856 * @len: length in bytes
787 * @write_to_vm: bool indicating writing to pages or not 857 * @write_to_vm: bool indicating writing to pages or not
858 * @gfp_mask: memory allocation flags
788 * 859 *
789 * Map the user space address into a bio suitable for io to a block 860 * Map the user space address into a bio suitable for io to a block
790 * device. Returns an error pointer in case of error. 861 * device. Returns an error pointer in case of error.
791 */ 862 */
792struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, 863struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
793 unsigned long uaddr, unsigned int len, int write_to_vm) 864 unsigned long uaddr, unsigned int len, int write_to_vm,
865 gfp_t gfp_mask)
794{ 866{
795 struct sg_iovec iov; 867 struct sg_iovec iov;
796 868
797 iov.iov_base = (void __user *)uaddr; 869 iov.iov_base = (void __user *)uaddr;
798 iov.iov_len = len; 870 iov.iov_len = len;
799 871
800 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm); 872 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
801} 873}
802 874
803/** 875/**
@@ -807,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
807 * @iov: the iovec. 879 * @iov: the iovec.
808 * @iov_count: number of elements in the iovec 880 * @iov_count: number of elements in the iovec
809 * @write_to_vm: bool indicating writing to pages or not 881 * @write_to_vm: bool indicating writing to pages or not
882 * @gfp_mask: memory allocation flags
810 * 883 *
811 * Map the user space address into a bio suitable for io to a block 884 * Map the user space address into a bio suitable for io to a block
812 * device. Returns an error pointer in case of error. 885 * device. Returns an error pointer in case of error.
813 */ 886 */
814struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 887struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
815 struct sg_iovec *iov, int iov_count, 888 struct sg_iovec *iov, int iov_count,
816 int write_to_vm) 889 int write_to_vm, gfp_t gfp_mask)
817{ 890{
818 struct bio *bio; 891 struct bio *bio;
819 892
820 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); 893 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
821 894 gfp_mask);
822 if (IS_ERR(bio)) 895 if (IS_ERR(bio))
823 return bio; 896 return bio;
824 897
@@ -942,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
942{ 1015{
943 struct bio_vec *bvec; 1016 struct bio_vec *bvec;
944 const int read = bio_data_dir(bio) == READ; 1017 const int read = bio_data_dir(bio) == READ;
945 char *p = bio->bi_private; 1018 struct bio_map_data *bmd = bio->bi_private;
946 int i; 1019 int i;
1020 char *p = bmd->sgvecs[0].iov_base;
947 1021
948 __bio_for_each_segment(bvec, bio, i, 0) { 1022 __bio_for_each_segment(bvec, bio, i, 0) {
949 char *addr = page_address(bvec->bv_page); 1023 char *addr = page_address(bvec->bv_page);
1024 int len = bmd->iovecs[i].bv_len;
950 1025
951 if (read && !err) 1026 if (read && !err)
952 memcpy(p, addr, bvec->bv_len); 1027 memcpy(p, addr, len);
953 1028
954 __free_page(bvec->bv_page); 1029 __free_page(bvec->bv_page);
955 p += bvec->bv_len; 1030 p += len;
956 } 1031 }
957 1032
1033 bio_free_map_data(bmd);
958 bio_put(bio); 1034 bio_put(bio);
959} 1035}
960 1036
@@ -972,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
972struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, 1048struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
973 gfp_t gfp_mask, int reading) 1049 gfp_t gfp_mask, int reading)
974{ 1050{
975 unsigned long kaddr = (unsigned long)data;
976 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
977 unsigned long start = kaddr >> PAGE_SHIFT;
978 const int nr_pages = end - start;
979 struct bio *bio; 1051 struct bio *bio;
980 struct bio_vec *bvec; 1052 struct bio_vec *bvec;
981 int i, ret; 1053 int i;
982
983 bio = bio_alloc(gfp_mask, nr_pages);
984 if (!bio)
985 return ERR_PTR(-ENOMEM);
986
987 while (len) {
988 struct page *page;
989 unsigned int bytes = PAGE_SIZE;
990
991 if (bytes > len)
992 bytes = len;
993
994 page = alloc_page(q->bounce_gfp | gfp_mask);
995 if (!page) {
996 ret = -ENOMEM;
997 goto cleanup;
998 }
999
1000 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
1001 ret = -EINVAL;
1002 goto cleanup;
1003 }
1004 1054
1005 len -= bytes; 1055 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1006 } 1056 if (IS_ERR(bio))
1057 return bio;
1007 1058
1008 if (!reading) { 1059 if (!reading) {
1009 void *p = data; 1060 void *p = data;
@@ -1016,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1016 } 1067 }
1017 } 1068 }
1018 1069
1019 bio->bi_private = data;
1020 bio->bi_end_io = bio_copy_kern_endio; 1070 bio->bi_end_io = bio_copy_kern_endio;
1021 return bio;
1022cleanup:
1023 bio_for_each_segment(bvec, bio, i)
1024 __free_page(bvec->bv_page);
1025
1026 bio_put(bio);
1027 1071
1028 return ERR_PTR(ret); 1072 return bio;
1029} 1073}
1030 1074
1031/* 1075/*
@@ -1212,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
1212 * split a bio - only worry about a bio with a single page 1256 * split a bio - only worry about a bio with a single page
1213 * in it's iovec 1257 * in it's iovec
1214 */ 1258 */
1215struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) 1259struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1216{ 1260{
1217 struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); 1261 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
1218 1262
1219 if (!bp) 1263 if (!bp)
1220 return bp; 1264 return bp;
@@ -1248,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1248 bp->bio2.bi_end_io = bio_pair_end_2; 1292 bp->bio2.bi_end_io = bio_pair_end_2;
1249 1293
1250 bp->bio1.bi_private = bi; 1294 bp->bio1.bi_private = bi;
1251 bp->bio2.bi_private = pool; 1295 bp->bio2.bi_private = bio_split_pool;
1252 1296
1253 if (bio_integrity(bi)) 1297 if (bio_integrity(bi))
1254 bio_integrity_split(bi, bp, first_sectors); 1298 bio_integrity_split(bi, bp, first_sectors);
@@ -1256,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1256 return bp; 1300 return bp;
1257} 1301}
1258 1302
1303/**
1304 * bio_sector_offset - Find hardware sector offset in bio
1305 * @bio: bio to inspect
1306 * @index: bio_vec index
1307 * @offset: offset in bv_page
1308 *
1309 * Return the number of hardware sectors between beginning of bio
1310 * and an end point indicated by a bio_vec index and an offset
1311 * within that vector's page.
1312 */
1313sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1314 unsigned int offset)
1315{
1316 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
1317 struct bio_vec *bv;
1318 sector_t sectors;
1319 int i;
1320
1321 sectors = 0;
1322
1323 if (index >= bio->bi_idx)
1324 index = bio->bi_vcnt - 1;
1325
1326 __bio_for_each_segment(bv, bio, i, 0) {
1327 if (i == index) {
1328 if (offset > bv->bv_offset)
1329 sectors += (offset - bv->bv_offset) / sector_sz;
1330 break;
1331 }
1332
1333 sectors += bv->bv_len / sector_sz;
1334 }
1335
1336 return sectors;
1337}
1338EXPORT_SYMBOL(bio_sector_offset);
1259 1339
1260/* 1340/*
1261 * create memory pools for biovec's in a bio_set. 1341 * create memory pools for biovec's in a bio_set.
@@ -1358,6 +1438,7 @@ static int __init init_bio(void)
1358subsys_initcall(init_bio); 1438subsys_initcall(init_bio);
1359 1439
1360EXPORT_SYMBOL(bio_alloc); 1440EXPORT_SYMBOL(bio_alloc);
1441EXPORT_SYMBOL(bio_kmalloc);
1361EXPORT_SYMBOL(bio_put); 1442EXPORT_SYMBOL(bio_put);
1362EXPORT_SYMBOL(bio_free); 1443EXPORT_SYMBOL(bio_free);
1363EXPORT_SYMBOL(bio_endio); 1444EXPORT_SYMBOL(bio_endio);
@@ -1365,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
1365EXPORT_SYMBOL(__bio_clone); 1446EXPORT_SYMBOL(__bio_clone);
1366EXPORT_SYMBOL(bio_clone); 1447EXPORT_SYMBOL(bio_clone);
1367EXPORT_SYMBOL(bio_phys_segments); 1448EXPORT_SYMBOL(bio_phys_segments);
1368EXPORT_SYMBOL(bio_hw_segments);
1369EXPORT_SYMBOL(bio_add_page); 1449EXPORT_SYMBOL(bio_add_page);
1370EXPORT_SYMBOL(bio_add_pc_page); 1450EXPORT_SYMBOL(bio_add_pc_page);
1371EXPORT_SYMBOL(bio_get_nr_vecs); 1451EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1375,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
1375EXPORT_SYMBOL(bio_copy_kern); 1455EXPORT_SYMBOL(bio_copy_kern);
1376EXPORT_SYMBOL(bio_pair_release); 1456EXPORT_SYMBOL(bio_pair_release);
1377EXPORT_SYMBOL(bio_split); 1457EXPORT_SYMBOL(bio_split);
1378EXPORT_SYMBOL(bio_split_pool);
1379EXPORT_SYMBOL(bio_copy_user); 1458EXPORT_SYMBOL(bio_copy_user);
1380EXPORT_SYMBOL(bio_uncopy_user); 1459EXPORT_SYMBOL(bio_uncopy_user);
1381EXPORT_SYMBOL(bioset_create); 1460EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..218408eed1bb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
541 */ 541 */
542 542
543static struct kobject *bdev_get_kobj(struct block_device *bdev)
544{
545 if (bdev->bd_contains != bdev)
546 return kobject_get(&bdev->bd_part->dev.kobj);
547 else
548 return kobject_get(&bdev->bd_disk->dev.kobj);
549}
550
551static struct kobject *bdev_get_holder(struct block_device *bdev)
552{
553 if (bdev->bd_contains != bdev)
554 return kobject_get(bdev->bd_part->holder_dir);
555 else
556 return kobject_get(bdev->bd_disk->holder_dir);
557}
558
559static int add_symlink(struct kobject *from, struct kobject *to) 543static int add_symlink(struct kobject *from, struct kobject *to)
560{ 544{
561 if (!from || !to) 545 if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
604 if (!bo->hdev) 588 if (!bo->hdev)
605 goto fail_put_sdir; 589 goto fail_put_sdir;
606 590
607 bo->sdev = bdev_get_kobj(bdev); 591 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
608 if (!bo->sdev) 592 if (!bo->sdev)
609 goto fail_put_hdev; 593 goto fail_put_hdev;
610 594
611 bo->hdir = bdev_get_holder(bdev); 595 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
612 if (!bo->hdir) 596 if (!bo->hdir)
613 goto fail_put_sdev; 597 goto fail_put_sdev;
614 598
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
868 852
869EXPORT_SYMBOL(open_by_devnum); 853EXPORT_SYMBOL(open_by_devnum);
870 854
855/**
856 * flush_disk - invalidates all buffer-cache entries on a disk
857 *
858 * @bdev: struct block device to be flushed
859 *
860 * Invalidates all buffer-cache entries on a disk. It should be called
861 * when a disk has been changed -- either by a media change or online
862 * resize.
863 */
864static void flush_disk(struct block_device *bdev)
865{
866 if (__invalidate_device(bdev)) {
867 char name[BDEVNAME_SIZE] = "";
868
869 if (bdev->bd_disk)
870 disk_name(bdev->bd_disk, 0, name);
871 printk(KERN_WARNING "VFS: busy inodes on changed media or "
872 "resized disk %s\n", name);
873 }
874
875 if (!bdev->bd_disk)
876 return;
877 if (disk_partitionable(bdev->bd_disk))
878 bdev->bd_invalidated = 1;
879}
880
881/**
882 * check_disk_size_change - checks for disk size change and adjusts bdev size.
883 * @disk: struct gendisk to check
884 * @bdev: struct bdev to adjust.
885 *
886 * This routine checks to see if the bdev size does not match the disk size
887 * and adjusts it if it differs.
888 */
889void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
890{
891 loff_t disk_size, bdev_size;
892
893 disk_size = (loff_t)get_capacity(disk) << 9;
894 bdev_size = i_size_read(bdev->bd_inode);
895 if (disk_size != bdev_size) {
896 char name[BDEVNAME_SIZE];
897
898 disk_name(disk, 0, name);
899 printk(KERN_INFO
900 "%s: detected capacity change from %lld to %lld\n",
901 name, bdev_size, disk_size);
902 i_size_write(bdev->bd_inode, disk_size);
903 flush_disk(bdev);
904 }
905}
906EXPORT_SYMBOL(check_disk_size_change);
907
908/**
909 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
910 * @disk: struct gendisk to be revalidated
911 *
912 * This routine is a wrapper for lower-level driver's revalidate_disk
913 * call-backs. It is used to do common pre and post operations needed
914 * for all revalidate_disk operations.
915 */
916int revalidate_disk(struct gendisk *disk)
917{
918 struct block_device *bdev;
919 int ret = 0;
920
921 if (disk->fops->revalidate_disk)
922 ret = disk->fops->revalidate_disk(disk);
923
924 bdev = bdget_disk(disk, 0);
925 if (!bdev)
926 return ret;
927
928 mutex_lock(&bdev->bd_mutex);
929 check_disk_size_change(disk, bdev);
930 mutex_unlock(&bdev->bd_mutex);
931 bdput(bdev);
932 return ret;
933}
934EXPORT_SYMBOL(revalidate_disk);
935
871/* 936/*
872 * This routine checks whether a removable media has been changed, 937 * This routine checks whether a removable media has been changed,
873 * and invalidates all buffer-cache-entries in that case. This 938 * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
887 if (!bdops->media_changed(bdev->bd_disk)) 952 if (!bdops->media_changed(bdev->bd_disk))
888 return 0; 953 return 0;
889 954
890 if (__invalidate_device(bdev)) 955 flush_disk(bdev);
891 printk("VFS: busy inodes on changed media.\n");
892
893 if (bdops->revalidate_disk) 956 if (bdops->revalidate_disk)
894 bdops->revalidate_disk(bdev->bd_disk); 957 bdops->revalidate_disk(bdev->bd_disk);
895 if (bdev->bd_disk->minors > 1)
896 bdev->bd_invalidated = 1;
897 return 1; 958 return 1;
898} 959}
899 960
@@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
927 988
928static int do_open(struct block_device *bdev, struct file *file, int for_part) 989static int do_open(struct block_device *bdev, struct file *file, int for_part)
929{ 990{
930 struct module *owner = NULL;
931 struct gendisk *disk; 991 struct gendisk *disk;
992 struct hd_struct *part = NULL;
932 int ret; 993 int ret;
933 int part; 994 int partno;
934 int perm = 0; 995 int perm = 0;
935 996
936 if (file->f_mode & FMODE_READ) 997 if (file->f_mode & FMODE_READ)
@@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
948 1009
949 ret = -ENXIO; 1010 ret = -ENXIO;
950 file->f_mapping = bdev->bd_inode->i_mapping; 1011 file->f_mapping = bdev->bd_inode->i_mapping;
1012
951 lock_kernel(); 1013 lock_kernel();
952 disk = get_gendisk(bdev->bd_dev, &part); 1014
953 if (!disk) { 1015 disk = get_gendisk(bdev->bd_dev, &partno);
954 unlock_kernel(); 1016 if (!disk)
955 bdput(bdev); 1017 goto out_unlock_kernel;
956 return ret; 1018 part = disk_get_part(disk, partno);
957 } 1019 if (!part)
958 owner = disk->fops->owner; 1020 goto out_unlock_kernel;
959 1021
960 mutex_lock_nested(&bdev->bd_mutex, for_part); 1022 mutex_lock_nested(&bdev->bd_mutex, for_part);
961 if (!bdev->bd_openers) { 1023 if (!bdev->bd_openers) {
962 bdev->bd_disk = disk; 1024 bdev->bd_disk = disk;
1025 bdev->bd_part = part;
963 bdev->bd_contains = bdev; 1026 bdev->bd_contains = bdev;
964 if (!part) { 1027 if (!partno) {
965 struct backing_dev_info *bdi; 1028 struct backing_dev_info *bdi;
966 if (disk->fops->open) { 1029 if (disk->fops->open) {
967 ret = disk->fops->open(bdev->bd_inode, file); 1030 ret = disk->fops->open(bdev->bd_inode, file);
968 if (ret) 1031 if (ret)
969 goto out_first; 1032 goto out_clear;
970 } 1033 }
971 if (!bdev->bd_openers) { 1034 if (!bdev->bd_openers) {
972 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1035 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
978 if (bdev->bd_invalidated) 1041 if (bdev->bd_invalidated)
979 rescan_partitions(disk, bdev); 1042 rescan_partitions(disk, bdev);
980 } else { 1043 } else {
981 struct hd_struct *p;
982 struct block_device *whole; 1044 struct block_device *whole;
983 whole = bdget_disk(disk, 0); 1045 whole = bdget_disk(disk, 0);
984 ret = -ENOMEM; 1046 ret = -ENOMEM;
985 if (!whole) 1047 if (!whole)
986 goto out_first; 1048 goto out_clear;
987 BUG_ON(for_part); 1049 BUG_ON(for_part);
988 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); 1050 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
989 if (ret) 1051 if (ret)
990 goto out_first; 1052 goto out_clear;
991 bdev->bd_contains = whole; 1053 bdev->bd_contains = whole;
992 p = disk->part[part - 1];
993 bdev->bd_inode->i_data.backing_dev_info = 1054 bdev->bd_inode->i_data.backing_dev_info =
994 whole->bd_inode->i_data.backing_dev_info; 1055 whole->bd_inode->i_data.backing_dev_info;
995 if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { 1056 if (!(disk->flags & GENHD_FL_UP) ||
1057 !part || !part->nr_sects) {
996 ret = -ENXIO; 1058 ret = -ENXIO;
997 goto out_first; 1059 goto out_clear;
998 } 1060 }
999 kobject_get(&p->dev.kobj); 1061 bd_set_size(bdev, (loff_t)part->nr_sects << 9);
1000 bdev->bd_part = p;
1001 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1002 } 1062 }
1003 } else { 1063 } else {
1064 disk_put_part(part);
1004 put_disk(disk); 1065 put_disk(disk);
1005 module_put(owner); 1066 module_put(disk->fops->owner);
1067 part = NULL;
1068 disk = NULL;
1006 if (bdev->bd_contains == bdev) { 1069 if (bdev->bd_contains == bdev) {
1007 if (bdev->bd_disk->fops->open) { 1070 if (bdev->bd_disk->fops->open) {
1008 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); 1071 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
1009 if (ret) 1072 if (ret)
1010 goto out; 1073 goto out_unlock_bdev;
1011 } 1074 }
1012 if (bdev->bd_invalidated) 1075 if (bdev->bd_invalidated)
1013 rescan_partitions(bdev->bd_disk, bdev); 1076 rescan_partitions(bdev->bd_disk, bdev);
@@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1020 unlock_kernel(); 1083 unlock_kernel();
1021 return 0; 1084 return 0;
1022 1085
1023out_first: 1086 out_clear:
1024 bdev->bd_disk = NULL; 1087 bdev->bd_disk = NULL;
1088 bdev->bd_part = NULL;
1025 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1089 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1026 if (bdev != bdev->bd_contains) 1090 if (bdev != bdev->bd_contains)
1027 __blkdev_put(bdev->bd_contains, 1); 1091 __blkdev_put(bdev->bd_contains, 1);
1028 bdev->bd_contains = NULL; 1092 bdev->bd_contains = NULL;
1029 put_disk(disk); 1093 out_unlock_bdev:
1030 module_put(owner);
1031out:
1032 mutex_unlock(&bdev->bd_mutex); 1094 mutex_unlock(&bdev->bd_mutex);
1095 out_unlock_kernel:
1033 unlock_kernel(); 1096 unlock_kernel();
1034 if (ret) 1097
1035 bdput(bdev); 1098 disk_put_part(part);
1099 if (disk)
1100 module_put(disk->fops->owner);
1101 put_disk(disk);
1102 bdput(bdev);
1103
1036 return ret; 1104 return ret;
1037} 1105}
1038 1106
@@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1117 1185
1118 put_disk(disk); 1186 put_disk(disk);
1119 module_put(owner); 1187 module_put(owner);
1120 1188 disk_put_part(bdev->bd_part);
1121 if (bdev->bd_contains != bdev) { 1189 bdev->bd_part = NULL;
1122 kobject_put(&bdev->bd_part->dev.kobj);
1123 bdev->bd_part = NULL;
1124 }
1125 bdev->bd_disk = NULL; 1190 bdev->bd_disk = NULL;
1126 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1191 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1127 if (bdev != bdev->bd_contains) 1192 if (bdev != bdev->bd_contains)
@@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1197 1262
1198/** 1263/**
1199 * lookup_bdev - lookup a struct block_device by name 1264 * lookup_bdev - lookup a struct block_device by name
1200 *
1201 * @path: special file representing the block device 1265 * @path: special file representing the block device
1202 * 1266 *
1203 * Get a reference to the blockdevice at @path in the current 1267 * Get a reference to the blockdevice at @pathname in the current
1204 * namespace if possible and return it. Return ERR_PTR(error) 1268 * namespace if possible and return it. Return ERR_PTR(error)
1205 * otherwise. 1269 * otherwise.
1206 */ 1270 */
diff --git a/fs/buffer.c b/fs/buffer.c
index 38653e36e225..ac78d4c19b3b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh)
2926 BUG_ON(!buffer_mapped(bh)); 2926 BUG_ON(!buffer_mapped(bh));
2927 BUG_ON(!bh->b_end_io); 2927 BUG_ON(!bh->b_end_io);
2928 2928
2929 if (buffer_ordered(bh) && (rw == WRITE)) 2929 /*
2930 rw = WRITE_BARRIER; 2930 * Mask in barrier bit for a write (could be either a WRITE or a
2931 * WRITE_SYNC
2932 */
2933 if (buffer_ordered(bh) && (rw & WRITE))
2934 rw |= WRITE_BARRIER;
2931 2935
2932 /* 2936 /*
2933 * Only clear out a write error when rewriting, should this 2937 * Only clear out a write error when rewriting
2934 * include WRITE_SYNC as well?
2935 */ 2938 */
2936 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) 2939 if (test_set_buffer_req(bh) && (rw & WRITE))
2937 clear_buffer_write_io_error(bh); 2940 clear_buffer_write_io_error(bh);
2938 2941
2939 /* 2942 /*
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cb7cda3d780..262fa10e213d 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -22,9 +22,6 @@
22#include <linux/mutex.h> 22#include <linux/mutex.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24 24
25#ifdef CONFIG_KMOD
26#include <linux/kmod.h>
27#endif
28#include "internal.h" 25#include "internal.h"
29 26
30/* 27/*
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f5d0083e09fa..06e521a945c3 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up
4on EAGAIN from the socket too quickly on large writes). 4on EAGAIN from the socket too quickly on large writes).
5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. 5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
6Fix endian problems in acl (mode from/to cifs acl) on bigendian 6Fix endian problems in acl (mode from/to cifs acl) on bigendian
7architectures. 7architectures. Fix problems with preserving timestamps on copying open
8files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
9on parent directory when server supports Unix Extensions but not POSIX
10create. Update cifs.upcall version to handle new Kerberos sec flags
11(this requires update of cifs.upcall program from Samba). Fix memory leak
12on dns_upcall (resolving DFS referrals). Fix plain text password
13authentication (requires setting SecurityFlags to 0x30030 to enable
14lanman and plain text though). Fix writes to be at correct offset when
15file is open with O_APPEND and file is on a directio (forcedirectio) mount.
8 16
9Version 1.53 17Version 1.53
10------------ 18------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 2bd6fe556f88..bd2343d4c6a6 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and
542 hashing mechanisms (as "must use") on the other hand 542 hashing mechanisms (as "must use") on the other hand
543 does not make much sense. Default flags are 543 does not make much sense. Default flags are
544 0x07007 544 0x07007
545 (NTLM, NTLMv2 and packet signing allowed). Maximum 545 (NTLM, NTLMv2 and packet signing allowed). The maximum
546 allowable flags if you want to allow mounts to servers 546 allowable flags if you want to allow mounts to servers
547 using weaker password hashes is 0x37037 (lanman, 547 using weaker password hashes is 0x37037 (lanman,
548 plaintext, ntlm, ntlmv2, signing allowed): 548 plaintext, ntlm, ntlmv2, signing allowed). Some
549 SecurityFlags require the corresponding menuconfig
550 options to be enabled (lanman and plaintext require
551 CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
552 plaintext authentication currently requires also
553 enabling lanman authentication in the security flags
554 because the cifs module only supports sending
555 plaintext passwords using the older lanman dialect
556 form of the session setup SMB. (e.g. for authentication
557 using plain text passwords, set the SecurityFlags
558 to 0x30030):
549 559
550 may use packet signing 0x00001 560 may use packet signing 0x00001
551 must use packet signing 0x01001 561 must use packet signing 0x01001
@@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in
642that they represent all for that share, not just those for which the server 652that they represent all for that share, not just those for which the server
643returned success. 653returned success.
644 654
645Also note that "cat /proc/fs/cifs/DebugData" will display information about 655Also note that "cat /proc/fs/cifs/DebugData" will display information about
646the active sessions and the shares that are mounted. 656the active sessions and the shares that are mounted.
647Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is 657
648on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and 658Enabling Kerberos (extended security) works but requires version 1.2 or later
649LANMAN support do not require this helper. 659of the helper program cifs.upcall to be present and to be configured in the
660/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
661project (http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
662require this helper. Note that NTLMv2 security (which does not require the
663cifs.upcall helper program), instead of using Kerberos, is sufficient for
664some use cases.
665
666Enabling DFS support (used to access shares transparently in an MS-DFS
667global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
668addition, DFS support for target shares which are specified as UNC
669names which begin with host names (rather than IP addresses) requires
670a user space helper (such as cifs.upcall) to be present in order to
671translate host names to ip address, and the user space helper must also
672be configured in the file /etc/request-key.conf.
673
674To use cifs Kerberos and DFS support, the Linux keyutils package should be
675installed and something like the following lines should be added to the
676/etc/request-key.conf file:
677
678create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
679create dns_resolver * * /usr/local/sbin/cifs.upcall %k
680
681
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 5fabd2caf93c..1b09f1670061 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
476 unsigned int cls, con, tag, oidlen, rc; 476 unsigned int cls, con, tag, oidlen, rc;
477 bool use_ntlmssp = false; 477 bool use_ntlmssp = false;
478 bool use_kerberos = false; 478 bool use_kerberos = false;
479 bool use_mskerberos = false;
479 480
480 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ 481 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
481 482
@@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
574 *(oid + 1), *(oid + 2), *(oid + 3))); 575 *(oid + 1), *(oid + 2), *(oid + 3)));
575 576
576 if (compare_oid(oid, oidlen, MSKRB5_OID, 577 if (compare_oid(oid, oidlen, MSKRB5_OID,
577 MSKRB5_OID_LEN)) 578 MSKRB5_OID_LEN) &&
578 use_kerberos = true; 579 !use_kerberos)
580 use_mskerberos = true;
579 else if (compare_oid(oid, oidlen, KRB5_OID, 581 else if (compare_oid(oid, oidlen, KRB5_OID,
580 KRB5_OID_LEN)) 582 KRB5_OID_LEN) &&
583 !use_mskerberos)
581 use_kerberos = true; 584 use_kerberos = true;
582 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 585 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
583 NTLMSSP_OID_LEN)) 586 NTLMSSP_OID_LEN))
@@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
630 633
631 if (use_kerberos) 634 if (use_kerberos)
632 *secType = Kerberos; 635 *secType = Kerberos;
636 else if (use_mskerberos)
637 *secType = MSKerberos;
633 else if (use_ntlmssp) 638 else if (use_ntlmssp)
634 *secType = NTLMSSP; 639 *secType = NTLMSSP;
635 640
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2434ab0e8791..fcee9298b620 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
66 .describe = user_describe, 66 .describe = user_describe,
67}; 67};
68 68
69#define MAX_VER_STR_LEN 8 /* length of longest version string e.g. 69/* length of longest version string e.g. strlen("ver=0xFF") */
70 strlen("ver=0xFF") */ 70#define MAX_VER_STR_LEN 8
71#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg 71
72 in future could have strlen(";sec=ntlmsspi") */ 72/* length of longest security mechanism name, eg in future could have
73#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ 73 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13
75
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
77#define MAX_IPV6_ADDR_LEN 42
78
79/* strlen of "host=" */
80#define HOST_KEY_LEN 5
81
82/* strlen of ";ip4=" or ";ip6=" */
83#define IP_KEY_LEN 5
84
85/* strlen of ";uid=0x" */
86#define UID_KEY_LEN 7
87
88/* strlen of ";user=" */
89#define USER_KEY_LEN 6
90
74/* get a key struct with a SPNEGO security blob, suitable for session setup */ 91/* get a key struct with a SPNEGO security blob, suitable for session setup */
75struct key * 92struct key *
76cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 93cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
84 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress 101 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress
85 host=hostname sec=mechanism uid=0xFF user=username */ 102 host=hostname sec=mechanism uid=0xFF user=username */
86 desc_len = MAX_VER_STR_LEN + 103 desc_len = MAX_VER_STR_LEN +
87 6 /* len of "host=" */ + strlen(hostname) + 104 HOST_KEY_LEN + strlen(hostname) +
88 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + 105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
89 MAX_MECH_STR_LEN + 106 MAX_MECH_STR_LEN +
90 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + 107 UID_KEY_LEN + (sizeof(uid_t) * 2) +
91 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; 108 USER_KEY_LEN + strlen(sesInfo->userName) + 1;
92 109
93 spnego_key = ERR_PTR(-ENOMEM); 110 spnego_key = ERR_PTR(-ENOMEM);
94 description = kzalloc(desc_len, GFP_KERNEL); 111 description = kzalloc(desc_len, GFP_KERNEL);
@@ -114,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
114 131
115 dp = description + strlen(description); 132 dp = description + strlen(description);
116 133
117 /* for now, only sec=krb5 is valid */ 134 /* for now, only sec=krb5 and sec=mskrb5 are valid */
118 if (server->secType == Kerberos) 135 if (server->secType == Kerberos)
119 sprintf(dp, ";sec=krb5"); 136 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos)
138 sprintf(dp, ";sec=mskrb5");
120 else 139 else
121 goto out; 140 goto out;
122 141
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 05a34b17a1ab..e4041ec4d712 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -23,7 +23,7 @@
23#ifndef _CIFS_SPNEGO_H 23#ifndef _CIFS_SPNEGO_H
24#define _CIFS_SPNEGO_H 24#define _CIFS_SPNEGO_H
25 25
26#define CIFS_SPNEGO_UPCALL_VERSION 1 26#define CIFS_SPNEGO_UPCALL_VERSION 2
27 27
28/* 28/*
29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. 29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 83fd40dc1ef0..bd5f13d38450 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
294 294
295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) 295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
296 if (extended_security & CIFSSEC_MAY_PLNTXT) { 296 if (extended_security & CIFSSEC_MAY_PLNTXT) {
297 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
297 memcpy(lnm_session_key, password_with_pad, 298 memcpy(lnm_session_key, password_with_pad,
298 CIFS_ENCPWD_SIZE); 299 CIFS_ENCPWD_SIZE);
299 return; 300 return;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 135c965c4137..f7b4a5cd837b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 41 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 42extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
43 struct nameidata *); 43 struct nameidata *);
44extern int cifs_unlink(struct inode *, struct dentry *); 44extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); 45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); 46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
47extern int cifs_mkdir(struct inode *, struct dentry *, int); 47extern int cifs_mkdir(struct inode *, struct dentry *, int);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7e1cf262effe..0d22479d99b7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -80,7 +80,8 @@ enum securityEnum {
80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
81 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 81 RawNTLMSSP, /* NTLMSSP without SPNEGO */
82 NTLMSSP, /* NTLMSSP via SPNEGO */ 82 NTLMSSP, /* NTLMSSP via SPNEGO */
83 Kerberos /* Kerberos via SPNEGO */ 83 Kerberos, /* Kerberos via SPNEGO */
84 MSKerberos, /* MS Kerberos via SPNEGO */
84}; 85};
85 86
86enum protocolEnum { 87enum protocolEnum {
@@ -308,6 +309,7 @@ struct cifs_search_info {
308 __u32 resume_key; 309 __u32 resume_key;
309 char *ntwrk_buf_start; 310 char *ntwrk_buf_start;
310 char *srch_entries_start; 311 char *srch_entries_start;
312 char *last_entry;
311 char *presume_name; 313 char *presume_name;
312 unsigned int resume_name_len; 314 unsigned int resume_name_len;
313 bool endOfSearch:1; 315 bool endOfSearch:1;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a729d083e6f4..0cff7fe986e8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
180 const FILE_BASIC_INFO *data, __u16 fid, 180 const FILE_BASIC_INFO *data, __u16 fid,
181 __u32 pid_of_opener); 181 __u32 pid_of_opener);
182extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
183 bool delete_file, __u16 fid, __u32 pid_of_opener);
182#if 0 184#if 0
183extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, 185extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
184 char *fileName, __u16 dos_attributes, 186 char *fileName, __u16 dos_attributes,
@@ -229,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
229 const struct nls_table *nls_codepage, 231 const struct nls_table *nls_codepage,
230 int remap_special_chars); 232 int remap_special_chars);
231extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 233extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
232 int netfid, char *target_name, 234 int netfid, const char *target_name,
233 const struct nls_table *nls_codepage, 235 const struct nls_table *nls_codepage,
234 int remap_special_chars); 236 int remap_special_chars);
235extern int CIFSCreateHardLink(const int xid, 237extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 994de7c90474..6f4ffe15d68d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2017,7 +2017,7 @@ renameRetry:
2017} 2017}
2018 2018
2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2020 int netfid, char *target_name, 2020 int netfid, const char *target_name,
2021 const struct nls_table *nls_codepage, int remap) 2021 const struct nls_table *nls_codepage, int remap)
2022{ 2022{
2023 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2023 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2071 remap); 2071 remap);
2072 } 2072 }
2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str); 2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; 2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
2075 byte_count += count; 2075 byte_count += count;
2076 pSMB->DataCount = cpu_to_le16(count); 2076 pSMB->DataCount = cpu_to_le16(count);
2077 pSMB->TotalDataCount = pSMB->DataCount; 2077 pSMB->TotalDataCount = pSMB->DataCount;
@@ -3614,6 +3614,8 @@ findFirstRetry:
3614 /* BB remember to free buffer if error BB */ 3614 /* BB remember to free buffer if error BB */
3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3616 if (rc == 0) { 3616 if (rc == 0) {
3617 unsigned int lnoff;
3618
3617 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3619 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3618 psrch_inf->unicode = true; 3620 psrch_inf->unicode = true;
3619 else 3621 else
@@ -3636,6 +3638,17 @@ findFirstRetry:
3636 le16_to_cpu(parms->SearchCount); 3638 le16_to_cpu(parms->SearchCount);
3637 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 3639 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
3638 psrch_inf->entries_in_buffer; 3640 psrch_inf->entries_in_buffer;
3641 lnoff = le16_to_cpu(parms->LastNameOffset);
3642 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3643 lnoff) {
3644 cERROR(1, ("ignoring corrupt resume name"));
3645 psrch_inf->last_entry = NULL;
3646 return rc;
3647 }
3648
3649 psrch_inf->last_entry = psrch_inf->srch_entries_start +
3650 lnoff;
3651
3639 *pnetfid = parms->SearchHandle; 3652 *pnetfid = parms->SearchHandle;
3640 } else { 3653 } else {
3641 cifs_buf_release(pSMB); 3654 cifs_buf_release(pSMB);
@@ -3725,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3725 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3738 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3726 3739
3727 if (rc == 0) { 3740 if (rc == 0) {
3741 unsigned int lnoff;
3742
3728 /* BB fixme add lock for file (srch_info) struct here */ 3743 /* BB fixme add lock for file (srch_info) struct here */
3729 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3744 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3730 psrch_inf->unicode = true; 3745 psrch_inf->unicode = true;
@@ -3751,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3751 le16_to_cpu(parms->SearchCount); 3766 le16_to_cpu(parms->SearchCount);
3752 psrch_inf->index_of_last_entry += 3767 psrch_inf->index_of_last_entry +=
3753 psrch_inf->entries_in_buffer; 3768 psrch_inf->entries_in_buffer;
3769 lnoff = le16_to_cpu(parms->LastNameOffset);
3770 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3771 lnoff) {
3772 cERROR(1, ("ignoring corrupt resume name"));
3773 psrch_inf->last_entry = NULL;
3774 return rc;
3775 } else
3776 psrch_inf->last_entry =
3777 psrch_inf->srch_entries_start + lnoff;
3778
3754/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3779/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
3755 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3780 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
3756 3781
@@ -4876,6 +4901,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4876 return rc; 4901 return rc;
4877} 4902}
4878 4903
4904int
4905CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4906 bool delete_file, __u16 fid, __u32 pid_of_opener)
4907{
4908 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4909 char *data_offset;
4910 int rc = 0;
4911 __u16 params, param_offset, offset, byte_count, count;
4912
4913 cFYI(1, ("Set File Disposition (via SetFileInfo)"));
4914 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4915
4916 if (rc)
4917 return rc;
4918
4919 pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
4920 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
4921
4922 params = 6;
4923 pSMB->MaxSetupCount = 0;
4924 pSMB->Reserved = 0;
4925 pSMB->Flags = 0;
4926 pSMB->Timeout = 0;
4927 pSMB->Reserved2 = 0;
4928 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4929 offset = param_offset + params;
4930
4931 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4932
4933 count = 1;
4934 pSMB->MaxParameterCount = cpu_to_le16(2);
4935 /* BB find max SMB PDU from sess */
4936 pSMB->MaxDataCount = cpu_to_le16(1000);
4937 pSMB->SetupCount = 1;
4938 pSMB->Reserved3 = 0;
4939 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
4940 byte_count = 3 /* pad */ + params + count;
4941 pSMB->DataCount = cpu_to_le16(count);
4942 pSMB->ParameterCount = cpu_to_le16(params);
4943 pSMB->TotalDataCount = pSMB->DataCount;
4944 pSMB->TotalParameterCount = pSMB->ParameterCount;
4945 pSMB->ParameterOffset = cpu_to_le16(param_offset);
4946 pSMB->DataOffset = cpu_to_le16(offset);
4947 pSMB->Fid = fid;
4948 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
4949 pSMB->Reserved4 = 0;
4950 pSMB->hdr.smb_buf_length += byte_count;
4951 pSMB->ByteCount = cpu_to_le16(byte_count);
4952 *data_offset = delete_file ? 1 : 0;
4953 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4954 if (rc)
4955 cFYI(1, ("Send error in SetFileDisposition = %d", rc));
4956
4957 return rc;
4958}
4879 4959
4880int 4960int
4881CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 4961CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0711db65afe8..4c13bcdb92a5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE]; 3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
3599 bool ntlmv2_flag = false; 3599 bool ntlmv2_flag = false;
3600 int first_time = 0; 3600 int first_time = 0;
3601 struct TCP_Server_Info *server = pSesInfo->server;
3601 3602
3602 /* what if server changes its buffer size after dropping the session? */ 3603 /* what if server changes its buffer size after dropping the session? */
3603 if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { 3604 if (server->maxBuf == 0) /* no need to send on reconnect */ {
3604 rc = CIFSSMBNegotiate(xid, pSesInfo); 3605 rc = CIFSSMBNegotiate(xid, pSesInfo);
3605 if (rc == -EAGAIN) /* retry only once on 1st time connection */ { 3606 if (rc == -EAGAIN) {
3607 /* retry only once on 1st time connection */
3606 rc = CIFSSMBNegotiate(xid, pSesInfo); 3608 rc = CIFSSMBNegotiate(xid, pSesInfo);
3607 if (rc == -EAGAIN) 3609 if (rc == -EAGAIN)
3608 rc = -EHOSTDOWN; 3610 rc = -EHOSTDOWN;
3609 } 3611 }
3610 if (rc == 0) { 3612 if (rc == 0) {
3611 spin_lock(&GlobalMid_Lock); 3613 spin_lock(&GlobalMid_Lock);
3612 if (pSesInfo->server->tcpStatus != CifsExiting) 3614 if (server->tcpStatus != CifsExiting)
3613 pSesInfo->server->tcpStatus = CifsGood; 3615 server->tcpStatus = CifsGood;
3614 else 3616 else
3615 rc = -EHOSTDOWN; 3617 rc = -EHOSTDOWN;
3616 spin_unlock(&GlobalMid_Lock); 3618 spin_unlock(&GlobalMid_Lock);
@@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3623 goto ss_err_exit; 3625 goto ss_err_exit;
3624 3626
3625 pSesInfo->flags = 0; 3627 pSesInfo->flags = 0;
3626 pSesInfo->capabilities = pSesInfo->server->capabilities; 3628 pSesInfo->capabilities = server->capabilities;
3627 if (linuxExtEnabled == 0) 3629 if (linuxExtEnabled == 0)
3628 pSesInfo->capabilities &= (~CAP_UNIX); 3630 pSesInfo->capabilities &= (~CAP_UNIX);
3629 /* pSesInfo->sequence_number = 0;*/ 3631 /* pSesInfo->sequence_number = 0;*/
3630 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 3632 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
3631 pSesInfo->server->secMode, 3633 server->secMode, server->capabilities, server->timeAdj));
3632 pSesInfo->server->capabilities, 3634
3633 pSesInfo->server->timeAdj));
3634 if (experimEnabled < 2) 3635 if (experimEnabled < 2)
3635 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 3636 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
3636 else if (extended_security 3637 else if (extended_security
3637 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3638 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3638 && (pSesInfo->server->secType == NTLMSSP)) { 3639 && (server->secType == NTLMSSP)) {
3639 rc = -EOPNOTSUPP; 3640 rc = -EOPNOTSUPP;
3640 } else if (extended_security 3641 } else if (extended_security
3641 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3642 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3642 && (pSesInfo->server->secType == RawNTLMSSP)) { 3643 && (server->secType == RawNTLMSSP)) {
3643 cFYI(1, ("NTLMSSP sesssetup")); 3644 cFYI(1, ("NTLMSSP sesssetup"));
3644 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, 3645 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
3645 nls_info); 3646 nls_info);
@@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3668 3669
3669 } else { 3670 } else {
3670 SMBNTencrypt(pSesInfo->password, 3671 SMBNTencrypt(pSesInfo->password,
3671 pSesInfo->server->cryptKey, 3672 server->cryptKey,
3672 ntlm_session_key); 3673 ntlm_session_key);
3673 3674
3674 if (first_time) 3675 if (first_time)
3675 cifs_calculate_mac_key( 3676 cifs_calculate_mac_key(
3676 &pSesInfo->server->mac_signing_key, 3677 &server->mac_signing_key,
3677 ntlm_session_key, 3678 ntlm_session_key,
3678 pSesInfo->password); 3679 pSesInfo->password);
3679 } 3680 }
@@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3686 nls_info); 3687 nls_info);
3687 } 3688 }
3688 } else { /* old style NTLM 0.12 session setup */ 3689 } else { /* old style NTLM 0.12 session setup */
3689 SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, 3690 SMBNTencrypt(pSesInfo->password, server->cryptKey,
3690 ntlm_session_key); 3691 ntlm_session_key);
3691 3692
3692 if (first_time) 3693 if (first_time)
3693 cifs_calculate_mac_key( 3694 cifs_calculate_mac_key(&server->mac_signing_key,
3694 &pSesInfo->server->mac_signing_key, 3695 ntlm_session_key,
3695 ntlm_session_key, pSesInfo->password); 3696 pSesInfo->password);
3696 3697
3697 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); 3698 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
3698 } 3699 }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index f730ef35499e..1e0c1bd8f2e4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,38 +29,13 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32static int dns_resolver_instantiate(struct key *key, const void *data,
33 size_t datalen)
34{
35 int rc = 0;
36 char *ip;
37
38 ip = kmalloc(datalen+1, GFP_KERNEL);
39 if (!ip)
40 return -ENOMEM;
41
42 memcpy(ip, data, datalen);
43 ip[datalen] = '\0';
44
45 rcu_assign_pointer(key->payload.data, ip);
46
47 return rc;
48}
49
50struct key_type key_type_dns_resolver = {
51 .name = "dns_resolver",
52 .def_datalen = sizeof(struct in_addr),
53 .describe = user_describe,
54 .instantiate = dns_resolver_instantiate,
55 .match = user_match,
56};
57
58/* Checks if supplied name is IP address 32/* Checks if supplied name is IP address
59 * returns: 33 * returns:
60 * 1 - name is IP 34 * 1 - name is IP
61 * 0 - name is not IP 35 * 0 - name is not IP
62 */ 36 */
63static int is_ip(const char *name) 37static int
38is_ip(const char *name)
64{ 39{
65 int rc; 40 int rc;
66 struct sockaddr_in sin_server; 41 struct sockaddr_in sin_server;
@@ -82,6 +57,47 @@ static int is_ip(const char *name)
82 return 0; 57 return 0;
83} 58}
84 59
60static int
61dns_resolver_instantiate(struct key *key, const void *data,
62 size_t datalen)
63{
64 int rc = 0;
65 char *ip;
66
67 ip = kmalloc(datalen + 1, GFP_KERNEL);
68 if (!ip)
69 return -ENOMEM;
70
71 memcpy(ip, data, datalen);
72 ip[datalen] = '\0';
73
74 /* make sure this looks like an address */
75 if (!is_ip((const char *) ip)) {
76 kfree(ip);
77 return -EINVAL;
78 }
79
80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip);
82
83 return rc;
84}
85
86static void
87dns_resolver_destroy(struct key *key)
88{
89 kfree(key->payload.data);
90}
91
92struct key_type key_type_dns_resolver = {
93 .name = "dns_resolver",
94 .def_datalen = sizeof(struct in_addr),
95 .describe = user_describe,
96 .instantiate = dns_resolver_instantiate,
97 .destroy = dns_resolver_destroy,
98 .match = user_match,
99};
100
85/* Resolves server name to ip address. 101/* Resolves server name to ip address.
86 * input: 102 * input:
87 * unc - server UNC 103 * unc - server UNC
@@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
133 149
134 rkey = request_key(&key_type_dns_resolver, name, ""); 150 rkey = request_key(&key_type_dns_resolver, name, "");
135 if (!IS_ERR(rkey)) { 151 if (!IS_ERR(rkey)) {
152 len = rkey->type_data.x[0];
136 data = rkey->payload.data; 153 data = rkey->payload.data;
137 } else { 154 } else {
138 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 155 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
141 158
142skip_upcall: 159skip_upcall:
143 if (data) { 160 if (data) {
144 len = strlen(data); 161 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
145 *ip_addr = kmalloc(len+1, GFP_KERNEL);
146 if (*ip_addr) { 162 if (*ip_addr) {
147 memcpy(*ip_addr, data, len); 163 memcpy(*ip_addr, data, len + 1);
148 (*ip_addr)[len] = '\0';
149 if (!IS_ERR(rkey)) 164 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 165 cFYI(1, ("%s: resolved: %s to %s", __func__,
151 name, 166 name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff14d14903a0..c4a8a0605125 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
107 107
108 /* want handles we can use to read with first 108 /* want handles we can use to read with first
109 in the list so we do not have to walk the 109 in the list so we do not have to walk the
110 list to search for one in prepare_write */ 110 list to search for one in write_begin */
111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) { 111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
112 list_add_tail(&pCifsFile->flist, 112 list_add_tail(&pCifsFile->flist,
113 &pCifsInode->openFileList); 113 &pCifsInode->openFileList);
@@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
833 return -EBADF; 833 return -EBADF;
834 open_file = (struct cifsFileInfo *) file->private_data; 834 open_file = (struct cifsFileInfo *) file->private_data;
835 835
836 rc = generic_write_checks(file, poffset, &write_size, 0);
837 if (rc)
838 return rc;
839
836 xid = GetXid(); 840 xid = GetXid();
837 841
838 if (*poffset > file->f_path.dentry->d_inode->i_size) 842 if (*poffset > file->f_path.dentry->d_inode->i_size)
@@ -911,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
911} 915}
912 916
913static ssize_t cifs_write(struct file *file, const char *write_data, 917static ssize_t cifs_write(struct file *file, const char *write_data,
914 size_t write_size, loff_t *poffset) 918 size_t write_size, loff_t *poffset)
915{ 919{
916 int rc = 0; 920 int rc = 0;
917 unsigned int bytes_written = 0; 921 unsigned int bytes_written = 0;
@@ -1061,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1061struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1065struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1062{ 1066{
1063 struct cifsFileInfo *open_file; 1067 struct cifsFileInfo *open_file;
1068 bool any_available = false;
1064 int rc; 1069 int rc;
1065 1070
1066 /* Having a null inode here (because mapping->host was set to zero by 1071 /* Having a null inode here (because mapping->host was set to zero by
@@ -1076,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1076 read_lock(&GlobalSMBSeslock); 1081 read_lock(&GlobalSMBSeslock);
1077refind_writable: 1082refind_writable:
1078 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1083 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1079 if (open_file->closePend) 1084 if (open_file->closePend ||
1085 (!any_available && open_file->pid != current->tgid))
1080 continue; 1086 continue;
1087
1081 if (open_file->pfile && 1088 if (open_file->pfile &&
1082 ((open_file->pfile->f_flags & O_RDWR) || 1089 ((open_file->pfile->f_flags & O_RDWR) ||
1083 (open_file->pfile->f_flags & O_WRONLY))) { 1090 (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1127,6 +1134,11 @@ refind_writable:
1127 of the loop here. */ 1134 of the loop here. */
1128 } 1135 }
1129 } 1136 }
1137 /* couldn't find useable FH with same pid, try any available */
1138 if (!any_available) {
1139 any_available = true;
1140 goto refind_writable;
1141 }
1130 read_unlock(&GlobalSMBSeslock); 1142 read_unlock(&GlobalSMBSeslock);
1131 return NULL; 1143 return NULL;
1132} 1144}
@@ -1443,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1443 return rc; 1455 return rc;
1444} 1456}
1445 1457
1446static int cifs_commit_write(struct file *file, struct page *page, 1458static int cifs_write_end(struct file *file, struct address_space *mapping,
1447 unsigned offset, unsigned to) 1459 loff_t pos, unsigned len, unsigned copied,
1460 struct page *page, void *fsdata)
1448{ 1461{
1449 int xid; 1462 int rc;
1450 int rc = 0; 1463 struct inode *inode = mapping->host;
1451 struct inode *inode = page->mapping->host;
1452 loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1453 char *page_data;
1454 1464
1455 xid = GetXid(); 1465 cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
1456 cFYI(1, ("commit write for page %p up to position %lld for %d", 1466 page, pos, copied));
1457 page, position, to)); 1467
1458 spin_lock(&inode->i_lock); 1468 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
1459 if (position > inode->i_size) 1469 SetPageUptodate(page);
1460 i_size_write(inode, position);
1461 1470
1462 spin_unlock(&inode->i_lock);
1463 if (!PageUptodate(page)) { 1471 if (!PageUptodate(page)) {
1464 position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; 1472 char *page_data;
1465 /* can not rely on (or let) writepage write this data */ 1473 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1466 if (to < offset) { 1474 int xid;
1467 cFYI(1, ("Illegal offsets, can not copy from %d to %d", 1475
1468 offset, to)); 1476 xid = GetXid();
1469 FreeXid(xid);
1470 return rc;
1471 }
1472 /* this is probably better than directly calling 1477 /* this is probably better than directly calling
1473 partialpage_write since in this function the file handle is 1478 partialpage_write since in this function the file handle is
1474 known which we might as well leverage */ 1479 known which we might as well leverage */
1475 /* BB check if anything else missing out of ppw 1480 /* BB check if anything else missing out of ppw
1476 such as updating last write time */ 1481 such as updating last write time */
1477 page_data = kmap(page); 1482 page_data = kmap(page);
1478 rc = cifs_write(file, page_data + offset, to-offset, 1483 rc = cifs_write(file, page_data + offset, copied, &pos);
1479 &position); 1484 /* if (rc < 0) should we set writebehind rc? */
1480 if (rc > 0)
1481 rc = 0;
1482 /* else if (rc < 0) should we set writebehind rc? */
1483 kunmap(page); 1485 kunmap(page);
1486
1487 FreeXid(xid);
1484 } else { 1488 } else {
1489 rc = copied;
1490 pos += copied;
1485 set_page_dirty(page); 1491 set_page_dirty(page);
1486 } 1492 }
1487 1493
1488 FreeXid(xid); 1494 if (rc > 0) {
1495 spin_lock(&inode->i_lock);
1496 if (pos > inode->i_size)
1497 i_size_write(inode, pos);
1498 spin_unlock(&inode->i_lock);
1499 }
1500
1501 unlock_page(page);
1502 page_cache_release(page);
1503
1489 return rc; 1504 return rc;
1490} 1505}
1491 1506
@@ -2031,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
2031 return true; 2046 return true;
2032} 2047}
2033 2048
2034static int cifs_prepare_write(struct file *file, struct page *page, 2049static int cifs_write_begin(struct file *file, struct address_space *mapping,
2035 unsigned from, unsigned to) 2050 loff_t pos, unsigned len, unsigned flags,
2051 struct page **pagep, void **fsdata)
2036{ 2052{
2037 int rc = 0; 2053 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2038 loff_t i_size; 2054 loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
2039 loff_t offset; 2055
2056 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2040 2057
2041 cFYI(1, ("prepare write for page %p from %d to %d", page, from, to)); 2058 *pagep = __grab_cache_page(mapping, index);
2042 if (PageUptodate(page)) 2059 if (!*pagep)
2060 return -ENOMEM;
2061
2062 if (PageUptodate(*pagep))
2043 return 0; 2063 return 0;
2044 2064
2045 /* If we are writing a full page it will be up to date, 2065 /* If we are writing a full page it will be up to date,
2046 no need to read from the server */ 2066 no need to read from the server */
2047 if ((to == PAGE_CACHE_SIZE) && (from == 0)) { 2067 if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
2048 SetPageUptodate(page);
2049 return 0; 2068 return 0;
2050 }
2051 2069
2052 offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 2070 if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
2053 i_size = i_size_read(page->mapping->host); 2071 int rc;
2054 2072
2055 if ((offset >= i_size) ||
2056 ((from == 0) && (offset + to) >= i_size)) {
2057 /*
2058 * We don't need to read data beyond the end of the file.
2059 * zero it, and set the page uptodate
2060 */
2061 simple_prepare_write(file, page, from, to);
2062 SetPageUptodate(page);
2063 } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
2064 /* might as well read a page, it is fast enough */ 2073 /* might as well read a page, it is fast enough */
2065 rc = cifs_readpage_worker(file, page, &offset); 2074 rc = cifs_readpage_worker(file, *pagep, &offset);
2075
2076 /* we do not need to pass errors back
2077 e.g. if we do not have read access to the file
2078 because cifs_write_end will attempt synchronous writes
2079 -- shaggy */
2066 } else { 2080 } else {
2067 /* we could try using another file handle if there is one - 2081 /* we could try using another file handle if there is one -
2068 but how would we lock it to prevent close of that handle 2082 but how would we lock it to prevent close of that handle
2069 racing with this read? In any case 2083 racing with this read? In any case
2070 this will be written out by commit_write so is fine */ 2084 this will be written out by write_end so is fine */
2071 } 2085 }
2072 2086
2073 /* we do not need to pass errors back
2074 e.g. if we do not have read access to the file
2075 because cifs_commit_write will do the right thing. -- shaggy */
2076
2077 return 0; 2087 return 0;
2078} 2088}
2079 2089
@@ -2082,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
2082 .readpages = cifs_readpages, 2092 .readpages = cifs_readpages,
2083 .writepage = cifs_writepage, 2093 .writepage = cifs_writepage,
2084 .writepages = cifs_writepages, 2094 .writepages = cifs_writepages,
2085 .prepare_write = cifs_prepare_write, 2095 .write_begin = cifs_write_begin,
2086 .commit_write = cifs_commit_write, 2096 .write_end = cifs_write_end,
2087 .set_page_dirty = __set_page_dirty_nobuffers, 2097 .set_page_dirty = __set_page_dirty_nobuffers,
2088 /* .sync_page = cifs_sync_page, */ 2098 /* .sync_page = cifs_sync_page, */
2089 /* .direct_IO = */ 2099 /* .direct_IO = */
@@ -2098,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2098 .readpage = cifs_readpage, 2108 .readpage = cifs_readpage,
2099 .writepage = cifs_writepage, 2109 .writepage = cifs_writepage,
2100 .writepages = cifs_writepages, 2110 .writepages = cifs_writepages,
2101 .prepare_write = cifs_prepare_write, 2111 .write_begin = cifs_write_begin,
2102 .commit_write = cifs_commit_write, 2112 .write_end = cifs_write_end,
2103 .set_page_dirty = __set_page_dirty_nobuffers, 2113 .set_page_dirty = __set_page_dirty_nobuffers,
2104 /* .sync_page = cifs_sync_page, */ 2114 /* .sync_page = cifs_sync_page, */
2105 /* .direct_IO = */ 2115 /* .direct_IO = */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 848286861c31..a8c833345fc9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode,
546 if ((inode->i_mode & S_IWUGO) == 0 && 546 if ((inode->i_mode & S_IWUGO) == 0 &&
547 (attr & ATTR_READONLY) == 0) 547 (attr & ATTR_READONLY) == 0)
548 inode->i_mode |= (S_IWUGO & default_mode); 548 inode->i_mode |= (S_IWUGO & default_mode);
549 inode->i_mode &= ~S_IFMT; 549
550 inode->i_mode &= ~S_IFMT;
550 } 551 }
551 /* clear write bits if ATTR_READONLY is set */ 552 /* clear write bits if ATTR_READONLY is set */
552 if (attr & ATTR_READONLY) 553 if (attr & ATTR_READONLY)
@@ -664,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
664 return inode; 665 return inode;
665} 666}
666 667
667int cifs_unlink(struct inode *inode, struct dentry *direntry) 668static int
669cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
670 char *full_path, __u32 dosattr)
671{
672 int rc;
673 int oplock = 0;
674 __u16 netfid;
675 __u32 netpid;
676 bool set_time = false;
677 struct cifsFileInfo *open_file;
678 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
679 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
680 struct cifsTconInfo *pTcon = cifs_sb->tcon;
681 FILE_BASIC_INFO info_buf;
682
683 if (attrs->ia_valid & ATTR_ATIME) {
684 set_time = true;
685 info_buf.LastAccessTime =
686 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
687 } else
688 info_buf.LastAccessTime = 0;
689
690 if (attrs->ia_valid & ATTR_MTIME) {
691 set_time = true;
692 info_buf.LastWriteTime =
693 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
694 } else
695 info_buf.LastWriteTime = 0;
696
697 /*
698 * Samba throws this field away, but Windows may actually use it.
699 * Do not set ctime unless other time stamps are changed explicitly
700 * (i.e. by utimes()) since we would then have a mix of client and
701 * server times.
702 */
703 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
704 cFYI(1, ("CIFS - CTIME changed"));
705 info_buf.ChangeTime =
706 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
707 } else
708 info_buf.ChangeTime = 0;
709
710 info_buf.CreationTime = 0; /* don't change */
711 info_buf.Attributes = cpu_to_le32(dosattr);
712
713 /*
714 * If the file is already open for write, just use that fileid
715 */
716 open_file = find_writable_file(cifsInode);
717 if (open_file) {
718 netfid = open_file->netfid;
719 netpid = open_file->pid;
720 goto set_via_filehandle;
721 }
722
723 /*
724 * NT4 apparently returns success on this call, but it doesn't
725 * really work.
726 */
727 if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
728 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
729 &info_buf, cifs_sb->local_nls,
730 cifs_sb->mnt_cifs_flags &
731 CIFS_MOUNT_MAP_SPECIAL_CHR);
732 if (rc == 0) {
733 cifsInode->cifsAttrs = dosattr;
734 goto out;
735 } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
736 goto out;
737 }
738
739 cFYI(1, ("calling SetFileInfo since SetPathInfo for "
740 "times not supported by this server"));
741 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
742 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
743 CREATE_NOT_DIR, &netfid, &oplock,
744 NULL, cifs_sb->local_nls,
745 cifs_sb->mnt_cifs_flags &
746 CIFS_MOUNT_MAP_SPECIAL_CHR);
747
748 if (rc != 0) {
749 if (rc == -EIO)
750 rc = -EINVAL;
751 goto out;
752 }
753
754 netpid = current->tgid;
755
756set_via_filehandle:
757 rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
758 if (!rc)
759 cifsInode->cifsAttrs = dosattr;
760
761 if (open_file == NULL)
762 CIFSSMBClose(xid, pTcon, netfid);
763 else
764 atomic_dec(&open_file->wrtPending);
765out:
766 return rc;
767}
768
769/*
770 * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
771 * and rename it to a random name that hopefully won't conflict with
772 * anything else.
773 */
774static int
775cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
776{
777 int oplock = 0;
778 int rc;
779 __u16 netfid;
780 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
781 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
782 struct cifsTconInfo *tcon = cifs_sb->tcon;
783 __u32 dosattr;
784 FILE_BASIC_INFO *info_buf;
785
786 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
787 DELETE|FILE_WRITE_ATTRIBUTES,
788 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
789 &netfid, &oplock, NULL, cifs_sb->local_nls,
790 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
791 if (rc != 0)
792 goto out;
793
794 /* set ATTR_HIDDEN and clear ATTR_READONLY */
795 cifsInode = CIFS_I(inode);
796 dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
797 if (dosattr == 0)
798 dosattr |= ATTR_NORMAL;
799 dosattr |= ATTR_HIDDEN;
800
801 info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
802 if (info_buf == NULL) {
803 rc = -ENOMEM;
804 goto out_close;
805 }
806 info_buf->Attributes = cpu_to_le32(dosattr);
807 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
808 kfree(info_buf);
809 if (rc != 0)
810 goto out_close;
811 cifsInode->cifsAttrs = dosattr;
812
813 /* silly-rename the file */
814 CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
815 cifs_sb->mnt_cifs_flags &
816 CIFS_MOUNT_MAP_SPECIAL_CHR);
817
818 /* set DELETE_ON_CLOSE */
819 rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
820
821 /*
822 * some Samba versions return -ENOENT when we try to set the file
823 * disposition here. Likely a Samba bug, but work around it for now
824 */
825 if (rc == -ENOENT)
826 rc = 0;
827
828out_close:
829 CIFSSMBClose(xid, tcon, netfid);
830out:
831 return rc;
832}
833
834int cifs_unlink(struct inode *dir, struct dentry *dentry)
668{ 835{
669 int rc = 0; 836 int rc = 0;
670 int xid; 837 int xid;
671 struct cifs_sb_info *cifs_sb;
672 struct cifsTconInfo *pTcon;
673 char *full_path = NULL; 838 char *full_path = NULL;
674 struct cifsInodeInfo *cifsInode; 839 struct inode *inode = dentry->d_inode;
675 FILE_BASIC_INFO *pinfo_buf; 840 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
841 struct super_block *sb = dir->i_sb;
842 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
843 struct cifsTconInfo *tcon = cifs_sb->tcon;
844 struct iattr *attrs = NULL;
845 __u32 dosattr = 0, origattr = 0;
676 846
677 cFYI(1, ("cifs_unlink, inode = 0x%p", inode)); 847 cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
678 848
679 xid = GetXid(); 849 xid = GetXid();
680 850
681 if (inode) 851 /* Unlink can be called from rename so we can not take the
682 cifs_sb = CIFS_SB(inode->i_sb); 852 * sb->s_vfs_rename_mutex here */
683 else 853 full_path = build_path_from_dentry(dentry);
684 cifs_sb = CIFS_SB(direntry->d_sb);
685 pTcon = cifs_sb->tcon;
686
687 /* Unlink can be called from rename so we can not grab the sem here
688 since we deadlock otherwise */
689/* mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
690 full_path = build_path_from_dentry(direntry);
691/* mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
692 if (full_path == NULL) { 854 if (full_path == NULL) {
693 FreeXid(xid); 855 FreeXid(xid);
694 return -ENOMEM; 856 return -ENOMEM;
695 } 857 }
696 858
697 if ((pTcon->ses->capabilities & CAP_UNIX) && 859 if ((tcon->ses->capabilities & CAP_UNIX) &&
698 (CIFS_UNIX_POSIX_PATH_OPS_CAP & 860 (CIFS_UNIX_POSIX_PATH_OPS_CAP &
699 le64_to_cpu(pTcon->fsUnixInfo.Capability))) { 861 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
700 rc = CIFSPOSIXDelFile(xid, pTcon, full_path, 862 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
701 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 863 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
702 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 864 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
703 cFYI(1, ("posix del rc %d", rc)); 865 cFYI(1, ("posix del rc %d", rc));
@@ -705,125 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
705 goto psx_del_no_retry; 867 goto psx_del_no_retry;
706 } 868 }
707 869
708 rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls, 870retry_std_delete:
871 rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
709 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 872 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
873
710psx_del_no_retry: 874psx_del_no_retry:
711 if (!rc) { 875 if (!rc) {
712 if (direntry->d_inode) 876 if (inode)
713 drop_nlink(direntry->d_inode); 877 drop_nlink(inode);
714 } else if (rc == -ENOENT) { 878 } else if (rc == -ENOENT) {
715 d_drop(direntry); 879 d_drop(dentry);
716 } else if (rc == -ETXTBSY) { 880 } else if (rc == -ETXTBSY) {
717 int oplock = 0; 881 rc = cifs_rename_pending_delete(full_path, inode, xid);
718 __u16 netfid; 882 if (rc == 0)
719 883 drop_nlink(inode);
720 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, 884 } else if (rc == -EACCES && dosattr == 0) {
721 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, 885 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
722 &netfid, &oplock, NULL, cifs_sb->local_nls, 886 if (attrs == NULL) {
723 cifs_sb->mnt_cifs_flags & 887 rc = -ENOMEM;
724 CIFS_MOUNT_MAP_SPECIAL_CHR); 888 goto out_reval;
725 if (rc == 0) {
726 CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
727 cifs_sb->local_nls,
728 cifs_sb->mnt_cifs_flags &
729 CIFS_MOUNT_MAP_SPECIAL_CHR);
730 CIFSSMBClose(xid, pTcon, netfid);
731 if (direntry->d_inode)
732 drop_nlink(direntry->d_inode);
733 } 889 }
734 } else if (rc == -EACCES) {
735 /* try only if r/o attribute set in local lookup data? */
736 pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
737 if (pinfo_buf) {
738 /* ATTRS set to normal clears r/o bit */
739 pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
740 if (!(pTcon->ses->flags & CIFS_SES_NT4))
741 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
742 pinfo_buf,
743 cifs_sb->local_nls,
744 cifs_sb->mnt_cifs_flags &
745 CIFS_MOUNT_MAP_SPECIAL_CHR);
746 else
747 rc = -EOPNOTSUPP;
748 890
749 if (rc == -EOPNOTSUPP) { 891 /* try to reset dos attributes */
750 int oplock = 0; 892 origattr = cifsInode->cifsAttrs;
751 __u16 netfid; 893 if (origattr == 0)
752 /* rc = CIFSSMBSetAttrLegacy(xid, pTcon, 894 origattr |= ATTR_NORMAL;
753 full_path, 895 dosattr = origattr & ~ATTR_READONLY;
754 (__u16)ATTR_NORMAL, 896 if (dosattr == 0)
755 cifs_sb->local_nls); 897 dosattr |= ATTR_NORMAL;
756 For some strange reason it seems that NT4 eats the 898 dosattr |= ATTR_HIDDEN;
757 old setattr call without actually setting the 899
758 attributes so on to the third attempted workaround 900 rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
759 */ 901 if (rc != 0)
760 902 goto out_reval;
761 /* BB could scan to see if we already have it open 903
762 and pass in pid of opener to function */ 904 goto retry_std_delete;
763 rc = CIFSSMBOpen(xid, pTcon, full_path,
764 FILE_OPEN, SYNCHRONIZE |
765 FILE_WRITE_ATTRIBUTES, 0,
766 &netfid, &oplock, NULL,
767 cifs_sb->local_nls,
768 cifs_sb->mnt_cifs_flags &
769 CIFS_MOUNT_MAP_SPECIAL_CHR);
770 if (rc == 0) {
771 rc = CIFSSMBSetFileInfo(xid, pTcon,
772 pinfo_buf,
773 netfid,
774 current->tgid);
775 CIFSSMBClose(xid, pTcon, netfid);
776 }
777 }
778 kfree(pinfo_buf);
779 }
780 if (rc == 0) {
781 rc = CIFSSMBDelFile(xid, pTcon, full_path,
782 cifs_sb->local_nls,
783 cifs_sb->mnt_cifs_flags &
784 CIFS_MOUNT_MAP_SPECIAL_CHR);
785 if (!rc) {
786 if (direntry->d_inode)
787 drop_nlink(direntry->d_inode);
788 } else if (rc == -ETXTBSY) {
789 int oplock = 0;
790 __u16 netfid;
791
792 rc = CIFSSMBOpen(xid, pTcon, full_path,
793 FILE_OPEN, DELETE,
794 CREATE_NOT_DIR |
795 CREATE_DELETE_ON_CLOSE,
796 &netfid, &oplock, NULL,
797 cifs_sb->local_nls,
798 cifs_sb->mnt_cifs_flags &
799 CIFS_MOUNT_MAP_SPECIAL_CHR);
800 if (rc == 0) {
801 CIFSSMBRenameOpenFile(xid, pTcon,
802 netfid, NULL,
803 cifs_sb->local_nls,
804 cifs_sb->mnt_cifs_flags &
805 CIFS_MOUNT_MAP_SPECIAL_CHR);
806 CIFSSMBClose(xid, pTcon, netfid);
807 if (direntry->d_inode)
808 drop_nlink(direntry->d_inode);
809 }
810 /* BB if rc = -ETXTBUSY goto the rename logic BB */
811 }
812 }
813 }
814 if (direntry->d_inode) {
815 cifsInode = CIFS_I(direntry->d_inode);
816 cifsInode->time = 0; /* will force revalidate to get info
817 when needed */
818 direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
819 } 905 }
906
907 /* undo the setattr if we errored out and it's needed */
908 if (rc != 0 && dosattr != 0)
909 cifs_set_file_info(inode, attrs, xid, full_path, origattr);
910
911out_reval:
820 if (inode) { 912 if (inode) {
821 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
822 cifsInode = CIFS_I(inode); 913 cifsInode = CIFS_I(inode);
823 cifsInode->time = 0; /* force revalidate of dir as well */ 914 cifsInode->time = 0; /* will force revalidate to get info
915 when needed */
916 inode->i_ctime = current_fs_time(sb);
824 } 917 }
918 dir->i_ctime = dir->i_mtime = current_fs_time(sb);
919 cifsInode = CIFS_I(dir);
920 CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
825 921
826 kfree(full_path); 922 kfree(full_path);
923 kfree(attrs);
827 FreeXid(xid); 924 FreeXid(xid);
828 return rc; 925 return rc;
829} 926}
@@ -868,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
868 965
869int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) 966int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
870{ 967{
871 int rc = 0; 968 int rc = 0, tmprc;
872 int xid; 969 int xid;
873 struct cifs_sb_info *cifs_sb; 970 struct cifs_sb_info *cifs_sb;
874 struct cifsTconInfo *pTcon; 971 struct cifsTconInfo *pTcon;
@@ -930,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
930 kfree(pInfo); 1027 kfree(pInfo);
931 goto mkdir_get_info; 1028 goto mkdir_get_info;
932 } 1029 }
1030
933 /* Is an i_ino of zero legal? */ 1031 /* Is an i_ino of zero legal? */
934 /* Are there sanity checks we can use to ensure that 1032 /* Are there sanity checks we can use to ensure that
935 the server is really filling in that field? */ 1033 the server is really filling in that field? */
@@ -1018,12 +1116,20 @@ mkdir_get_info:
1018 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1116 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1019 (mode & S_IWUGO) == 0) { 1117 (mode & S_IWUGO) == 0) {
1020 FILE_BASIC_INFO pInfo; 1118 FILE_BASIC_INFO pInfo;
1119 struct cifsInodeInfo *cifsInode;
1120 u32 dosattrs;
1121
1021 memset(&pInfo, 0, sizeof(pInfo)); 1122 memset(&pInfo, 0, sizeof(pInfo));
1022 pInfo.Attributes = cpu_to_le32(ATTR_READONLY); 1123 cifsInode = CIFS_I(newinode);
1023 CIFSSMBSetPathInfo(xid, pTcon, full_path, 1124 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
1024 &pInfo, cifs_sb->local_nls, 1125 pInfo.Attributes = cpu_to_le32(dosattrs);
1126 tmprc = CIFSSMBSetPathInfo(xid, pTcon,
1127 full_path, &pInfo,
1128 cifs_sb->local_nls,
1025 cifs_sb->mnt_cifs_flags & 1129 cifs_sb->mnt_cifs_flags &
1026 CIFS_MOUNT_MAP_SPECIAL_CHR); 1130 CIFS_MOUNT_MAP_SPECIAL_CHR);
1131 if (tmprc == 0)
1132 cifsInode->cifsAttrs = dosattrs;
1027 } 1133 }
1028 if (direntry->d_inode) { 1134 if (direntry->d_inode) {
1029 if (cifs_sb->mnt_cifs_flags & 1135 if (cifs_sb->mnt_cifs_flags &
@@ -1095,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1095 return rc; 1201 return rc;
1096} 1202}
1097 1203
1204static int
1205cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
1206 struct dentry *to_dentry, const char *toPath)
1207{
1208 struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
1209 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1210 __u16 srcfid;
1211 int oplock, rc;
1212
1213 /* try path-based rename first */
1214 rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
1215 cifs_sb->mnt_cifs_flags &
1216 CIFS_MOUNT_MAP_SPECIAL_CHR);
1217
1218 /*
1219 * don't bother with rename by filehandle unless file is busy and
1220 * source Note that cross directory moves do not work with
1221 * rename by filehandle to various Windows servers.
1222 */
1223 if (rc == 0 || rc != -ETXTBSY)
1224 return rc;
1225
1226 /* open the file to be renamed -- we need DELETE perms */
1227 rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
1228 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
1229 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1230 CIFS_MOUNT_MAP_SPECIAL_CHR);
1231
1232 if (rc == 0) {
1233 rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
1234 (const char *) to_dentry->d_name.name,
1235 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1236 CIFS_MOUNT_MAP_SPECIAL_CHR);
1237
1238 CIFSSMBClose(xid, pTcon, srcfid);
1239 }
1240
1241 return rc;
1242}
1243
1098int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, 1244int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
1099 struct inode *target_inode, struct dentry *target_direntry) 1245 struct inode *target_inode, struct dentry *target_direntry)
1100{ 1246{
1101 char *fromName; 1247 char *fromName = NULL;
1102 char *toName; 1248 char *toName = NULL;
1103 struct cifs_sb_info *cifs_sb_source; 1249 struct cifs_sb_info *cifs_sb_source;
1104 struct cifs_sb_info *cifs_sb_target; 1250 struct cifs_sb_info *cifs_sb_target;
1105 struct cifsTconInfo *pTcon; 1251 struct cifsTconInfo *pTcon;
1252 FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
1253 FILE_UNIX_BASIC_INFO *info_buf_target;
1106 int xid; 1254 int xid;
1107 int rc = 0; 1255 int rc;
1108
1109 xid = GetXid();
1110 1256
1111 cifs_sb_target = CIFS_SB(target_inode->i_sb); 1257 cifs_sb_target = CIFS_SB(target_inode->i_sb);
1112 cifs_sb_source = CIFS_SB(source_inode->i_sb); 1258 cifs_sb_source = CIFS_SB(source_inode->i_sb);
1113 pTcon = cifs_sb_source->tcon; 1259 pTcon = cifs_sb_source->tcon;
1114 1260
1261 xid = GetXid();
1262
1263 /*
1264 * BB: this might be allowed if same server, but different share.
1265 * Consider adding support for this
1266 */
1115 if (pTcon != cifs_sb_target->tcon) { 1267 if (pTcon != cifs_sb_target->tcon) {
1116 FreeXid(xid); 1268 rc = -EXDEV;
1117 return -EXDEV; /* BB actually could be allowed if same server, 1269 goto cifs_rename_exit;
1118 but different share.
1119 Might eventually add support for this */
1120 } 1270 }
1121 1271
1122 /* we already have the rename sem so we do not need to grab it again 1272 /*
1123 here to protect the path integrity */ 1273 * we already have the rename sem so we do not need to
1274 * grab it again here to protect the path integrity
1275 */
1124 fromName = build_path_from_dentry(source_direntry); 1276 fromName = build_path_from_dentry(source_direntry);
1277 if (fromName == NULL) {
1278 rc = -ENOMEM;
1279 goto cifs_rename_exit;
1280 }
1281
1125 toName = build_path_from_dentry(target_direntry); 1282 toName = build_path_from_dentry(target_direntry);
1126 if ((fromName == NULL) || (toName == NULL)) { 1283 if (toName == NULL) {
1127 rc = -ENOMEM; 1284 rc = -ENOMEM;
1128 goto cifs_rename_exit; 1285 goto cifs_rename_exit;
1129 } 1286 }
1130 1287
1131 rc = CIFSSMBRename(xid, pTcon, fromName, toName, 1288 rc = cifs_do_rename(xid, source_direntry, fromName,
1132 cifs_sb_source->local_nls, 1289 target_direntry, toName);
1133 cifs_sb_source->mnt_cifs_flags & 1290
1134 CIFS_MOUNT_MAP_SPECIAL_CHR);
1135 if (rc == -EEXIST) { 1291 if (rc == -EEXIST) {
1136 /* check if they are the same file because rename of hardlinked 1292 if (pTcon->unix_ext) {
1137 files is a noop */ 1293 /*
1138 FILE_UNIX_BASIC_INFO *info_buf_source; 1294 * Are src and dst hardlinks of same inode? We can
1139 FILE_UNIX_BASIC_INFO *info_buf_target; 1295 * only tell with unix extensions enabled
1140 1296 */
1141 info_buf_source = 1297 info_buf_source =
1142 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); 1298 kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
1143 if (info_buf_source != NULL) { 1299 GFP_KERNEL);
1300 if (info_buf_source == NULL)
1301 goto unlink_target;
1302
1144 info_buf_target = info_buf_source + 1; 1303 info_buf_target = info_buf_source + 1;
1145 if (pTcon->unix_ext) 1304 rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
1146 rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, 1305 info_buf_source,
1147 info_buf_source, 1306 cifs_sb_source->local_nls,
1148 cifs_sb_source->local_nls, 1307 cifs_sb_source->mnt_cifs_flags &
1149 cifs_sb_source->mnt_cifs_flags &
1150 CIFS_MOUNT_MAP_SPECIAL_CHR); 1308 CIFS_MOUNT_MAP_SPECIAL_CHR);
1151 /* else rc is still EEXIST so will fall through to 1309 if (rc != 0)
1152 unlink the target and retry rename */ 1310 goto unlink_target;
1153 if (rc == 0) { 1311
1154 rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName, 1312 rc = CIFSSMBUnixQPathInfo(xid, pTcon,
1155 info_buf_target, 1313 toName, info_buf_target,
1156 cifs_sb_target->local_nls, 1314 cifs_sb_target->local_nls,
1157 /* remap based on source sb */ 1315 /* remap based on source sb */
1158 cifs_sb_source->mnt_cifs_flags & 1316 cifs_sb_source->mnt_cifs_flags &
1159 CIFS_MOUNT_MAP_SPECIAL_CHR);
1160 }
1161 if ((rc == 0) &&
1162 (info_buf_source->UniqueId ==
1163 info_buf_target->UniqueId)) {
1164 /* do not rename since the files are hardlinked which
1165 is a noop */
1166 } else {
1167 /* we either can not tell the files are hardlinked
1168 (as with Windows servers) or files are not
1169 hardlinked so delete the target manually before
1170 renaming to follow POSIX rather than Windows
1171 semantics */
1172 cifs_unlink(target_inode, target_direntry);
1173 rc = CIFSSMBRename(xid, pTcon, fromName,
1174 toName,
1175 cifs_sb_source->local_nls,
1176 cifs_sb_source->mnt_cifs_flags
1177 & CIFS_MOUNT_MAP_SPECIAL_CHR);
1178 }
1179 kfree(info_buf_source);
1180 } /* if we can not get memory just leave rc as EEXIST */
1181 }
1182
1183 if (rc)
1184 cFYI(1, ("rename rc %d", rc));
1185
1186 if ((rc == -EIO) || (rc == -EEXIST)) {
1187 int oplock = 0;
1188 __u16 netfid;
1189
1190 /* BB FIXME Is Generic Read correct for rename? */
1191 /* if renaming directory - we should not say CREATE_NOT_DIR,
1192 need to test renaming open directory, also GENERIC_READ
1193 might not right be right access to request */
1194 rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
1195 CREATE_NOT_DIR, &netfid, &oplock, NULL,
1196 cifs_sb_source->local_nls,
1197 cifs_sb_source->mnt_cifs_flags &
1198 CIFS_MOUNT_MAP_SPECIAL_CHR);
1199 if (rc == 0) {
1200 rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
1201 cifs_sb_source->local_nls,
1202 cifs_sb_source->mnt_cifs_flags &
1203 CIFS_MOUNT_MAP_SPECIAL_CHR); 1317 CIFS_MOUNT_MAP_SPECIAL_CHR);
1204 CIFSSMBClose(xid, pTcon, netfid); 1318
1205 } 1319 if (rc == 0 && (info_buf_source->UniqueId ==
1320 info_buf_target->UniqueId))
1321 /* same file, POSIX says that this is a noop */
1322 goto cifs_rename_exit;
1323 } /* else ... BB we could add the same check for Windows by
1324 checking the UniqueId via FILE_INTERNAL_INFO */
1325unlink_target:
1326 /*
1327 * we either can not tell the files are hardlinked (as with
1328 * Windows servers) or files are not hardlinked. Delete the
1329 * target manually before renaming to follow POSIX rather than
1330 * Windows semantics
1331 */
1332 cifs_unlink(target_inode, target_direntry);
1333 rc = cifs_do_rename(xid, source_direntry, fromName,
1334 target_direntry, toName);
1206 } 1335 }
1207 1336
1208cifs_rename_exit: 1337cifs_rename_exit:
1338 kfree(info_buf_source);
1209 kfree(fromName); 1339 kfree(fromName);
1210 kfree(toName); 1340 kfree(toName);
1211 FreeXid(xid); 1341 FreeXid(xid);
@@ -1506,101 +1636,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1506} 1636}
1507 1637
1508static int 1638static int
1509cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
1510 char *full_path, __u32 dosattr)
1511{
1512 int rc;
1513 int oplock = 0;
1514 __u16 netfid;
1515 __u32 netpid;
1516 bool set_time = false;
1517 struct cifsFileInfo *open_file;
1518 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1519 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
1520 struct cifsTconInfo *pTcon = cifs_sb->tcon;
1521 FILE_BASIC_INFO info_buf;
1522
1523 if (attrs->ia_valid & ATTR_ATIME) {
1524 set_time = true;
1525 info_buf.LastAccessTime =
1526 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
1527 } else
1528 info_buf.LastAccessTime = 0;
1529
1530 if (attrs->ia_valid & ATTR_MTIME) {
1531 set_time = true;
1532 info_buf.LastWriteTime =
1533 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
1534 } else
1535 info_buf.LastWriteTime = 0;
1536
1537 /*
1538 * Samba throws this field away, but windows may actually use it.
1539 * Do not set ctime unless other time stamps are changed explicitly
1540 * (i.e. by utimes()) since we would then have a mix of client and
1541 * server times.
1542 */
1543 if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
1544 cFYI(1, ("CIFS - CTIME changed"));
1545 info_buf.ChangeTime =
1546 cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
1547 } else
1548 info_buf.ChangeTime = 0;
1549
1550 info_buf.CreationTime = 0; /* don't change */
1551 info_buf.Attributes = cpu_to_le32(dosattr);
1552
1553 /*
1554 * If the file is already open for write, just use that fileid
1555 */
1556 open_file = find_writable_file(cifsInode);
1557 if (open_file) {
1558 netfid = open_file->netfid;
1559 netpid = open_file->pid;
1560 goto set_via_filehandle;
1561 }
1562
1563 /*
1564 * NT4 apparently returns success on this call, but it doesn't
1565 * really work.
1566 */
1567 if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
1568 rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
1569 &info_buf, cifs_sb->local_nls,
1570 cifs_sb->mnt_cifs_flags &
1571 CIFS_MOUNT_MAP_SPECIAL_CHR);
1572 if (rc != -EOPNOTSUPP && rc != -EINVAL)
1573 goto out;
1574 }
1575
1576 cFYI(1, ("calling SetFileInfo since SetPathInfo for "
1577 "times not supported by this server"));
1578 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
1579 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
1580 CREATE_NOT_DIR, &netfid, &oplock,
1581 NULL, cifs_sb->local_nls,
1582 cifs_sb->mnt_cifs_flags &
1583 CIFS_MOUNT_MAP_SPECIAL_CHR);
1584
1585 if (rc != 0) {
1586 if (rc == -EIO)
1587 rc = -EINVAL;
1588 goto out;
1589 }
1590
1591 netpid = current->tgid;
1592
1593set_via_filehandle:
1594 rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
1595 if (open_file == NULL)
1596 CIFSSMBClose(xid, pTcon, netfid);
1597 else
1598 atomic_dec(&open_file->wrtPending);
1599out:
1600 return rc;
1601}
1602
1603static int
1604cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) 1639cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1605{ 1640{
1606 int rc; 1641 int rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe3157..88786ba02d27 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
150 but it may be more efficient to always alloc same size 150 but it may be more efficient to always alloc same size
151 albeit slightly larger than necessary and maxbuffersize 151 albeit slightly larger than necessary and maxbuffersize
152 defaults to this and can not be bigger */ 152 defaults to this and can not be bigger */
153 ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp, 153 ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
154 GFP_KERNEL | GFP_NOFS);
155 154
156 /* clear the first few header bytes */ 155 /* clear the first few header bytes */
157 /* for most paths, more is cleared in header_assemble */ 156 /* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
188 but it may be more efficient to always alloc same size 187 but it may be more efficient to always alloc same size
189 albeit slightly larger than necessary and maxbuffersize 188 albeit slightly larger than necessary and maxbuffersize
190 defaults to this and can not be bigger */ 189 defaults to this and can not be bigger */
191 ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, 190 ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
192 GFP_KERNEL | GFP_NOFS);
193 if (ret_buf) { 191 if (ret_buf) {
194 /* No need to clear memory here, cleared in header assemble */ 192 /* No need to clear memory here, cleared in header assemble */
195 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ 193 /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
313 buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; 311 buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
314 buffer->Pid = cpu_to_le16((__u16)current->tgid); 312 buffer->Pid = cpu_to_le16((__u16)current->tgid);
315 buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); 313 buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
316 spin_lock(&GlobalMid_Lock);
317 spin_unlock(&GlobalMid_Lock);
318 if (treeCon) { 314 if (treeCon) {
319 buffer->Tid = treeCon->tid; 315 buffer->Tid = treeCon->tid;
320 if (treeCon->ses) { 316 if (treeCon->ses) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5f40ed3473f5..765adf12d54f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
640 640
641} 641}
642 642
643static int cifs_save_resume_key(const char *current_entry,
644 struct cifsFileInfo *cifsFile)
645{
646 int rc = 0;
647 unsigned int len = 0;
648 __u16 level;
649 char *filename;
650
651 if ((cifsFile == NULL) || (current_entry == NULL))
652 return -EINVAL;
653
654 level = cifsFile->srch_inf.info_level;
655
656 if (level == SMB_FIND_FILE_UNIX) {
657 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
658
659 filename = &pFindData->FileName[0];
660 if (cifsFile->srch_inf.unicode) {
661 len = cifs_unicode_bytelen(filename);
662 } else {
663 /* BB should we make this strnlen of PATH_MAX? */
664 len = strnlen(filename, PATH_MAX);
665 }
666 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
667 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
668 FILE_DIRECTORY_INFO *pFindData =
669 (FILE_DIRECTORY_INFO *)current_entry;
670 filename = &pFindData->FileName[0];
671 len = le32_to_cpu(pFindData->FileNameLength);
672 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
673 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
674 FILE_FULL_DIRECTORY_INFO *pFindData =
675 (FILE_FULL_DIRECTORY_INFO *)current_entry;
676 filename = &pFindData->FileName[0];
677 len = le32_to_cpu(pFindData->FileNameLength);
678 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
679 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
680 SEARCH_ID_FULL_DIR_INFO *pFindData =
681 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
682 filename = &pFindData->FileName[0];
683 len = le32_to_cpu(pFindData->FileNameLength);
684 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
685 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
686 FILE_BOTH_DIRECTORY_INFO *pFindData =
687 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
688 filename = &pFindData->FileName[0];
689 len = le32_to_cpu(pFindData->FileNameLength);
690 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
691 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
692 FIND_FILE_STANDARD_INFO *pFindData =
693 (FIND_FILE_STANDARD_INFO *)current_entry;
694 filename = &pFindData->FileName[0];
695 /* one byte length, no name conversion */
696 len = (unsigned int)pFindData->FileNameLength;
697 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
698 } else {
699 cFYI(1, ("Unknown findfirst level %d", level));
700 return -EINVAL;
701 }
702 cifsFile->srch_inf.resume_name_len = len;
703 cifsFile->srch_inf.presume_name = filename;
704 return rc;
705}
706
643/* find the corresponding entry in the search */ 707/* find the corresponding entry in the search */
644/* Note that the SMB server returns search entries for . and .. which 708/* Note that the SMB server returns search entries for . and .. which
645 complicates logic here if we choose to parse for them and we do not 709 complicates logic here if we choose to parse for them and we do not
@@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
703 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && 767 while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
704 (rc == 0) && !cifsFile->srch_inf.endOfSearch) { 768 (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
705 cFYI(1, ("calling findnext2")); 769 cFYI(1, ("calling findnext2"));
770 cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
706 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, 771 rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
707 &cifsFile->srch_inf); 772 &cifsFile->srch_inf);
708 if (rc) 773 if (rc)
@@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
919 return rc; 984 return rc;
920} 985}
921 986
922static int cifs_save_resume_key(const char *current_entry,
923 struct cifsFileInfo *cifsFile)
924{
925 int rc = 0;
926 unsigned int len = 0;
927 __u16 level;
928 char *filename;
929
930 if ((cifsFile == NULL) || (current_entry == NULL))
931 return -EINVAL;
932
933 level = cifsFile->srch_inf.info_level;
934
935 if (level == SMB_FIND_FILE_UNIX) {
936 FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
937
938 filename = &pFindData->FileName[0];
939 if (cifsFile->srch_inf.unicode) {
940 len = cifs_unicode_bytelen(filename);
941 } else {
942 /* BB should we make this strnlen of PATH_MAX? */
943 len = strnlen(filename, PATH_MAX);
944 }
945 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
946 } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
947 FILE_DIRECTORY_INFO *pFindData =
948 (FILE_DIRECTORY_INFO *)current_entry;
949 filename = &pFindData->FileName[0];
950 len = le32_to_cpu(pFindData->FileNameLength);
951 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
952 } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
953 FILE_FULL_DIRECTORY_INFO *pFindData =
954 (FILE_FULL_DIRECTORY_INFO *)current_entry;
955 filename = &pFindData->FileName[0];
956 len = le32_to_cpu(pFindData->FileNameLength);
957 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
958 } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
959 SEARCH_ID_FULL_DIR_INFO *pFindData =
960 (SEARCH_ID_FULL_DIR_INFO *)current_entry;
961 filename = &pFindData->FileName[0];
962 len = le32_to_cpu(pFindData->FileNameLength);
963 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
964 } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
965 FILE_BOTH_DIRECTORY_INFO *pFindData =
966 (FILE_BOTH_DIRECTORY_INFO *)current_entry;
967 filename = &pFindData->FileName[0];
968 len = le32_to_cpu(pFindData->FileNameLength);
969 cifsFile->srch_inf.resume_key = pFindData->FileIndex;
970 } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
971 FIND_FILE_STANDARD_INFO *pFindData =
972 (FIND_FILE_STANDARD_INFO *)current_entry;
973 filename = &pFindData->FileName[0];
974 /* one byte length, no name conversion */
975 len = (unsigned int)pFindData->FileNameLength;
976 cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
977 } else {
978 cFYI(1, ("Unknown findfirst level %d", level));
979 return -EINVAL;
980 }
981 cifsFile->srch_inf.resume_name_len = len;
982 cifsFile->srch_inf.presume_name = filename;
983 return rc;
984}
985 987
986int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) 988int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
987{ 989{
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed150efbe27c..2851d5da0c8c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
409#ifdef CONFIG_CIFS_WEAK_PW_HASH 409#ifdef CONFIG_CIFS_WEAK_PW_HASH
410 char lnm_session_key[CIFS_SESS_KEY_SIZE]; 410 char lnm_session_key[CIFS_SESS_KEY_SIZE];
411 411
412 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
413
412 /* no capabilities flags in old lanman negotiation */ 414 /* no capabilities flags in old lanman negotiation */
413 415
414 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 416 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
@@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
505 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); 507 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
506 } else 508 } else
507 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 509 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
508 } else if (type == Kerberos) { 510 } else if (type == Kerberos || type == MSKerberos) {
509#ifdef CONFIG_CIFS_UPCALL 511#ifdef CONFIG_CIFS_UPCALL
510 struct cifs_spnego_msg *msg; 512 struct cifs_spnego_msg *msg;
511 spnego_key = cifs_get_spnego_key(ses); 513 spnego_key = cifs_get_spnego_key(ses);
@@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
516 } 518 }
517 519
518 msg = spnego_key->payload.data; 520 msg = spnego_key->payload.data;
521 /* check version field to make sure that cifs.upcall is
522 sending us a response in an expected form */
523 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
524 cERROR(1, ("incorrect version of cifs.upcall (expected"
525 " %d but got %d)",
526 CIFS_SPNEGO_UPCALL_VERSION, msg->version));
527 rc = -EKEYREJECTED;
528 goto ssetup_exit;
529 }
519 /* bail out if key is too long */ 530 /* bail out if key is too long */
520 if (msg->sesskey_len > 531 if (msg->sesskey_len >
521 sizeof(ses->server->mac_signing_key.data.krb5)) { 532 sizeof(ses->server->mac_signing_key.data.krb5)) {
@@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
613 ses, nls_cp); 624 ses, nls_cp);
614 625
615ssetup_exit: 626ssetup_exit:
616 if (spnego_key) 627 if (spnego_key) {
628 key_revoke(spnego_key);
617 key_put(spnego_key); 629 key_put(spnego_key);
630 }
618 kfree(str_area); 631 kfree(str_area);
619 if (resp_buf_type == CIFS_SMALL_BUFFER) { 632 if (resp_buf_type == CIFS_SMALL_BUFFER) {
620 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); 633 cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e286db9f5ee2..bf0e6d8e382a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
50 return NULL; 50 return NULL;
51 } 51 }
52 52
53 temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp, 53 temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
54 GFP_KERNEL | GFP_NOFS);
55 if (temp == NULL) 54 if (temp == NULL)
56 return temp; 55 return temp;
57 else { 56 else {
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0d9b80ec689c..cfd29da714d1 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,9 +362,8 @@ static int init_coda_psdev(void)
362 goto out_chrdev; 362 goto out_chrdev;
363 } 363 }
364 for (i = 0; i < MAX_CODADEVS; i++) 364 for (i = 0; i < MAX_CODADEVS; i++)
365 device_create_drvdata(coda_psdev_class, NULL, 365 device_create(coda_psdev_class, NULL,
366 MKDEV(CODA_PSDEV_MAJOR, i), 366 MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
367 NULL, "cfs%d", i);
368 coda_sysctl_init(); 367 coda_sysctl_init();
369 goto out; 368 goto out;
370 369
diff --git a/fs/compat.c b/fs/compat.c
index c9d1472e65c5..5f9ec449c799 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
137 return compat_sys_futimesat(AT_FDCWD, filename, t); 137 return compat_sys_futimesat(AT_FDCWD, filename, t);
138} 138}
139 139
140static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
141{
142 compat_ino_t ino = stat->ino;
143 typeof(ubuf->st_uid) uid = 0;
144 typeof(ubuf->st_gid) gid = 0;
145 int err;
146
147 SET_UID(uid, stat->uid);
148 SET_GID(gid, stat->gid);
149
150 if ((u64) stat->size > MAX_NON_LFS ||
151 !old_valid_dev(stat->dev) ||
152 !old_valid_dev(stat->rdev))
153 return -EOVERFLOW;
154 if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
155 return -EOVERFLOW;
156
157 if (clear_user(ubuf, sizeof(*ubuf)))
158 return -EFAULT;
159
160 err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
161 err |= __put_user(ino, &ubuf->st_ino);
162 err |= __put_user(stat->mode, &ubuf->st_mode);
163 err |= __put_user(stat->nlink, &ubuf->st_nlink);
164 err |= __put_user(uid, &ubuf->st_uid);
165 err |= __put_user(gid, &ubuf->st_gid);
166 err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
167 err |= __put_user(stat->size, &ubuf->st_size);
168 err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
169 err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
170 err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
171 err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
172 err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
173 err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
174 err |= __put_user(stat->blksize, &ubuf->st_blksize);
175 err |= __put_user(stat->blocks, &ubuf->st_blocks);
176 return err;
177}
178
140asmlinkage long compat_sys_newstat(char __user * filename, 179asmlinkage long compat_sys_newstat(char __user * filename,
141 struct compat_stat __user *statbuf) 180 struct compat_stat __user *statbuf)
142{ 181{
@@ -792,8 +831,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
792 if (buf->result) 831 if (buf->result)
793 return -EINVAL; 832 return -EINVAL;
794 d_ino = ino; 833 d_ino = ino;
795 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 834 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
835 buf->result = -EOVERFLOW;
796 return -EOVERFLOW; 836 return -EOVERFLOW;
837 }
797 buf->result++; 838 buf->result++;
798 dirent = buf->dirent; 839 dirent = buf->dirent;
799 if (!access_ok(VERIFY_WRITE, dirent, 840 if (!access_ok(VERIFY_WRITE, dirent,
@@ -862,8 +903,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
862 if (reclen > buf->count) 903 if (reclen > buf->count)
863 return -EINVAL; 904 return -EINVAL;
864 d_ino = ino; 905 d_ino = ino;
865 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 906 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
907 buf->error = -EOVERFLOW;
866 return -EOVERFLOW; 908 return -EOVERFLOW;
909 }
867 dirent = buf->previous; 910 dirent = buf->previous;
868 if (dirent) { 911 if (dirent) {
869 if (__put_user(offset, &dirent->d_off)) 912 if (__put_user(offset, &dirent->d_off))
@@ -1235,7 +1278,7 @@ static int compat_count(compat_uptr_t __user *argv, int max)
1235 if (!p) 1278 if (!p)
1236 break; 1279 break;
1237 argv++; 1280 argv++;
1238 if(++i > max) 1281 if (i++ >= max)
1239 return -E2BIG; 1282 return -E2BIG;
1240 } 1283 }
1241 } 1284 }
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7a8db78a91d2..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1311 * Ensure that no racing symlink() will make detach_prep() fail while 1311 * Ensure that no racing symlink() will make detach_prep() fail while
1312 * the new link is temporarily attached 1312 * the new link is temporarily attached
1313 */ 1313 */
1314 mutex_lock(&configfs_symlink_mutex);
1315 spin_lock(&configfs_dirent_lock);
1316 do { 1314 do {
1317 struct mutex *wait_mutex; 1315 struct mutex *wait_mutex;
1318 1316
1317 mutex_lock(&configfs_symlink_mutex);
1318 spin_lock(&configfs_dirent_lock);
1319 ret = configfs_detach_prep(dentry, &wait_mutex); 1319 ret = configfs_detach_prep(dentry, &wait_mutex);
1320 if (ret) { 1320 if (ret)
1321 configfs_detach_rollback(dentry); 1321 configfs_detach_rollback(dentry);
1322 spin_unlock(&configfs_dirent_lock); 1322 spin_unlock(&configfs_dirent_lock);
1323 mutex_unlock(&configfs_symlink_mutex); 1323 mutex_unlock(&configfs_symlink_mutex);
1324
1325 if (ret) {
1324 if (ret != -EAGAIN) { 1326 if (ret != -EAGAIN) {
1325 config_item_put(parent_item); 1327 config_item_put(parent_item);
1326 return ret; 1328 return ret;
@@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1329 /* Wait until the racing operation terminates */ 1331 /* Wait until the racing operation terminates */
1330 mutex_lock(wait_mutex); 1332 mutex_lock(wait_mutex);
1331 mutex_unlock(wait_mutex); 1333 mutex_unlock(wait_mutex);
1332
1333 mutex_lock(&configfs_symlink_mutex);
1334 spin_lock(&configfs_dirent_lock);
1335 } 1334 }
1336 } while (ret == -EAGAIN); 1335 } while (ret == -EAGAIN);
1337 spin_unlock(&configfs_dirent_lock);
1338 mutex_unlock(&configfs_symlink_mutex);
1339 1336
1340 /* Get a working ref for the duration of this function */ 1337 /* Get a working ref for the duration of this function */
1341 item = configfs_get_config_item(dentry); 1338 item = configfs_get_config_item(dentry);
diff --git a/fs/dcache.c b/fs/dcache.c
index 101663d15e9f..e7a1a99b7464 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1236 * If no entry exists with the exact case name, allocate new dentry with 1236 * If no entry exists with the exact case name, allocate new dentry with
1237 * the exact case, and return the spliced entry. 1237 * the exact case, and return the spliced entry.
1238 */ 1238 */
1239struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry, 1239struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1240 struct qstr *name) 1240 struct qstr *name)
1241{ 1241{
1242 int error; 1242 int error;
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1395 if (dentry->d_parent != parent) 1395 if (dentry->d_parent != parent)
1396 goto next; 1396 goto next;
1397 1397
1398 /* non-existing due to RCU? */
1399 if (d_unhashed(dentry))
1400 goto next;
1401
1398 /* 1402 /*
1399 * It is safe to compare names since d_move() cannot 1403 * It is safe to compare names since d_move() cannot
1400 * change the qstr (protected by d_lock). 1404 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
1410 goto next; 1414 goto next;
1411 } 1415 }
1412 1416
1413 if (!d_unhashed(dentry)) { 1417 atomic_inc(&dentry->d_count);
1414 atomic_inc(&dentry->d_count); 1418 found = dentry;
1415 found = dentry;
1416 }
1417 spin_unlock(&dentry->d_lock); 1419 spin_unlock(&dentry->d_lock);
1418 break; 1420 break;
1419next: 1421next:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 08e28c9bb416..3dbe2169cf36 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,8 +26,7 @@
26#include <linux/debugfs.h> 26#include <linux/debugfs.h>
27#include <linux/fsnotify.h> 27#include <linux/fsnotify.h>
28#include <linux/string.h> 28#include <linux/string.h>
29 29#include <linux/magic.h>
30#define DEBUGFS_MAGIC 0x64626720
31 30
32static struct vfsmount *debugfs_mount; 31static struct vfsmount *debugfs_mount;
33static int debugfs_mount_count; 32static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 488eb424f662..4a714f6c1bed 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,6 +27,7 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30#define PTMX_MINOR 2
30 31
31extern int pty_limit; /* Config limit on Unix98 ptys */ 32extern int pty_limit; /* Config limit on Unix98 ptys */
32static DEFINE_IDA(allocated_ptys); 33static DEFINE_IDA(allocated_ptys);
@@ -48,7 +49,7 @@ enum {
48 Opt_err 49 Opt_err
49}; 50};
50 51
51static match_table_t tokens = { 52static const match_table_t tokens = {
52 {Opt_uid, "uid=%u"}, 53 {Opt_uid, "uid=%u"},
53 {Opt_gid, "gid=%u"}, 54 {Opt_gid, "gid=%u"},
54 {Opt_mode, "mode=%o"}, 55 {Opt_mode, "mode=%o"},
@@ -169,15 +170,7 @@ static struct file_system_type devpts_fs_type = {
169 * to the System V naming convention 170 * to the System V naming convention
170 */ 171 */
171 172
172static struct dentry *get_node(int num) 173int devpts_new_index(struct inode *ptmx_inode)
173{
174 char s[12];
175 struct dentry *root = devpts_root;
176 mutex_lock(&root->d_inode->i_mutex);
177 return lookup_one_len(s, root, sprintf(s, "%d", num));
178}
179
180int devpts_new_index(void)
181{ 174{
182 int index; 175 int index;
183 int ida_ret; 176 int ida_ret;
@@ -205,20 +198,21 @@ retry:
205 return index; 198 return index;
206} 199}
207 200
208void devpts_kill_index(int idx) 201void devpts_kill_index(struct inode *ptmx_inode, int idx)
209{ 202{
210 mutex_lock(&allocated_ptys_lock); 203 mutex_lock(&allocated_ptys_lock);
211 ida_remove(&allocated_ptys, idx); 204 ida_remove(&allocated_ptys, idx);
212 mutex_unlock(&allocated_ptys_lock); 205 mutex_unlock(&allocated_ptys_lock);
213} 206}
214 207
215int devpts_pty_new(struct tty_struct *tty) 208int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
216{ 209{
217 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ 210 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
218 struct tty_driver *driver = tty->driver; 211 struct tty_driver *driver = tty->driver;
219 dev_t device = MKDEV(driver->major, driver->minor_start+number); 212 dev_t device = MKDEV(driver->major, driver->minor_start+number);
220 struct dentry *dentry; 213 struct dentry *dentry;
221 struct inode *inode = new_inode(devpts_mnt->mnt_sb); 214 struct inode *inode = new_inode(devpts_mnt->mnt_sb);
215 char s[12];
222 216
223 /* We're supposed to be given the slave end of a pty */ 217 /* We're supposed to be given the slave end of a pty */
224 BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); 218 BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
@@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty)
233 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 227 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
234 init_special_inode(inode, S_IFCHR|config.mode, device); 228 init_special_inode(inode, S_IFCHR|config.mode, device);
235 inode->i_private = tty; 229 inode->i_private = tty;
230 tty->driver_data = inode;
236 231
237 dentry = get_node(number); 232 sprintf(s, "%d", number);
238 if (!IS_ERR(dentry) && !dentry->d_inode) { 233
239 d_instantiate(dentry, inode); 234 mutex_lock(&devpts_root->d_inode->i_mutex);
235
236 dentry = d_alloc_name(devpts_root, s);
237 if (!IS_ERR(dentry)) {
238 d_add(dentry, inode);
240 fsnotify_create(devpts_root->d_inode, dentry); 239 fsnotify_create(devpts_root->d_inode, dentry);
241 } 240 }
242 241
@@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty)
245 return 0; 244 return 0;
246} 245}
247 246
248struct tty_struct *devpts_get_tty(int number) 247struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
249{ 248{
250 struct dentry *dentry = get_node(number); 249 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
251 struct tty_struct *tty;
252
253 tty = NULL;
254 if (!IS_ERR(dentry)) {
255 if (dentry->d_inode)
256 tty = dentry->d_inode->i_private;
257 dput(dentry);
258 }
259 250
260 mutex_unlock(&devpts_root->d_inode->i_mutex); 251 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
261 252 return (struct tty_struct *)pts_inode->i_private;
262 return tty; 253 return NULL;
263} 254}
264 255
265void devpts_pty_kill(int number) 256void devpts_pty_kill(struct tty_struct *tty)
266{ 257{
267 struct dentry *dentry = get_node(number); 258 struct inode *inode = tty->driver_data;
259 struct dentry *dentry;
268 260
269 if (!IS_ERR(dentry)) { 261 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
270 struct inode *inode = dentry->d_inode; 262
271 if (inode) { 263 mutex_lock(&devpts_root->d_inode->i_mutex);
272 inode->i_nlink--; 264
273 d_delete(dentry); 265 dentry = d_find_alias(inode);
274 dput(dentry); 266 if (dentry && !IS_ERR(dentry)) {
275 } 267 inode->i_nlink--;
268 d_delete(dentry);
276 dput(dentry); 269 dput(dentry);
277 } 270 }
271
278 mutex_unlock(&devpts_root->d_inode->i_mutex); 272 mutex_unlock(&devpts_root->d_inode->i_mutex);
279} 273}
280 274
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9606ee848fd8..af0558dbe8b7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -5,11 +5,11 @@
5 * 5 *
6 * O_DIRECT 6 * O_DIRECT
7 * 7 *
8 * 04Jul2002 akpm@zip.com.au 8 * 04Jul2002 Andrew Morton
9 * Initial version 9 * Initial version
10 * 11Sep2002 janetinc@us.ibm.com 10 * 11Sep2002 janetinc@us.ibm.com
11 * added readv/writev support. 11 * added readv/writev support.
12 * 29Oct2002 akpm@zip.com.au 12 * 29Oct2002 Andrew Morton
13 * rewrote bio_add_page() support. 13 * rewrote bio_add_page() support.
14 * 30Oct2002 pbadari@us.ibm.com 14 * 30Oct2002 pbadari@us.ibm.com
15 * added support for non-aligned IO. 15 * added support for non-aligned IO.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 89d2fb7b991a..fd9859f92fad 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,9 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/configfs.h> 16#include <linux/configfs.h>
17#include <linux/in.h>
18#include <linux/in6.h>
19#include <net/ipv6.h>
17#include <net/sock.h> 20#include <net/sock.h>
18 21
19#include "config.h" 22#include "config.h"
@@ -377,24 +380,24 @@ static struct config_item_type node_type = {
377 .ct_owner = THIS_MODULE, 380 .ct_owner = THIS_MODULE,
378}; 381};
379 382
380static struct dlm_cluster *to_cluster(struct config_item *i) 383static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
381{ 384{
382 return i ? container_of(to_config_group(i), struct dlm_cluster, group) : 385 return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
383 NULL; 386 NULL;
384} 387}
385 388
386static struct dlm_space *to_space(struct config_item *i) 389static struct dlm_space *config_item_to_space(struct config_item *i)
387{ 390{
388 return i ? container_of(to_config_group(i), struct dlm_space, group) : 391 return i ? container_of(to_config_group(i), struct dlm_space, group) :
389 NULL; 392 NULL;
390} 393}
391 394
392static struct dlm_comm *to_comm(struct config_item *i) 395static struct dlm_comm *config_item_to_comm(struct config_item *i)
393{ 396{
394 return i ? container_of(i, struct dlm_comm, item) : NULL; 397 return i ? container_of(i, struct dlm_comm, item) : NULL;
395} 398}
396 399
397static struct dlm_node *to_node(struct config_item *i) 400static struct dlm_node *config_item_to_node(struct config_item *i)
398{ 401{
399 return i ? container_of(i, struct dlm_node, item) : NULL; 402 return i ? container_of(i, struct dlm_node, item) : NULL;
400} 403}
@@ -450,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g,
450 453
451static void drop_cluster(struct config_group *g, struct config_item *i) 454static void drop_cluster(struct config_group *g, struct config_item *i)
452{ 455{
453 struct dlm_cluster *cl = to_cluster(i); 456 struct dlm_cluster *cl = config_item_to_cluster(i);
454 struct config_item *tmp; 457 struct config_item *tmp;
455 int j; 458 int j;
456 459
@@ -468,7 +471,7 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
468 471
469static void release_cluster(struct config_item *i) 472static void release_cluster(struct config_item *i)
470{ 473{
471 struct dlm_cluster *cl = to_cluster(i); 474 struct dlm_cluster *cl = config_item_to_cluster(i);
472 kfree(cl->group.default_groups); 475 kfree(cl->group.default_groups);
473 kfree(cl); 476 kfree(cl);
474} 477}
@@ -507,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
507 510
508static void drop_space(struct config_group *g, struct config_item *i) 511static void drop_space(struct config_group *g, struct config_item *i)
509{ 512{
510 struct dlm_space *sp = to_space(i); 513 struct dlm_space *sp = config_item_to_space(i);
511 struct config_item *tmp; 514 struct config_item *tmp;
512 int j; 515 int j;
513 516
@@ -524,7 +527,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
524 527
525static void release_space(struct config_item *i) 528static void release_space(struct config_item *i)
526{ 529{
527 struct dlm_space *sp = to_space(i); 530 struct dlm_space *sp = config_item_to_space(i);
528 kfree(sp->group.default_groups); 531 kfree(sp->group.default_groups);
529 kfree(sp); 532 kfree(sp);
530} 533}
@@ -546,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
546 549
547static void drop_comm(struct config_group *g, struct config_item *i) 550static void drop_comm(struct config_group *g, struct config_item *i)
548{ 551{
549 struct dlm_comm *cm = to_comm(i); 552 struct dlm_comm *cm = config_item_to_comm(i);
550 if (local_comm == cm) 553 if (local_comm == cm)
551 local_comm = NULL; 554 local_comm = NULL;
552 dlm_lowcomms_close(cm->nodeid); 555 dlm_lowcomms_close(cm->nodeid);
@@ -557,13 +560,13 @@ static void drop_comm(struct config_group *g, struct config_item *i)
557 560
558static void release_comm(struct config_item *i) 561static void release_comm(struct config_item *i)
559{ 562{
560 struct dlm_comm *cm = to_comm(i); 563 struct dlm_comm *cm = config_item_to_comm(i);
561 kfree(cm); 564 kfree(cm);
562} 565}
563 566
564static struct config_item *make_node(struct config_group *g, const char *name) 567static struct config_item *make_node(struct config_group *g, const char *name)
565{ 568{
566 struct dlm_space *sp = to_space(g->cg_item.ci_parent); 569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
567 struct dlm_node *nd; 570 struct dlm_node *nd;
568 571
569 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); 572 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
@@ -585,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
585 588
586static void drop_node(struct config_group *g, struct config_item *i) 589static void drop_node(struct config_group *g, struct config_item *i)
587{ 590{
588 struct dlm_space *sp = to_space(g->cg_item.ci_parent); 591 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
589 struct dlm_node *nd = to_node(i); 592 struct dlm_node *nd = config_item_to_node(i);
590 593
591 mutex_lock(&sp->members_lock); 594 mutex_lock(&sp->members_lock);
592 list_del(&nd->list); 595 list_del(&nd->list);
@@ -598,7 +601,7 @@ static void drop_node(struct config_group *g, struct config_item *i)
598 601
599static void release_node(struct config_item *i) 602static void release_node(struct config_item *i)
600{ 603{
601 struct dlm_node *nd = to_node(i); 604 struct dlm_node *nd = config_item_to_node(i);
602 kfree(nd); 605 kfree(nd);
603} 606}
604 607
@@ -632,7 +635,7 @@ void dlm_config_exit(void)
632static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, 635static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
633 char *buf) 636 char *buf)
634{ 637{
635 struct dlm_cluster *cl = to_cluster(i); 638 struct dlm_cluster *cl = config_item_to_cluster(i);
636 struct cluster_attribute *cla = 639 struct cluster_attribute *cla =
637 container_of(a, struct cluster_attribute, attr); 640 container_of(a, struct cluster_attribute, attr);
638 return cla->show ? cla->show(cl, buf) : 0; 641 return cla->show ? cla->show(cl, buf) : 0;
@@ -642,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
642 struct configfs_attribute *a, 645 struct configfs_attribute *a,
643 const char *buf, size_t len) 646 const char *buf, size_t len)
644{ 647{
645 struct dlm_cluster *cl = to_cluster(i); 648 struct dlm_cluster *cl = config_item_to_cluster(i);
646 struct cluster_attribute *cla = 649 struct cluster_attribute *cla =
647 container_of(a, struct cluster_attribute, attr); 650 container_of(a, struct cluster_attribute, attr);
648 return cla->store ? cla->store(cl, buf, len) : -EINVAL; 651 return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -651,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
651static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 654static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
652 char *buf) 655 char *buf)
653{ 656{
654 struct dlm_comm *cm = to_comm(i); 657 struct dlm_comm *cm = config_item_to_comm(i);
655 struct comm_attribute *cma = 658 struct comm_attribute *cma =
656 container_of(a, struct comm_attribute, attr); 659 container_of(a, struct comm_attribute, attr);
657 return cma->show ? cma->show(cm, buf) : 0; 660 return cma->show ? cma->show(cm, buf) : 0;
@@ -660,7 +663,7 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
660static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, 663static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
661 const char *buf, size_t len) 664 const char *buf, size_t len)
662{ 665{
663 struct dlm_comm *cm = to_comm(i); 666 struct dlm_comm *cm = config_item_to_comm(i);
664 struct comm_attribute *cma = 667 struct comm_attribute *cma =
665 container_of(a, struct comm_attribute, attr); 668 container_of(a, struct comm_attribute, attr);
666 return cma->store ? cma->store(cm, buf, len) : -EINVAL; 669 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
@@ -714,7 +717,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
714static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, 717static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
715 char *buf) 718 char *buf)
716{ 719{
717 struct dlm_node *nd = to_node(i); 720 struct dlm_node *nd = config_item_to_node(i);
718 struct node_attribute *nda = 721 struct node_attribute *nda =
719 container_of(a, struct node_attribute, attr); 722 container_of(a, struct node_attribute, attr);
720 return nda->show ? nda->show(nd, buf) : 0; 723 return nda->show ? nda->show(nd, buf) : 0;
@@ -723,7 +726,7 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
723static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, 726static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
724 const char *buf, size_t len) 727 const char *buf, size_t len)
725{ 728{
726 struct dlm_node *nd = to_node(i); 729 struct dlm_node *nd = config_item_to_node(i);
727 struct node_attribute *nda = 730 struct node_attribute *nda =
728 container_of(a, struct node_attribute, attr); 731 container_of(a, struct node_attribute, attr);
729 return nda->store ? nda->store(nd, buf, len) : -EINVAL; 732 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
@@ -768,7 +771,7 @@ static struct dlm_space *get_space(char *name)
768 i = config_group_find_item(space_list, name); 771 i = config_group_find_item(space_list, name);
769 mutex_unlock(&space_list->cg_subsys->su_mutex); 772 mutex_unlock(&space_list->cg_subsys->su_mutex);
770 773
771 return to_space(i); 774 return config_item_to_space(i);
772} 775}
773 776
774static void put_space(struct dlm_space *sp) 777static void put_space(struct dlm_space *sp)
@@ -776,6 +779,33 @@ static void put_space(struct dlm_space *sp)
776 config_item_put(&sp->group.cg_item); 779 config_item_put(&sp->group.cg_item);
777} 780}
778 781
782static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
783{
784 switch (x->ss_family) {
785 case AF_INET: {
786 struct sockaddr_in *sinx = (struct sockaddr_in *)x;
787 struct sockaddr_in *siny = (struct sockaddr_in *)y;
788 if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
789 return 0;
790 if (sinx->sin_port != siny->sin_port)
791 return 0;
792 break;
793 }
794 case AF_INET6: {
795 struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
796 struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
797 if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
798 return 0;
799 if (sinx->sin6_port != siny->sin6_port)
800 return 0;
801 break;
802 }
803 default:
804 return 0;
805 }
806 return 1;
807}
808
779static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) 809static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
780{ 810{
781 struct config_item *i; 811 struct config_item *i;
@@ -788,7 +818,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
788 mutex_lock(&clusters_root.subsys.su_mutex); 818 mutex_lock(&clusters_root.subsys.su_mutex);
789 819
790 list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 820 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
791 cm = to_comm(i); 821 cm = config_item_to_comm(i);
792 822
793 if (nodeid) { 823 if (nodeid) {
794 if (cm->nodeid != nodeid) 824 if (cm->nodeid != nodeid)
@@ -797,8 +827,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
797 config_item_get(i); 827 config_item_get(i);
798 break; 828 break;
799 } else { 829 } else {
800 if (!cm->addr_count || 830 if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
801 memcmp(cm->addr[0], addr, sizeof(*addr)))
802 continue; 831 continue;
803 found = 1; 832 found = 1;
804 config_item_get(i); 833 config_item_get(i);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629c..868e4c9ef127 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
441 uint32_t ls_global_id; /* global unique lockspace ID */ 441 uint32_t ls_global_id; /* global unique lockspace ID */
442 uint32_t ls_exflags; 442 uint32_t ls_exflags;
443 int ls_lvblen; 443 int ls_lvblen;
444 int ls_count; /* reference count */ 444 int ls_count; /* refcount of processes in
445 the dlm using this ls */
446 int ls_create_count; /* create/release refcount */
445 unsigned long ls_flags; /* LSFL_ */ 447 unsigned long ls_flags; /* LSFL_ */
448 unsigned long ls_scan_time;
446 struct kobject ls_kobj; 449 struct kobject ls_kobj;
447 450
448 struct dlm_rsbtable *ls_rsbtbl; 451 struct dlm_rsbtable *ls_rsbtbl;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e96..d910501de6d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
23#include "lock.h" 23#include "lock.h"
24#include "recover.h" 24#include "recover.h"
25#include "requestqueue.h" 25#include "requestqueue.h"
26#include "user.h"
26 27
27static int ls_count; 28static int ls_count;
28static struct mutex ls_lock; 29static struct mutex ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
211 kset_unregister(dlm_kset); 212 kset_unregister(dlm_kset);
212} 213}
213 214
215static struct dlm_ls *find_ls_to_scan(void)
216{
217 struct dlm_ls *ls;
218
219 spin_lock(&lslist_lock);
220 list_for_each_entry(ls, &lslist, ls_list) {
221 if (time_after_eq(jiffies, ls->ls_scan_time +
222 dlm_config.ci_scan_secs * HZ)) {
223 spin_unlock(&lslist_lock);
224 return ls;
225 }
226 }
227 spin_unlock(&lslist_lock);
228 return NULL;
229}
230
214static int dlm_scand(void *data) 231static int dlm_scand(void *data)
215{ 232{
216 struct dlm_ls *ls; 233 struct dlm_ls *ls;
234 int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
217 235
218 while (!kthread_should_stop()) { 236 while (!kthread_should_stop()) {
219 list_for_each_entry(ls, &lslist, ls_list) { 237 ls = find_ls_to_scan();
238 if (ls) {
220 if (dlm_lock_recovery_try(ls)) { 239 if (dlm_lock_recovery_try(ls)) {
240 ls->ls_scan_time = jiffies;
221 dlm_scan_rsbs(ls); 241 dlm_scan_rsbs(ls);
222 dlm_scan_timeout(ls); 242 dlm_scan_timeout(ls);
223 dlm_unlock_recovery(ls); 243 dlm_unlock_recovery(ls);
244 } else {
245 ls->ls_scan_time += HZ;
224 } 246 }
247 } else {
248 schedule_timeout_interruptible(timeout_jiffies);
225 } 249 }
226 schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
227 } 250 }
228 return 0; 251 return 0;
229} 252}
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
246 kthread_stop(scand_task); 269 kthread_stop(scand_task);
247} 270}
248 271
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id) 272struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{ 273{
268 struct dlm_ls *ls; 274 struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
327 for (;;) { 333 for (;;) {
328 spin_lock(&lslist_lock); 334 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) { 335 if (ls->ls_count == 0) {
336 WARN_ON(ls->ls_create_count != 0);
330 list_del(&ls->ls_list); 337 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock); 338 spin_unlock(&lslist_lock);
332 return; 339 return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen) 388 uint32_t flags, int lvblen)
382{ 389{
383 struct dlm_ls *ls; 390 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM; 391 int i, size, error;
385 int do_unreg = 0; 392 int do_unreg = 0;
386 393
387 if (namelen > DLM_LOCKSPACE_LEN) 394 if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
393 if (!try_module_get(THIS_MODULE)) 400 if (!try_module_get(THIS_MODULE))
394 return -EINVAL; 401 return -EINVAL;
395 402
396 ls = dlm_find_lockspace_name(name, namelen); 403 if (!dlm_user_daemon_available()) {
397 if (ls) { 404 module_put(THIS_MODULE);
398 *lockspace = ls; 405 return -EUNATCH;
406 }
407
408 error = 0;
409
410 spin_lock(&lslist_lock);
411 list_for_each_entry(ls, &lslist, ls_list) {
412 WARN_ON(ls->ls_create_count <= 0);
413 if (ls->ls_namelen != namelen)
414 continue;
415 if (memcmp(ls->ls_name, name, namelen))
416 continue;
417 if (flags & DLM_LSFL_NEWEXCL) {
418 error = -EEXIST;
419 break;
420 }
421 ls->ls_create_count++;
399 module_put(THIS_MODULE); 422 module_put(THIS_MODULE);
400 return -EEXIST; 423 error = 1; /* not an error, return 0 */
424 break;
401 } 425 }
426 spin_unlock(&lslist_lock);
427
428 if (error < 0)
429 goto out;
430 if (error)
431 goto ret_zero;
432
433 error = -ENOMEM;
402 434
403 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 435 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
404 if (!ls) 436 if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
408 ls->ls_lvblen = lvblen; 440 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0; 441 ls->ls_count = 0;
410 ls->ls_flags = 0; 442 ls->ls_flags = 0;
443 ls->ls_scan_time = jiffies;
411 444
412 if (flags & DLM_LSFL_TIMEWARN) 445 if (flags & DLM_LSFL_TIMEWARN)
413 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 446 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
418 ls->ls_allocation = GFP_KERNEL; 451 ls->ls_allocation = GFP_KERNEL;
419 452
420 /* ls_exflags are forced to match among nodes, and we don't 453 /* ls_exflags are forced to match among nodes, and we don't
421 need to require all nodes to have TIMEWARN or FS set */ 454 need to require all nodes to have some flags set */
422 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS)); 455 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
456 DLM_LSFL_NEWEXCL));
423 457
424 size = dlm_config.ci_rsbtbl_size; 458 size = dlm_config.ci_rsbtbl_size;
425 ls->ls_rsbtbl_size = size; 459 ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
510 down_write(&ls->ls_in_recovery); 544 down_write(&ls->ls_in_recovery);
511 545
512 spin_lock(&lslist_lock); 546 spin_lock(&lslist_lock);
547 ls->ls_create_count = 1;
513 list_add(&ls->ls_list, &lslist); 548 list_add(&ls->ls_list, &lslist);
514 spin_unlock(&lslist_lock); 549 spin_unlock(&lslist_lock);
515 550
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
548 dlm_create_debug_file(ls); 583 dlm_create_debug_file(ls);
549 584
550 log_debug(ls, "join complete"); 585 log_debug(ls, "join complete");
551 586 ret_zero:
552 *lockspace = ls; 587 *lockspace = ls;
553 return 0; 588 return 0;
554 589
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
635 struct dlm_lkb *lkb; 670 struct dlm_lkb *lkb;
636 struct dlm_rsb *rsb; 671 struct dlm_rsb *rsb;
637 struct list_head *head; 672 struct list_head *head;
638 int i; 673 int i, busy, rv;
639 int busy = lockspace_busy(ls); 674
675 busy = lockspace_busy(ls);
676
677 spin_lock(&lslist_lock);
678 if (ls->ls_create_count == 1) {
679 if (busy > force)
680 rv = -EBUSY;
681 else {
682 /* remove_lockspace takes ls off lslist */
683 ls->ls_create_count = 0;
684 rv = 0;
685 }
686 } else if (ls->ls_create_count > 1) {
687 rv = --ls->ls_create_count;
688 } else {
689 rv = -EINVAL;
690 }
691 spin_unlock(&lslist_lock);
640 692
641 if (busy > force) 693 if (rv) {
642 return -EBUSY; 694 log_debug(ls, "release_lockspace no remove %d", rv);
695 return rv;
696 }
697
698 dlm_device_deregister(ls);
643 699
644 if (force < 3) 700 if (force < 3 && dlm_user_daemon_available())
645 do_uevent(ls, 0); 701 do_uevent(ls, 0);
646 702
647 dlm_recoverd_stop(ls); 703 dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
720 dlm_clear_members(ls); 776 dlm_clear_members(ls);
721 dlm_clear_members_gone(ls); 777 dlm_clear_members_gone(ls);
722 kfree(ls->ls_node_array); 778 kfree(ls->ls_node_array);
779 log_debug(ls, "release_lockspace final free");
723 kobject_put(&ls->ls_kobj); 780 kobject_put(&ls->ls_kobj);
724 /* The ls structure will be freed when the kobject is done with */ 781 /* The ls structure will be freed when the kobject is done with */
725 782
726 mutex_lock(&ls_lock);
727 ls_count--;
728 if (!ls_count)
729 threads_stop();
730 mutex_unlock(&ls_lock);
731
732 module_put(THIS_MODULE); 783 module_put(THIS_MODULE);
733 return 0; 784 return 0;
734} 785}
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
750int dlm_release_lockspace(void *lockspace, int force) 801int dlm_release_lockspace(void *lockspace, int force)
751{ 802{
752 struct dlm_ls *ls; 803 struct dlm_ls *ls;
804 int error;
753 805
754 ls = dlm_find_lockspace_local(lockspace); 806 ls = dlm_find_lockspace_local(lockspace);
755 if (!ls) 807 if (!ls)
756 return -EINVAL; 808 return -EINVAL;
757 dlm_put_lockspace(ls); 809 dlm_put_lockspace(ls);
758 return release_lockspace(ls, force); 810
811 mutex_lock(&ls_lock);
812 error = release_lockspace(ls, force);
813 if (!error)
814 ls_count--;
815 else if (!ls_count)
816 threads_stop();
817 mutex_unlock(&ls_lock);
818
819 return error;
820}
821
822void dlm_stop_lockspaces(void)
823{
824 struct dlm_ls *ls;
825
826 restart:
827 spin_lock(&lslist_lock);
828 list_for_each_entry(ls, &lslist, ls_list) {
829 if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
830 continue;
831 spin_unlock(&lslist_lock);
832 log_error(ls, "no userland control daemon, stopping lockspace");
833 dlm_ls_stop(ls);
834 goto restart;
835 }
836 spin_unlock(&lslist_lock);
759} 837}
760 838
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd021..f879f87901f8 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id); 20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor); 21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls); 22void dlm_put_lockspace(struct dlm_ls *ls);
23void dlm_stop_lockspaces(void);
23 24
24#endif /* __LOCKSPACE_DOT_H__ */ 25#endif /* __LOCKSPACE_DOT_H__ */
25 26
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 34f14a14fb4e..b3832c67194a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -15,7 +15,6 @@
15#include <linux/poll.h> 15#include <linux/poll.h>
16#include <linux/signal.h> 16#include <linux/signal.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/smp_lock.h>
19#include <linux/dlm.h> 18#include <linux/dlm.h>
20#include <linux/dlm_device.h> 19#include <linux/dlm_device.h>
21 20
@@ -27,6 +26,8 @@
27 26
28static const char name_prefix[] = "dlm"; 27static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 28static const struct file_operations device_fops;
29static atomic_t dlm_monitor_opened;
30static int dlm_monitor_unused = 1;
30 31
31#ifdef CONFIG_COMPAT 32#ifdef CONFIG_COMPAT
32 33
@@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
340 return error; 341 return error;
341} 342}
342 343
343static int create_misc_device(struct dlm_ls *ls, char *name) 344static int dlm_device_register(struct dlm_ls *ls, char *name)
344{ 345{
345 int error, len; 346 int error, len;
346 347
348 /* The device is already registered. This happens when the
349 lockspace is created multiple times from userspace. */
350 if (ls->ls_device.name)
351 return 0;
352
347 error = -ENOMEM; 353 error = -ENOMEM;
348 len = strlen(name) + strlen(name_prefix) + 2; 354 len = strlen(name) + strlen(name_prefix) + 2;
349 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 355 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +369,22 @@ fail:
363 return error; 369 return error;
364} 370}
365 371
372int dlm_device_deregister(struct dlm_ls *ls)
373{
374 int error;
375
376 /* The device is not registered. This happens when the lockspace
377 was never used from userspace, or when device_create_lockspace()
378 calls dlm_release_lockspace() after the register fails. */
379 if (!ls->ls_device.name)
380 return 0;
381
382 error = misc_deregister(&ls->ls_device);
383 if (!error)
384 kfree(ls->ls_device.name);
385 return error;
386}
387
366static int device_user_purge(struct dlm_user_proc *proc, 388static int device_user_purge(struct dlm_user_proc *proc,
367 struct dlm_purge_params *params) 389 struct dlm_purge_params *params)
368{ 390{
@@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
397 if (!ls) 419 if (!ls)
398 return -ENOENT; 420 return -ENOENT;
399 421
400 error = create_misc_device(ls, params->name); 422 error = dlm_device_register(ls, params->name);
401 dlm_put_lockspace(ls); 423 dlm_put_lockspace(ls);
402 424
403 if (error) 425 if (error)
@@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
421 if (!ls) 443 if (!ls)
422 return -ENOENT; 444 return -ENOENT;
423 445
424 /* Deregister the misc device first, so we don't have
425 * a device that's not attached to a lockspace. If
426 * dlm_release_lockspace fails then we can recreate it
427 */
428 error = misc_deregister(&ls->ls_device);
429 if (error) {
430 dlm_put_lockspace(ls);
431 goto out;
432 }
433 kfree(ls->ls_device.name);
434
435 if (params->flags & DLM_USER_LSFLG_FORCEFREE) 446 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
436 force = 2; 447 force = 2;
437 448
438 lockspace = ls->ls_local_handle; 449 lockspace = ls->ls_local_handle;
450 dlm_put_lockspace(ls);
439 451
440 /* dlm_release_lockspace waits for references to go to zero, 452 /* The final dlm_release_lockspace waits for references to go to
441 so all processes will need to close their device for the ls 453 zero, so all processes will need to close their device for the
442 before the release will procede */ 454 ls before the release will proceed. release also calls the
455 device_deregister above. Converting a positive return value
456 from release to zero means that userspace won't know when its
457 release was the final one, but it shouldn't need to know. */
443 458
444 dlm_put_lockspace(ls);
445 error = dlm_release_lockspace(lockspace, force); 459 error = dlm_release_lockspace(lockspace, force);
446 if (error) 460 if (error > 0)
447 create_misc_device(ls, ls->ls_name); 461 error = 0;
448 out:
449 return error; 462 return error;
450} 463}
451 464
@@ -623,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
623 struct dlm_user_proc *proc; 636 struct dlm_user_proc *proc;
624 struct dlm_ls *ls; 637 struct dlm_ls *ls;
625 638
626 lock_kernel();
627 ls = dlm_find_lockspace_device(iminor(inode)); 639 ls = dlm_find_lockspace_device(iminor(inode));
628 if (!ls) { 640 if (!ls)
629 unlock_kernel();
630 return -ENOENT; 641 return -ENOENT;
631 }
632 642
633 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 643 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
634 if (!proc) { 644 if (!proc) {
635 dlm_put_lockspace(ls); 645 dlm_put_lockspace(ls);
636 unlock_kernel();
637 return -ENOMEM; 646 return -ENOMEM;
638 } 647 }
639 648
@@ -645,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
645 spin_lock_init(&proc->locks_spin); 654 spin_lock_init(&proc->locks_spin);
646 init_waitqueue_head(&proc->wait); 655 init_waitqueue_head(&proc->wait);
647 file->private_data = proc; 656 file->private_data = proc;
648 unlock_kernel();
649 657
650 return 0; 658 return 0;
651} 659}
@@ -878,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
878 return 0; 886 return 0;
879} 887}
880 888
889int dlm_user_daemon_available(void)
890{
891 /* dlm_controld hasn't started (or, has started, but not
892 properly populated configfs) */
893
894 if (!dlm_our_nodeid())
895 return 0;
896
897 /* This is to deal with versions of dlm_controld that don't
898 know about the monitor device. We assume that if the
899 dlm_controld was started (above), but the monitor device
900 was never opened, that it's an old version. dlm_controld
901 should open the monitor device before populating configfs. */
902
903 if (dlm_monitor_unused)
904 return 1;
905
906 return atomic_read(&dlm_monitor_opened) ? 1 : 0;
907}
908
881static int ctl_device_open(struct inode *inode, struct file *file) 909static int ctl_device_open(struct inode *inode, struct file *file)
882{ 910{
883 cycle_kernel_lock();
884 file->private_data = NULL; 911 file->private_data = NULL;
885 return 0; 912 return 0;
886} 913}
@@ -890,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
890 return 0; 917 return 0;
891} 918}
892 919
920static int monitor_device_open(struct inode *inode, struct file *file)
921{
922 atomic_inc(&dlm_monitor_opened);
923 dlm_monitor_unused = 0;
924 return 0;
925}
926
927static int monitor_device_close(struct inode *inode, struct file *file)
928{
929 if (atomic_dec_and_test(&dlm_monitor_opened))
930 dlm_stop_lockspaces();
931 return 0;
932}
933
893static const struct file_operations device_fops = { 934static const struct file_operations device_fops = {
894 .open = device_open, 935 .open = device_open,
895 .release = device_close, 936 .release = device_close,
@@ -913,19 +954,42 @@ static struct miscdevice ctl_device = {
913 .minor = MISC_DYNAMIC_MINOR, 954 .minor = MISC_DYNAMIC_MINOR,
914}; 955};
915 956
957static const struct file_operations monitor_device_fops = {
958 .open = monitor_device_open,
959 .release = monitor_device_close,
960 .owner = THIS_MODULE,
961};
962
963static struct miscdevice monitor_device = {
964 .name = "dlm-monitor",
965 .fops = &monitor_device_fops,
966 .minor = MISC_DYNAMIC_MINOR,
967};
968
916int __init dlm_user_init(void) 969int __init dlm_user_init(void)
917{ 970{
918 int error; 971 int error;
919 972
973 atomic_set(&dlm_monitor_opened, 0);
974
920 error = misc_register(&ctl_device); 975 error = misc_register(&ctl_device);
921 if (error) 976 if (error) {
922 log_print("misc_register failed for control device"); 977 log_print("misc_register failed for control device");
978 goto out;
979 }
923 980
981 error = misc_register(&monitor_device);
982 if (error) {
983 log_print("misc_register failed for monitor device");
984 misc_deregister(&ctl_device);
985 }
986 out:
924 return error; 987 return error;
925} 988}
926 989
927void dlm_user_exit(void) 990void dlm_user_exit(void)
928{ 991{
929 misc_deregister(&ctl_device); 992 misc_deregister(&ctl_device);
993 misc_deregister(&monitor_device);
930} 994}
931 995
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e4151..35eb6a13d616 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls);
16int dlm_user_daemon_available(void);
15 17
16#endif 18#endif
diff --git a/fs/dquot.c b/fs/dquot.c
index 8ec4d6cc7633..da30a27f2242 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -9,8 +9,6 @@
9 * implementation is based on one of the several variants of the LINUX 9 * implementation is based on one of the several variants of the LINUX
10 * inode-subsystem with added complexity of the diskquota system. 10 * inode-subsystem with added complexity of the diskquota system.
11 * 11 *
12 * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $
13 *
14 * Author: Marco van Wieringen <mvw@planets.elm.net> 12 * Author: Marco van Wieringen <mvw@planets.elm.net>
15 * 13 *
16 * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 14 * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
@@ -895,10 +893,9 @@ static void print_warning(struct dquot *dquot, const int warntype)
895 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) 893 warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
896 return; 894 return;
897 895
898 mutex_lock(&tty_mutex);
899 tty = get_current_tty(); 896 tty = get_current_tty();
900 if (!tty) 897 if (!tty)
901 goto out_lock; 898 return;
902 tty_write_message(tty, dquot->dq_sb->s_id); 899 tty_write_message(tty, dquot->dq_sb->s_id);
903 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) 900 if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
904 tty_write_message(tty, ": warning, "); 901 tty_write_message(tty, ": warning, ");
@@ -926,8 +923,7 @@ static void print_warning(struct dquot *dquot, const int warntype)
926 break; 923 break;
927 } 924 }
928 tty_write_message(tty, msg); 925 tty_write_message(tty, msg);
929out_lock: 926 tty_kref_put(tty);
930 mutex_unlock(&tty_mutex);
931} 927}
932#endif 928#endif
933 929
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index b4755a85996e..2cc9ee4ad2eb 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -4,4 +4,4 @@
4 4
5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o 5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
6 6
7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o 7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b73fb752c5f8..3504cf9df358 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -79,11 +79,6 @@
79#define ECRYPTFS_MAX_PKI_NAME_BYTES 16 79#define ECRYPTFS_MAX_PKI_NAME_BYTES 16
80#define ECRYPTFS_DEFAULT_NUM_USERS 4 80#define ECRYPTFS_DEFAULT_NUM_USERS 4
81#define ECRYPTFS_MAX_NUM_USERS 32768 81#define ECRYPTFS_MAX_NUM_USERS 32768
82#define ECRYPTFS_TRANSPORT_NETLINK 0
83#define ECRYPTFS_TRANSPORT_CONNECTOR 1
84#define ECRYPTFS_TRANSPORT_RELAYFS 2
85#define ECRYPTFS_TRANSPORT_MISCDEV 3
86#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV
87#define ECRYPTFS_XATTR_NAME "user.ecryptfs" 82#define ECRYPTFS_XATTR_NAME "user.ecryptfs"
88 83
89#define RFC2440_CIPHER_DES3_EDE 0x02 84#define RFC2440_CIPHER_DES3_EDE 0x02
@@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx {
400 struct mutex mux; 395 struct mutex mux;
401}; 396};
402 397
403extern unsigned int ecryptfs_transport;
404
405struct ecryptfs_daemon; 398struct ecryptfs_daemon;
406 399
407struct ecryptfs_daemon { 400struct ecryptfs_daemon {
@@ -627,31 +620,20 @@ int
627ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, 620ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
628 size_t size, int flags); 621 size_t size, int flags);
629int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); 622int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
630int ecryptfs_process_helo(unsigned int transport, uid_t euid, 623int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
631 struct user_namespace *user_ns, struct pid *pid); 624 struct pid *pid);
632int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, 625int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
633 struct pid *pid); 626 struct pid *pid);
634int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, 627int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
635 struct user_namespace *user_ns, struct pid *pid, 628 struct user_namespace *user_ns, struct pid *pid,
636 u32 seq); 629 u32 seq);
637int ecryptfs_send_message(unsigned int transport, char *data, int data_len, 630int ecryptfs_send_message(char *data, int data_len,
638 struct ecryptfs_msg_ctx **msg_ctx); 631 struct ecryptfs_msg_ctx **msg_ctx);
639int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, 632int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
640 struct ecryptfs_message **emsg); 633 struct ecryptfs_message **emsg);
641int ecryptfs_init_messaging(unsigned int transport); 634int ecryptfs_init_messaging(void);
642void ecryptfs_release_messaging(unsigned int transport); 635void ecryptfs_release_messaging(void);
643 636
644int ecryptfs_send_netlink(char *data, int data_len,
645 struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
646 u16 msg_flags, struct pid *daemon_pid);
647int ecryptfs_init_netlink(void);
648void ecryptfs_release_netlink(void);
649
650int ecryptfs_send_connector(char *data, int data_len,
651 struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
652 u16 msg_flags, struct pid *daemon_pid);
653int ecryptfs_init_connector(void);
654void ecryptfs_release_connector(void);
655void 637void
656ecryptfs_write_header_metadata(char *virt, 638ecryptfs_write_header_metadata(char *virt,
657 struct ecryptfs_crypt_stat *crypt_stat, 639 struct ecryptfs_crypt_stat *crypt_stat,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 9244d653743e..eb3dc4c7ac06 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback {
71 void *dirent; 71 void *dirent;
72 struct dentry *dentry; 72 struct dentry *dentry;
73 filldir_t filldir; 73 filldir_t filldir;
74 int err;
75 int filldir_called; 74 int filldir_called;
76 int entries_written; 75 int entries_written;
77}; 76};
78 77
79/* Inspired by generic filldir in fs/readir.c */ 78/* Inspired by generic filldir in fs/readdir.c */
80static int 79static int
81ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, 80ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
82 u64 ino, unsigned int d_type) 81 u64 ino, unsigned int d_type)
@@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
125 buf.dirent = dirent; 124 buf.dirent = dirent;
126 buf.dentry = file->f_path.dentry; 125 buf.dentry = file->f_path.dentry;
127 buf.filldir = filldir; 126 buf.filldir = filldir;
128retry:
129 buf.filldir_called = 0; 127 buf.filldir_called = 0;
130 buf.entries_written = 0; 128 buf.entries_written = 0;
131 buf.err = 0;
132 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); 129 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
133 if (buf.err)
134 rc = buf.err;
135 if (buf.filldir_called && !buf.entries_written)
136 goto retry;
137 file->f_pos = lower_file->f_pos; 130 file->f_pos = lower_file->f_pos;
131 if (rc < 0)
132 goto out;
133 if (buf.filldir_called && !buf.entries_written)
134 goto out;
138 if (rc >= 0) 135 if (rc >= 0)
139 fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode); 136 fsstack_copy_attr_atime(inode,
137 lower_file->f_path.dentry->d_inode);
138out:
140 return rc; 139 return rc;
141} 140}
142 141
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index f5b76a331b9c..e22bc3961345 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code,
234 } 234 }
235 i += data_len; 235 i += data_len;
236 if (message_len < (i + m_size)) { 236 if (message_len < (i + m_size)) {
237 ecryptfs_printk(KERN_ERR, "The received netlink message is " 237 ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd "
238 "shorter than expected\n"); 238 "is shorter than expected\n");
239 rc = -EIO; 239 rc = -EIO;
240 goto out; 240 goto out;
241 } 241 }
@@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
438 struct ecryptfs_msg_ctx *msg_ctx; 438 struct ecryptfs_msg_ctx *msg_ctx;
439 struct ecryptfs_message *msg = NULL; 439 struct ecryptfs_message *msg = NULL;
440 char *auth_tok_sig; 440 char *auth_tok_sig;
441 char *netlink_message; 441 char *payload;
442 size_t netlink_message_length; 442 size_t payload_len;
443 int rc; 443 int rc;
444 444
445 rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); 445 rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
@@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
449 goto out; 449 goto out;
450 } 450 }
451 rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), 451 rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key),
452 &netlink_message, &netlink_message_length); 452 &payload, &payload_len);
453 if (rc) { 453 if (rc) {
454 ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); 454 ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n");
455 goto out; 455 goto out;
456 } 456 }
457 rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, 457 rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
458 netlink_message_length, &msg_ctx);
459 if (rc) { 458 if (rc) {
460 ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); 459 ecryptfs_printk(KERN_ERR, "Error sending message to "
460 "ecryptfsd\n");
461 goto out; 461 goto out;
462 } 462 }
463 rc = ecryptfs_wait_for_response(msg_ctx, &msg); 463 rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1333,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1333 struct ecryptfs_key_record *key_rec) 1333 struct ecryptfs_key_record *key_rec)
1334{ 1334{
1335 struct ecryptfs_msg_ctx *msg_ctx = NULL; 1335 struct ecryptfs_msg_ctx *msg_ctx = NULL;
1336 char *netlink_payload; 1336 char *payload = NULL;
1337 size_t netlink_payload_length; 1337 size_t payload_len;
1338 struct ecryptfs_message *msg; 1338 struct ecryptfs_message *msg;
1339 int rc; 1339 int rc;
1340 1340
1341 rc = write_tag_66_packet(auth_tok->token.private_key.signature, 1341 rc = write_tag_66_packet(auth_tok->token.private_key.signature,
1342 ecryptfs_code_for_cipher_string(crypt_stat), 1342 ecryptfs_code_for_cipher_string(crypt_stat),
1343 crypt_stat, &netlink_payload, 1343 crypt_stat, &payload, &payload_len);
1344 &netlink_payload_length);
1345 if (rc) { 1344 if (rc) {
1346 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1345 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
1347 goto out; 1346 goto out;
1348 } 1347 }
1349 rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload, 1348 rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
1350 netlink_payload_length, &msg_ctx);
1351 if (rc) { 1349 if (rc) {
1352 ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); 1350 ecryptfs_printk(KERN_ERR, "Error sending message to "
1351 "ecryptfsd\n");
1353 goto out; 1352 goto out;
1354 } 1353 }
1355 rc = ecryptfs_wait_for_response(msg_ctx, &msg); 1354 rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1364,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1364 ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); 1363 ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n");
1365 kfree(msg); 1364 kfree(msg);
1366out: 1365out:
1367 if (netlink_payload) 1366 kfree(payload);
1368 kfree(netlink_payload);
1369 return rc; 1367 return rc;
1370} 1368}
1371/** 1369/**
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 448dfd597b5f..046e027a4cb1 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -30,7 +30,6 @@
30#include <linux/namei.h> 30#include <linux/namei.h>
31#include <linux/skbuff.h> 31#include <linux/skbuff.h>
32#include <linux/crypto.h> 32#include <linux/crypto.h>
33#include <linux/netlink.h>
34#include <linux/mount.h> 33#include <linux/mount.h>
35#include <linux/pagemap.h> 34#include <linux/pagemap.h>
36#include <linux/key.h> 35#include <linux/key.h>
@@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
49 "0, which is Quiet)"); 48 "0, which is Quiet)");
50 49
51/** 50/**
52 * Module parameter that defines the number of netlink message buffer 51 * Module parameter that defines the number of message buffer elements
53 * elements
54 */ 52 */
55unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; 53unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
56 54
@@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len,
60 58
61/** 59/**
62 * Module parameter that defines the maximum guaranteed amount of time to wait 60 * Module parameter that defines the maximum guaranteed amount of time to wait
63 * for a response through netlink. The actual sleep time will be, more than 61 * for a response from ecryptfsd. The actual sleep time will be, more than
64 * likely, a small amount greater than this specified value, but only less if 62 * likely, a small amount greater than this specified value, but only less if
65 * the netlink message successfully arrives. 63 * the message successfully arrives.
66 */ 64 */
67signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; 65signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ;
68 66
@@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0);
83MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " 81MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of "
84 "concurrent users of eCryptfs"); 82 "concurrent users of eCryptfs");
85 83
86unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT;
87
88void __ecryptfs_printk(const char *fmt, ...) 84void __ecryptfs_printk(const char *fmt, ...)
89{ 85{
90 va_list args; 86 va_list args;
@@ -211,7 +207,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
211 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 207 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
212 ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; 208 ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
213 209
214static match_table_t tokens = { 210static const match_table_t tokens = {
215 {ecryptfs_opt_sig, "sig=%s"}, 211 {ecryptfs_opt_sig, "sig=%s"},
216 {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, 212 {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
217 {ecryptfs_opt_cipher, "cipher=%s"}, 213 {ecryptfs_opt_cipher, "cipher=%s"},
@@ -779,10 +775,11 @@ static int __init ecryptfs_init(void)
779 "rc = [%d]\n", __func__, rc); 775 "rc = [%d]\n", __func__, rc);
780 goto out_do_sysfs_unregistration; 776 goto out_do_sysfs_unregistration;
781 } 777 }
782 rc = ecryptfs_init_messaging(ecryptfs_transport); 778 rc = ecryptfs_init_messaging();
783 if (rc) { 779 if (rc) {
784 printk(KERN_ERR "Failure occured while attempting to " 780 printk(KERN_ERR "Failure occured while attempting to "
785 "initialize the eCryptfs netlink socket\n"); 781 "initialize the communications channel to "
782 "ecryptfsd\n");
786 goto out_destroy_kthread; 783 goto out_destroy_kthread;
787 } 784 }
788 rc = ecryptfs_init_crypto(); 785 rc = ecryptfs_init_crypto();
@@ -797,7 +794,7 @@ static int __init ecryptfs_init(void)
797 794
798 goto out; 795 goto out;
799out_release_messaging: 796out_release_messaging:
800 ecryptfs_release_messaging(ecryptfs_transport); 797 ecryptfs_release_messaging();
801out_destroy_kthread: 798out_destroy_kthread:
802 ecryptfs_destroy_kthread(); 799 ecryptfs_destroy_kthread();
803out_do_sysfs_unregistration: 800out_do_sysfs_unregistration:
@@ -818,7 +815,7 @@ static void __exit ecryptfs_exit(void)
818 if (rc) 815 if (rc)
819 printk(KERN_ERR "Failure whilst attempting to destroy crypto; " 816 printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
820 "rc = [%d]\n", rc); 817 "rc = [%d]\n", rc);
821 ecryptfs_release_messaging(ecryptfs_transport); 818 ecryptfs_release_messaging();
822 ecryptfs_destroy_kthread(); 819 ecryptfs_destroy_kthread();
823 do_sysfs_unregistration(); 820 do_sysfs_unregistration();
824 unregister_filesystem(&ecryptfs_fs_type); 821 unregister_filesystem(&ecryptfs_fs_type);
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 1b5c20058acb..c6983978a31e 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -134,12 +134,11 @@ out:
134} 134}
135 135
136static int 136static int
137ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, 137ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
138 u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); 138 struct ecryptfs_msg_ctx **msg_ctx);
139 139
140/** 140/**
141 * ecryptfs_send_raw_message 141 * ecryptfs_send_raw_message
142 * @transport: Transport type
143 * @msg_type: Message type 142 * @msg_type: Message type
144 * @daemon: Daemon struct for recipient of message 143 * @daemon: Daemon struct for recipient of message
145 * 144 *
@@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
150 * 149 *
151 * Returns zero on success; non-zero otherwise 150 * Returns zero on success; non-zero otherwise
152 */ 151 */
153static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, 152static int ecryptfs_send_raw_message(u8 msg_type,
154 struct ecryptfs_daemon *daemon) 153 struct ecryptfs_daemon *daemon)
155{ 154{
156 struct ecryptfs_msg_ctx *msg_ctx; 155 struct ecryptfs_msg_ctx *msg_ctx;
157 int rc; 156 int rc;
158 157
159 switch(transport) { 158 rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
160 case ECRYPTFS_TRANSPORT_NETLINK: 159 if (rc) {
161 rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, 160 printk(KERN_ERR "%s: Error whilst attempting to send "
162 daemon->pid); 161 "message to ecryptfsd; rc = [%d]\n", __func__, rc);
163 break; 162 goto out;
164 case ECRYPTFS_TRANSPORT_MISCDEV:
165 rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type,
166 &msg_ctx);
167 if (rc) {
168 printk(KERN_ERR "%s: Error whilst attempting to send "
169 "message via procfs; rc = [%d]\n", __func__, rc);
170 goto out;
171 }
172 /* Raw messages are logically context-free (e.g., no
173 * reply is expected), so we set the state of the
174 * ecryptfs_msg_ctx object to indicate that it should
175 * be freed as soon as the transport sends out the message. */
176 mutex_lock(&msg_ctx->mux);
177 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
178 mutex_unlock(&msg_ctx->mux);
179 break;
180 case ECRYPTFS_TRANSPORT_CONNECTOR:
181 case ECRYPTFS_TRANSPORT_RELAYFS:
182 default:
183 rc = -ENOSYS;
184 } 163 }
164 /* Raw messages are logically context-free (e.g., no
165 * reply is expected), so we set the state of the
166 * ecryptfs_msg_ctx object to indicate that it should
167 * be freed as soon as the message is sent. */
168 mutex_lock(&msg_ctx->mux);
169 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
170 mutex_unlock(&msg_ctx->mux);
185out: 171out:
186 return rc; 172 return rc;
187} 173}
@@ -227,7 +213,6 @@ out:
227 213
228/** 214/**
229 * ecryptfs_process_helo 215 * ecryptfs_process_helo
230 * @transport: The underlying transport (netlink, etc.)
231 * @euid: The user ID owner of the message 216 * @euid: The user ID owner of the message
232 * @user_ns: The namespace in which @euid applies 217 * @user_ns: The namespace in which @euid applies
233 * @pid: The process ID for the userspace program that sent the 218 * @pid: The process ID for the userspace program that sent the
@@ -239,8 +224,8 @@ out:
239 * Returns zero after adding a new daemon to the hash list; 224 * Returns zero after adding a new daemon to the hash list;
240 * non-zero otherwise. 225 * non-zero otherwise.
241 */ 226 */
242int ecryptfs_process_helo(unsigned int transport, uid_t euid, 227int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
243 struct user_namespace *user_ns, struct pid *pid) 228 struct pid *pid)
244{ 229{
245 struct ecryptfs_daemon *new_daemon; 230 struct ecryptfs_daemon *new_daemon;
246 struct ecryptfs_daemon *old_daemon; 231 struct ecryptfs_daemon *old_daemon;
@@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid,
252 printk(KERN_WARNING "Received request from user [%d] " 237 printk(KERN_WARNING "Received request from user [%d] "
253 "to register daemon [0x%p]; unregistering daemon " 238 "to register daemon [0x%p]; unregistering daemon "
254 "[0x%p]\n", euid, pid, old_daemon->pid); 239 "[0x%p]\n", euid, pid, old_daemon->pid);
255 rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, 240 rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
256 old_daemon);
257 if (rc) 241 if (rc)
258 printk(KERN_WARNING "Failed to send QUIT " 242 printk(KERN_WARNING "Failed to send QUIT "
259 "message to daemon [0x%p]; rc = [%d]\n", 243 "message to daemon [0x%p]; rc = [%d]\n",
@@ -467,8 +451,6 @@ out:
467 451
468/** 452/**
469 * ecryptfs_send_message_locked 453 * ecryptfs_send_message_locked
470 * @transport: The transport over which to send the message (i.e.,
471 * netlink)
472 * @data: The data to send 454 * @data: The data to send
473 * @data_len: The length of data 455 * @data_len: The length of data
474 * @msg_ctx: The message context allocated for the send 456 * @msg_ctx: The message context allocated for the send
@@ -478,8 +460,8 @@ out:
478 * Returns zero on success; non-zero otherwise 460 * Returns zero on success; non-zero otherwise
479 */ 461 */
480static int 462static int
481ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, 463ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
482 u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) 464 struct ecryptfs_msg_ctx **msg_ctx)
483{ 465{
484 struct ecryptfs_daemon *daemon; 466 struct ecryptfs_daemon *daemon;
485 int rc; 467 int rc;
@@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
503 ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); 485 ecryptfs_msg_ctx_free_to_alloc(*msg_ctx);
504 mutex_unlock(&(*msg_ctx)->mux); 486 mutex_unlock(&(*msg_ctx)->mux);
505 mutex_unlock(&ecryptfs_msg_ctx_lists_mux); 487 mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
506 switch (transport) { 488 rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0,
507 case ECRYPTFS_TRANSPORT_NETLINK: 489 daemon);
508 rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type,
509 0, daemon->pid);
510 break;
511 case ECRYPTFS_TRANSPORT_MISCDEV:
512 rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type,
513 0, daemon);
514 break;
515 case ECRYPTFS_TRANSPORT_CONNECTOR:
516 case ECRYPTFS_TRANSPORT_RELAYFS:
517 default:
518 rc = -ENOSYS;
519 }
520 if (rc) 490 if (rc)
521 printk(KERN_ERR "%s: Error attempting to send message to " 491 printk(KERN_ERR "%s: Error attempting to send message to "
522 "userspace daemon; rc = [%d]\n", __func__, rc); 492 "userspace daemon; rc = [%d]\n", __func__, rc);
@@ -526,8 +496,6 @@ out:
526 496
527/** 497/**
528 * ecryptfs_send_message 498 * ecryptfs_send_message
529 * @transport: The transport over which to send the message (i.e.,
530 * netlink)
531 * @data: The data to send 499 * @data: The data to send
532 * @data_len: The length of data 500 * @data_len: The length of data
533 * @msg_ctx: The message context allocated for the send 501 * @msg_ctx: The message context allocated for the send
@@ -536,14 +504,14 @@ out:
536 * 504 *
537 * Returns zero on success; non-zero otherwise 505 * Returns zero on success; non-zero otherwise
538 */ 506 */
539int ecryptfs_send_message(unsigned int transport, char *data, int data_len, 507int ecryptfs_send_message(char *data, int data_len,
540 struct ecryptfs_msg_ctx **msg_ctx) 508 struct ecryptfs_msg_ctx **msg_ctx)
541{ 509{
542 int rc; 510 int rc;
543 511
544 mutex_lock(&ecryptfs_daemon_hash_mux); 512 mutex_lock(&ecryptfs_daemon_hash_mux);
545 rc = ecryptfs_send_message_locked(transport, data, data_len, 513 rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST,
546 ECRYPTFS_MSG_REQUEST, msg_ctx); 514 msg_ctx);
547 mutex_unlock(&ecryptfs_daemon_hash_mux); 515 mutex_unlock(&ecryptfs_daemon_hash_mux);
548 return rc; 516 return rc;
549} 517}
@@ -586,7 +554,7 @@ sleep:
586 return rc; 554 return rc;
587} 555}
588 556
589int ecryptfs_init_messaging(unsigned int transport) 557int ecryptfs_init_messaging(void)
590{ 558{
591 int i; 559 int i;
592 int rc = 0; 560 int rc = 0;
@@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport)
639 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); 607 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
640 } 608 }
641 mutex_unlock(&ecryptfs_msg_ctx_lists_mux); 609 mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
642 switch(transport) { 610 rc = ecryptfs_init_ecryptfs_miscdev();
643 case ECRYPTFS_TRANSPORT_NETLINK: 611 if (rc)
644 rc = ecryptfs_init_netlink(); 612 ecryptfs_release_messaging();
645 if (rc)
646 ecryptfs_release_messaging(transport);
647 break;
648 case ECRYPTFS_TRANSPORT_MISCDEV:
649 rc = ecryptfs_init_ecryptfs_miscdev();
650 if (rc)
651 ecryptfs_release_messaging(transport);
652 break;
653 case ECRYPTFS_TRANSPORT_CONNECTOR:
654 case ECRYPTFS_TRANSPORT_RELAYFS:
655 default:
656 rc = -ENOSYS;
657 }
658out: 613out:
659 return rc; 614 return rc;
660} 615}
661 616
662void ecryptfs_release_messaging(unsigned int transport) 617void ecryptfs_release_messaging(void)
663{ 618{
664 if (ecryptfs_msg_ctx_arr) { 619 if (ecryptfs_msg_ctx_arr) {
665 int i; 620 int i;
@@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport)
698 kfree(ecryptfs_daemon_hash); 653 kfree(ecryptfs_daemon_hash);
699 mutex_unlock(&ecryptfs_daemon_hash_mux); 654 mutex_unlock(&ecryptfs_daemon_hash_mux);
700 } 655 }
701 switch(transport) { 656 ecryptfs_destroy_ecryptfs_miscdev();
702 case ECRYPTFS_TRANSPORT_NETLINK:
703 ecryptfs_release_netlink();
704 break;
705 case ECRYPTFS_TRANSPORT_MISCDEV:
706 ecryptfs_destroy_ecryptfs_miscdev();
707 break;
708 case ECRYPTFS_TRANSPORT_CONNECTOR:
709 case ECRYPTFS_TRANSPORT_RELAYFS:
710 default:
711 break;
712 }
713 return; 657 return;
714} 658}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 245c2dc02d5c..04d7b3fa1ac6 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -265,22 +265,34 @@ out:
265} 265}
266 266
267/** 267/**
268 * ecryptfs_prepare_write 268 * ecryptfs_write_begin
269 * @file: The eCryptfs file 269 * @file: The eCryptfs file
270 * @page: The eCryptfs page 270 * @mapping: The eCryptfs address space (page cache mapping)
271 * @from: The start byte from which we will write 271 * @pos: The file offset at which to start writing
272 * @to: The end byte to which we will write 272 * @len: Length of the write
273 * @flags: Various flags
274 * @pagep: Pointer to return the page
275 * @fsdata: Pointer to return fs data (unused)
273 * 276 *
274 * This function must zero any hole we create 277 * This function must zero any hole we create
275 * 278 *
276 * Returns zero on success; non-zero otherwise 279 * Returns zero on success; non-zero otherwise
277 */ 280 */
278static int ecryptfs_prepare_write(struct file *file, struct page *page, 281static int ecryptfs_write_begin(struct file *file,
279 unsigned from, unsigned to) 282 struct address_space *mapping,
283 loff_t pos, unsigned len, unsigned flags,
284 struct page **pagep, void **fsdata)
280{ 285{
286 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
287 struct page *page;
281 loff_t prev_page_end_size; 288 loff_t prev_page_end_size;
282 int rc = 0; 289 int rc = 0;
283 290
291 page = __grab_cache_page(mapping, index);
292 if (!page)
293 return -ENOMEM;
294 *pagep = page;
295
284 if (!PageUptodate(page)) { 296 if (!PageUptodate(page)) {
285 struct ecryptfs_crypt_stat *crypt_stat = 297 struct ecryptfs_crypt_stat *crypt_stat =
286 &ecryptfs_inode_to_private( 298 &ecryptfs_inode_to_private(
@@ -289,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
289 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 301 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
290 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 302 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
291 rc = ecryptfs_read_lower_page_segment( 303 rc = ecryptfs_read_lower_page_segment(
292 page, page->index, 0, PAGE_CACHE_SIZE, 304 page, index, 0, PAGE_CACHE_SIZE, mapping->host);
293 page->mapping->host);
294 if (rc) { 305 if (rc) {
295 printk(KERN_ERR "%s: Error attemping to read " 306 printk(KERN_ERR "%s: Error attemping to read "
296 "lower page segment; rc = [%d]\n", 307 "lower page segment; rc = [%d]\n",
@@ -316,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
316 SetPageUptodate(page); 327 SetPageUptodate(page);
317 } else { 328 } else {
318 rc = ecryptfs_read_lower_page_segment( 329 rc = ecryptfs_read_lower_page_segment(
319 page, page->index, 0, PAGE_CACHE_SIZE, 330 page, index, 0, PAGE_CACHE_SIZE,
320 page->mapping->host); 331 mapping->host);
321 if (rc) { 332 if (rc) {
322 printk(KERN_ERR "%s: Error reading " 333 printk(KERN_ERR "%s: Error reading "
323 "page; rc = [%d]\n", 334 "page; rc = [%d]\n",
@@ -339,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
339 SetPageUptodate(page); 350 SetPageUptodate(page);
340 } 351 }
341 } 352 }
342 prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT); 353 prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
343 /* If creating a page or more of holes, zero them out via truncate. 354 /* If creating a page or more of holes, zero them out via truncate.
344 * Note, this will increase i_size. */ 355 * Note, this will increase i_size. */
345 if (page->index != 0) { 356 if (index != 0) {
346 if (prev_page_end_size > i_size_read(page->mapping->host)) { 357 if (prev_page_end_size > i_size_read(page->mapping->host)) {
347 rc = ecryptfs_truncate(file->f_path.dentry, 358 rc = ecryptfs_truncate(file->f_path.dentry,
348 prev_page_end_size); 359 prev_page_end_size);
@@ -357,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
357 } 368 }
358 /* Writing to a new page, and creating a small hole from start 369 /* Writing to a new page, and creating a small hole from start
359 * of page? Zero it out. */ 370 * of page? Zero it out. */
360 if ((i_size_read(page->mapping->host) == prev_page_end_size) 371 if ((i_size_read(mapping->host) == prev_page_end_size)
361 && (from != 0)) 372 && (pos != 0))
362 zero_user(page, 0, PAGE_CACHE_SIZE); 373 zero_user(page, 0, PAGE_CACHE_SIZE);
363out: 374out:
364 return rc; 375 return rc;
@@ -445,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
445} 456}
446 457
447/** 458/**
448 * ecryptfs_commit_write 459 * ecryptfs_write_end
449 * @file: The eCryptfs file object 460 * @file: The eCryptfs file object
461 * @mapping: The eCryptfs address space (page cache mapping)
462 * @pos: The file position
463 * @len: The length of the data (unused)
464 * @copied: The amount of data copied
450 * @page: The eCryptfs page 465 * @page: The eCryptfs page
451 * @from: Ignored (we rotate the page IV on each write) 466 * @fsdata: The fsdata (unused)
452 * @to: Ignored
453 * 467 *
454 * This is where we encrypt the data and pass the encrypted data to 468 * This is where we encrypt the data and pass the encrypted data to
455 * the lower filesystem. In OpenPGP-compatible mode, we operate on 469 * the lower filesystem. In OpenPGP-compatible mode, we operate on
456 * entire underlying packets. 470 * entire underlying packets.
457 */ 471 */
458static int ecryptfs_commit_write(struct file *file, struct page *page, 472static int ecryptfs_write_end(struct file *file,
459 unsigned from, unsigned to) 473 struct address_space *mapping,
474 loff_t pos, unsigned len, unsigned copied,
475 struct page *page, void *fsdata)
460{ 476{
461 loff_t pos; 477 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
462 struct inode *ecryptfs_inode = page->mapping->host; 478 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
479 unsigned to = from + copied;
480 struct inode *ecryptfs_inode = mapping->host;
463 struct ecryptfs_crypt_stat *crypt_stat = 481 struct ecryptfs_crypt_stat *crypt_stat =
464 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 482 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
465 int rc; 483 int rc;
@@ -471,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
471 } else 489 } else
472 ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); 490 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
473 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 491 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
474 "(page w/ index = [0x%.16x], to = [%d])\n", page->index, 492 "(page w/ index = [0x%.16x], to = [%d])\n", index, to);
475 to);
476 /* Fills in zeros if 'to' goes beyond inode size */ 493 /* Fills in zeros if 'to' goes beyond inode size */
477 rc = fill_zeros_to_end_of_page(page, to); 494 rc = fill_zeros_to_end_of_page(page, to);
478 if (rc) { 495 if (rc) {
479 ecryptfs_printk(KERN_WARNING, "Error attempting to fill " 496 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
480 "zeros in page with index = [0x%.16x]\n", 497 "zeros in page with index = [0x%.16x]\n", index);
481 page->index);
482 goto out; 498 goto out;
483 } 499 }
484 rc = ecryptfs_encrypt_page(page); 500 rc = ecryptfs_encrypt_page(page);
485 if (rc) { 501 if (rc) {
486 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 502 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
487 "index [0x%.16x])\n", page->index); 503 "index [0x%.16x])\n", index);
488 goto out; 504 goto out;
489 } 505 }
490 pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to; 506 if (pos + copied > i_size_read(ecryptfs_inode)) {
491 if (pos > i_size_read(ecryptfs_inode)) { 507 i_size_write(ecryptfs_inode, pos + copied);
492 i_size_write(ecryptfs_inode, pos);
493 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 508 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
494 "[0x%.16x]\n", i_size_read(ecryptfs_inode)); 509 "[0x%.16x]\n", i_size_read(ecryptfs_inode));
495 } 510 }
@@ -497,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
497 if (rc) 512 if (rc)
498 printk(KERN_ERR "Error writing inode size to metadata; " 513 printk(KERN_ERR "Error writing inode size to metadata; "
499 "rc = [%d]\n", rc); 514 "rc = [%d]\n", rc);
515 else
516 rc = copied;
500out: 517out:
518 unlock_page(page);
519 page_cache_release(page);
501 return rc; 520 return rc;
502} 521}
503 522
@@ -518,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
518struct address_space_operations ecryptfs_aops = { 537struct address_space_operations ecryptfs_aops = {
519 .writepage = ecryptfs_writepage, 538 .writepage = ecryptfs_writepage,
520 .readpage = ecryptfs_readpage, 539 .readpage = ecryptfs_readpage,
521 .prepare_write = ecryptfs_prepare_write, 540 .write_begin = ecryptfs_write_begin,
522 .commit_write = ecryptfs_commit_write, 541 .write_end = ecryptfs_write_end,
523 .bmap = ecryptfs_bmap, 542 .bmap = ecryptfs_bmap,
524}; 543};
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
deleted file mode 100644
index e0abad62b395..000000000000
--- a/fs/ecryptfs/netlink.c
+++ /dev/null
@@ -1,249 +0,0 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 2004-2006 International Business Machines Corp.
5 * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
6 * Tyler Hicks <tyhicks@ou.edu>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version
10 * 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 * 02111-1307, USA.
21 */
22
23#include <net/sock.h>
24#include <linux/hash.h>
25#include <linux/random.h>
26#include "ecryptfs_kernel.h"
27
28static struct sock *ecryptfs_nl_sock;
29
30/**
31 * ecryptfs_send_netlink
32 * @data: The data to include as the payload
33 * @data_len: The byte count of the data
34 * @msg_ctx: The netlink context that will be used to handle the
35 * response message
36 * @msg_type: The type of netlink message to send
37 * @msg_flags: The flags to include in the netlink header
38 * @daemon_pid: The process id of the daemon to send the message to
39 *
40 * Sends the data to the specified daemon pid and uses the netlink
41 * context element to store the data needed for validation upon
42 * receiving the response. The data and the netlink context can be
43 * null if just sending a netlink header is sufficient. Returns zero
44 * upon sending the message; non-zero upon error.
45 */
46int ecryptfs_send_netlink(char *data, int data_len,
47 struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
48 u16 msg_flags, struct pid *daemon_pid)
49{
50 struct sk_buff *skb;
51 struct nlmsghdr *nlh;
52 struct ecryptfs_message *msg;
53 size_t payload_len;
54 int rc;
55
56 payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0);
57 skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL);
58 if (!skb) {
59 rc = -ENOMEM;
60 ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n");
61 goto out;
62 }
63 nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0,
64 msg_type, payload_len);
65 nlh->nlmsg_flags = msg_flags;
66 if (msg_ctx && payload_len) {
67 msg = (struct ecryptfs_message *)NLMSG_DATA(nlh);
68 msg->index = msg_ctx->index;
69 msg->data_len = data_len;
70 memcpy(msg->data, data, data_len);
71 }
72 rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0);
73 if (rc < 0) {
74 ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink "
75 "message; rc = [%d]\n", rc);
76 goto out;
77 }
78 rc = 0;
79 goto out;
80nlmsg_failure:
81 rc = -EMSGSIZE;
82 kfree_skb(skb);
83out:
84 return rc;
85}
86
87/**
88 * ecryptfs_process_nl_response
89 * @skb: The socket buffer containing the netlink message of state
90 * RESPONSE
91 *
92 * Processes a response message after sending an operation request to
93 * userspace. Attempts to assign the msg to a netlink context element
94 * at the index specified in the msg. The sk_buff and nlmsghdr must
95 * be validated before this function. Returns zero upon delivery to
96 * desired context element; non-zero upon delivery failure or error.
97 */
98static int ecryptfs_process_nl_response(struct sk_buff *skb)
99{
100 struct nlmsghdr *nlh = nlmsg_hdr(skb);
101 struct ecryptfs_message *msg = NLMSG_DATA(nlh);
102 struct pid *pid;
103 int rc;
104
105 if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) {
106 rc = -EINVAL;
107 ecryptfs_printk(KERN_ERR, "Received netlink message with "
108 "incorrectly specified data length\n");
109 goto out;
110 }
111 pid = find_get_pid(NETLINK_CREDS(skb)->pid);
112 rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL,
113 pid, nlh->nlmsg_seq);
114 put_pid(pid);
115 if (rc)
116 printk(KERN_ERR
117 "Error processing response message; rc = [%d]\n", rc);
118out:
119 return rc;
120}
121
122/**
123 * ecryptfs_process_nl_helo
124 * @skb: The socket buffer containing the nlmsghdr in HELO state
125 *
126 * Gets uid and pid of the skb and adds the values to the daemon id
127 * hash. Returns zero after adding a new daemon id to the hash list;
128 * non-zero otherwise.
129 */
130static int ecryptfs_process_nl_helo(struct sk_buff *skb)
131{
132 struct pid *pid;
133 int rc;
134
135 pid = find_get_pid(NETLINK_CREDS(skb)->pid);
136 rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK,
137 NETLINK_CREDS(skb)->uid, NULL, pid);
138 put_pid(pid);
139 if (rc)
140 printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
141 return rc;
142}
143
144/**
145 * ecryptfs_process_nl_quit
146 * @skb: The socket buffer containing the nlmsghdr in QUIT state
147 *
148 * Gets uid and pid of the skb and deletes the corresponding daemon
149 * id, if it is the registered daemon that is requesting the
150 * deletion. Returns zero after deleting the desired daemon id;
151 * non-zero otherwise.
152 */
153static int ecryptfs_process_nl_quit(struct sk_buff *skb)
154{
155 struct pid *pid;
156 int rc;
157
158 pid = find_get_pid(NETLINK_CREDS(skb)->pid);
159 rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid);
160 put_pid(pid);
161 if (rc)
162 printk(KERN_WARNING
163 "Error processing QUIT message; rc = [%d]\n", rc);
164 return rc;
165}
166
167/**
168 * ecryptfs_receive_nl_message
169 *
170 * Callback function called by netlink system when a message arrives.
171 * If the message looks to be valid, then an attempt is made to assign
172 * it to its desired netlink context element and wake up the process
173 * that is waiting for a response.
174 */
175static void ecryptfs_receive_nl_message(struct sk_buff *skb)
176{
177 struct nlmsghdr *nlh;
178
179 nlh = nlmsg_hdr(skb);
180 if (!NLMSG_OK(nlh, skb->len)) {
181 ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
182 "message\n");
183 goto free;
184 }
185 switch (nlh->nlmsg_type) {
186 case ECRYPTFS_MSG_RESPONSE:
187 if (ecryptfs_process_nl_response(skb)) {
188 ecryptfs_printk(KERN_WARNING, "Failed to "
189 "deliver netlink response to "
190 "requesting operation\n");
191 }
192 break;
193 case ECRYPTFS_MSG_HELO:
194 if (ecryptfs_process_nl_helo(skb)) {
195 ecryptfs_printk(KERN_WARNING, "Failed to "
196 "fulfill HELO request\n");
197 }
198 break;
199 case ECRYPTFS_MSG_QUIT:
200 if (ecryptfs_process_nl_quit(skb)) {
201 ecryptfs_printk(KERN_WARNING, "Failed to "
202 "fulfill QUIT request\n");
203 }
204 break;
205 default:
206 ecryptfs_printk(KERN_WARNING, "Dropping netlink "
207 "message of unrecognized type [%d]\n",
208 nlh->nlmsg_type);
209 break;
210 }
211free:
212 kfree_skb(skb);
213}
214
215/**
216 * ecryptfs_init_netlink
217 *
218 * Creates the eCryptfs kernel netlink socket and sets its send
219 * timeout. Returns zero upon success; non-zero upon error.
220 */
221int ecryptfs_init_netlink(void)
222{
223 int rc;
224
225 ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0,
226 ecryptfs_receive_nl_message,
227 NULL, THIS_MODULE);
228 if (!ecryptfs_nl_sock) {
229 rc = -EIO;
230 ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
231 goto out;
232 }
233 ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT;
234 rc = 0;
235out:
236 return rc;
237}
238
239/**
240 * ecryptfs_release_netlink
241 *
242 * Releases the eCryptfs netlink socket and clears the module's
243 * reference to it.
244 */
245void ecryptfs_release_netlink(void)
246{
247 netlink_kernel_release(ecryptfs_nl_sock);
248 ecryptfs_nl_sock = NULL;
249}
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 3a404e7fad53..291abb11e20e 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
74 } 74 }
75 unlock_kernel(); 75 unlock_kernel();
76 76
77 d_add(dentry, inode); 77 return d_splice_alias(inode, dentry);
78 return NULL;
79} 78}
80 79
81static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, 80static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 567b134fa1f1..73b19cfc91fc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
341 sb->inode_blocks * 341 sb->inode_blocks *
342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); 342 (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
343 buf->f_ffree = sb->inode_free; /* free inodes */ 343 buf->f_ffree = sb->inode_free; /* free inodes */
344 buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */
345 buf->f_fsid.val[1] = sb->fs_magic & 0xffff; /* fs ID */
346 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ 344 buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
347 345
348 return 0; 346 return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7cc0eb756b55..99368bda0261 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -927,14 +927,11 @@ errxit:
927 /* 927 /*
928 * During the time we spent in the loop above, some other events 928 * During the time we spent in the loop above, some other events
929 * might have been queued by the poll callback. We re-insert them 929 * might have been queued by the poll callback. We re-insert them
930 * here (in case they are not already queued, or they're one-shot). 930 * inside the main ready-list here.
931 */ 931 */
932 for (nepi = ep->ovflist; (epi = nepi) != NULL; 932 for (nepi = ep->ovflist; (epi = nepi) != NULL;
933 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { 933 nepi = epi->next, epi->next = EP_UNACTIVE_PTR)
934 if (!ep_is_linked(&epi->rdllink) && 934 list_add_tail(&epi->rdllink, &ep->rdllist);
935 (epi->event.events & ~EP_PRIVATE_BITS))
936 list_add_tail(&epi->rdllink, &ep->rdllist);
937 }
938 /* 935 /*
939 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after 936 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
940 * releasing the lock, events will be queued in the normal way inside 937 * releasing the lock, events will be queued in the normal way inside
diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..a41e7902ed0b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,15 +50,12 @@
50#include <linux/cn_proc.h> 50#include <linux/cn_proc.h>
51#include <linux/audit.h> 51#include <linux/audit.h>
52#include <linux/tracehook.h> 52#include <linux/tracehook.h>
53#include <linux/kmod.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/mmu_context.h> 56#include <asm/mmu_context.h>
56#include <asm/tlb.h> 57#include <asm/tlb.h>
57 58
58#ifdef CONFIG_KMOD
59#include <linux/kmod.h>
60#endif
61
62#ifdef __alpha__ 59#ifdef __alpha__
63/* for /sbin/loader handling in search_binary_handler() */ 60/* for /sbin/loader handling in search_binary_handler() */
64#include <linux/a.out.h> 61#include <linux/a.out.h>
@@ -391,7 +388,7 @@ static int count(char __user * __user * argv, int max)
391 if (!p) 388 if (!p)
392 break; 389 break;
393 argv++; 390 argv++;
394 if(++i > max) 391 if (i++ >= max)
395 return -E2BIG; 392 return -E2BIG;
396 cond_resched(); 393 cond_resched();
397 } 394 }
@@ -752,11 +749,11 @@ static int exec_mmap(struct mm_struct *mm)
752 tsk->active_mm = mm; 749 tsk->active_mm = mm;
753 activate_mm(active_mm, mm); 750 activate_mm(active_mm, mm);
754 task_unlock(tsk); 751 task_unlock(tsk);
755 mm_update_next_owner(old_mm);
756 arch_pick_mmap_layout(mm); 752 arch_pick_mmap_layout(mm);
757 if (old_mm) { 753 if (old_mm) {
758 up_read(&old_mm->mmap_sem); 754 up_read(&old_mm->mmap_sem);
759 BUG_ON(active_mm != old_mm); 755 BUG_ON(active_mm != old_mm);
756 mm_update_next_owner(old_mm);
760 mmput(old_mm); 757 mmput(old_mm);
761 return 0; 758 return 0;
762 } 759 }
@@ -825,8 +822,6 @@ static int de_thread(struct task_struct *tsk)
825 schedule(); 822 schedule();
826 } 823 }
827 824
828 if (unlikely(task_child_reaper(tsk) == leader))
829 task_active_pid_ns(tsk)->child_reaper = tsk;
830 /* 825 /*
831 * The only record we have of the real-time age of a 826 * The only record we have of the real-time age of a
832 * process, regardless of execs it's done, is start_time. 827 * process, regardless of execs it's done, is start_time.
@@ -1189,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1189 return retval; 1184 return retval;
1190 1185
1191 /* Remember if the application is TASO. */ 1186 /* Remember if the application is TASO. */
1192 bprm->sh_bang = eh->ah.entry < 0x100000000UL; 1187 bprm->taso = eh->ah.entry < 0x100000000UL;
1193 1188
1194 bprm->file = file; 1189 bprm->file = file;
1195 bprm->loader = loader; 1190 bprm->loader = loader;
@@ -1247,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1247 read_unlock(&binfmt_lock); 1242 read_unlock(&binfmt_lock);
1248 if (retval != -ENOEXEC || bprm->mm == NULL) { 1243 if (retval != -ENOEXEC || bprm->mm == NULL) {
1249 break; 1244 break;
1250#ifdef CONFIG_KMOD 1245#ifdef CONFIG_MODULES
1251 }else{ 1246 } else {
1252#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) 1247#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1253 if (printable(bprm->buf[0]) && 1248 if (printable(bprm->buf[0]) &&
1254 printable(bprm->buf[1]) && 1249 printable(bprm->buf[1]) &&
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 10bb02c3f25c..6dac7ba2d22d 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1295,6 +1295,7 @@ retry_alloc:
1295 * turn off reservation for this allocation 1295 * turn off reservation for this allocation
1296 */ 1296 */
1297 if (my_rsv && (free_blocks < windowsz) 1297 if (my_rsv && (free_blocks < windowsz)
1298 && (free_blocks > 0)
1298 && (rsv_is_empty(&my_rsv->rsv_window))) 1299 && (rsv_is_empty(&my_rsv->rsv_window)))
1299 my_rsv = NULL; 1300 my_rsv = NULL;
1300 1301
@@ -1332,7 +1333,7 @@ retry_alloc:
1332 * free blocks is less than half of the reservation 1333 * free blocks is less than half of the reservation
1333 * window size. 1334 * window size.
1334 */ 1335 */
1335 if (free_blocks <= (windowsz/2)) 1336 if (my_rsv && (free_blocks <= (windowsz/2)))
1336 continue; 1337 continue;
1337 1338
1338 brelse(bitmap_bh); 1339 brelse(bitmap_bh);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index a78c6b4af060..11a49ce84392 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
103 return err; 103 return err;
104} 104}
105 105
106static void ext2_check_page(struct page *page) 106static void ext2_check_page(struct page *page, int quiet)
107{ 107{
108 struct inode *dir = page->mapping->host; 108 struct inode *dir = page->mapping->host;
109 struct super_block *sb = dir->i_sb; 109 struct super_block *sb = dir->i_sb;
@@ -146,10 +146,10 @@ out:
146 /* Too bad, we had an error */ 146 /* Too bad, we had an error */
147 147
148Ebadsize: 148Ebadsize:
149 ext2_error(sb, "ext2_check_page", 149 if (!quiet)
150 "size of directory #%lu is not a multiple of chunk size", 150 ext2_error(sb, __func__,
151 dir->i_ino 151 "size of directory #%lu is not a multiple "
152 ); 152 "of chunk size", dir->i_ino);
153 goto fail; 153 goto fail;
154Eshort: 154Eshort:
155 error = "rec_len is smaller than minimal"; 155 error = "rec_len is smaller than minimal";
@@ -166,32 +166,36 @@ Espan:
166Einumber: 166Einumber:
167 error = "inode out of bounds"; 167 error = "inode out of bounds";
168bad_entry: 168bad_entry:
169 ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - " 169 if (!quiet)
170 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 170 ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
171 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, 171 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
172 (unsigned long) le32_to_cpu(p->inode), 172 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
173 rec_len, p->name_len); 173 (unsigned long) le32_to_cpu(p->inode),
174 rec_len, p->name_len);
174 goto fail; 175 goto fail;
175Eend: 176Eend:
176 p = (ext2_dirent *)(kaddr + offs); 177 if (!quiet) {
177 ext2_error (sb, "ext2_check_page", 178 p = (ext2_dirent *)(kaddr + offs);
178 "entry in directory #%lu spans the page boundary" 179 ext2_error(sb, "ext2_check_page",
179 "offset=%lu, inode=%lu", 180 "entry in directory #%lu spans the page boundary"
180 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, 181 "offset=%lu, inode=%lu",
181 (unsigned long) le32_to_cpu(p->inode)); 182 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
183 (unsigned long) le32_to_cpu(p->inode));
184 }
182fail: 185fail:
183 SetPageChecked(page); 186 SetPageChecked(page);
184 SetPageError(page); 187 SetPageError(page);
185} 188}
186 189
187static struct page * ext2_get_page(struct inode *dir, unsigned long n) 190static struct page * ext2_get_page(struct inode *dir, unsigned long n,
191 int quiet)
188{ 192{
189 struct address_space *mapping = dir->i_mapping; 193 struct address_space *mapping = dir->i_mapping;
190 struct page *page = read_mapping_page(mapping, n, NULL); 194 struct page *page = read_mapping_page(mapping, n, NULL);
191 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
192 kmap(page); 196 kmap(page);
193 if (!PageChecked(page)) 197 if (!PageChecked(page))
194 ext2_check_page(page); 198 ext2_check_page(page, quiet);
195 if (PageError(page)) 199 if (PageError(page))
196 goto fail; 200 goto fail;
197 } 201 }
@@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
292 for ( ; n < npages; n++, offset = 0) { 296 for ( ; n < npages; n++, offset = 0) {
293 char *kaddr, *limit; 297 char *kaddr, *limit;
294 ext2_dirent *de; 298 ext2_dirent *de;
295 struct page *page = ext2_get_page(inode, n); 299 struct page *page = ext2_get_page(inode, n, 0);
296 300
297 if (IS_ERR(page)) { 301 if (IS_ERR(page)) {
298 ext2_error(sb, __func__, 302 ext2_error(sb, __func__,
@@ -361,6 +365,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
361 struct page *page = NULL; 365 struct page *page = NULL;
362 struct ext2_inode_info *ei = EXT2_I(dir); 366 struct ext2_inode_info *ei = EXT2_I(dir);
363 ext2_dirent * de; 367 ext2_dirent * de;
368 int dir_has_error = 0;
364 369
365 if (npages == 0) 370 if (npages == 0)
366 goto out; 371 goto out;
@@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
374 n = start; 379 n = start;
375 do { 380 do {
376 char *kaddr; 381 char *kaddr;
377 page = ext2_get_page(dir, n); 382 page = ext2_get_page(dir, n, dir_has_error);
378 if (!IS_ERR(page)) { 383 if (!IS_ERR(page)) {
379 kaddr = page_address(page); 384 kaddr = page_address(page);
380 de = (ext2_dirent *) kaddr; 385 de = (ext2_dirent *) kaddr;
@@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
391 de = ext2_next_entry(de); 396 de = ext2_next_entry(de);
392 } 397 }
393 ext2_put_page(page); 398 ext2_put_page(page);
394 } 399 } else
400 dir_has_error = 1;
401
395 if (++n >= npages) 402 if (++n >= npages)
396 n = 0; 403 n = 0;
397 /* next page is past the blocks we've got */ 404 /* next page is past the blocks we've got */
@@ -414,7 +421,7 @@ found:
414 421
415struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) 422struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
416{ 423{
417 struct page *page = ext2_get_page(dir, 0); 424 struct page *page = ext2_get_page(dir, 0, 0);
418 ext2_dirent *de = NULL; 425 ext2_dirent *de = NULL;
419 426
420 if (!IS_ERR(page)) { 427 if (!IS_ERR(page)) {
@@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
487 for (n = 0; n <= npages; n++) { 494 for (n = 0; n <= npages; n++) {
488 char *dir_end; 495 char *dir_end;
489 496
490 page = ext2_get_page(dir, n); 497 page = ext2_get_page(dir, n, 0);
491 err = PTR_ERR(page); 498 err = PTR_ERR(page);
492 if (IS_ERR(page)) 499 if (IS_ERR(page))
493 goto out; 500 goto out;
@@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode)
655{ 662{
656 struct page *page = NULL; 663 struct page *page = NULL;
657 unsigned long i, npages = dir_pages(inode); 664 unsigned long i, npages = dir_pages(inode);
665 int dir_has_error = 0;
658 666
659 for (i = 0; i < npages; i++) { 667 for (i = 0; i < npages; i++) {
660 char *kaddr; 668 char *kaddr;
661 ext2_dirent * de; 669 ext2_dirent * de;
662 page = ext2_get_page(inode, i); 670 page = ext2_get_page(inode, i, dir_has_error);
663 671
664 if (IS_ERR(page)) 672 if (IS_ERR(page)) {
673 dir_has_error = 1;
665 continue; 674 continue;
675 }
666 676
667 kaddr = page_address(page); 677 kaddr = page_address(page);
668 de = (ext2_dirent *)kaddr; 678 de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33b..bae998c1e44e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
133extern int ext2_setattr (struct dentry *, struct iattr *); 133extern int ext2_setattr (struct dentry *, struct iattr *);
134extern void ext2_set_inode_flags(struct inode *inode); 134extern void ext2_set_inode_flags(struct inode *inode);
135extern void ext2_get_inode_flags(struct ext2_inode_info *); 135extern void ext2_get_inode_flags(struct ext2_inode_info *);
136extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
137 u64 start, u64 len);
136int __ext2_write_begin(struct file *file, struct address_space *mapping, 138int __ext2_write_begin(struct file *file, struct address_space *mapping,
137 loff_t pos, unsigned len, unsigned flags, 139 loff_t pos, unsigned len, unsigned flags,
138 struct page **pagep, void **fsdata); 140 struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c36293..45ed07122182 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .permission = ext2_permission,
89 .fiemap = ext2_fiemap,
89}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51f..7658b33e2653 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
31#include <linux/writeback.h> 31#include <linux/writeback.h>
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h>
34#include "ext2.h" 35#include "ext2.h"
35#include "acl.h" 36#include "acl.h"
36#include "xip.h" 37#include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
704 705
705} 706}
706 707
708int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
709 u64 start, u64 len)
710{
711 return generic_block_fiemap(inode, fieinfo, start, len,
712 ext2_get_block);
713}
714
707static int ext2_writepage(struct page *page, struct writeback_control *wbc) 715static int ext2_writepage(struct page *page, struct writeback_control *wbc)
708{ 716{
709 return block_write_full_page(page, ext2_get_block, wbc); 717 return block_write_full_page(page, ext2_get_block, wbc);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fd88c7b43e66..647cd888ac87 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -393,7 +393,7 @@ enum {
393 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation 393 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
394}; 394};
395 395
396static match_table_t tokens = { 396static const match_table_t tokens = {
397 {Opt_bsd_df, "bsddf"}, 397 {Opt_bsd_df, "bsddf"},
398 {Opt_minix_df, "minixdf"}, 398 {Opt_minix_df, "minixdf"},
399 {Opt_grpid, "grpid"}, 399 {Opt_grpid, "grpid"},
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d3019..3be1e0689c9a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
134 .removexattr = generic_removexattr, 134 .removexattr = generic_removexattr,
135#endif 135#endif
136 .permission = ext3_permission, 136 .permission = ext3_permission,
137 .fiemap = ext3_fiemap,
137}; 138};
138 139
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b111..ebfec4d0148e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h>
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
41 42
@@ -981,6 +982,13 @@ out:
981 return ret; 982 return ret;
982} 983}
983 984
985int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
986 u64 start, u64 len)
987{
988 return generic_block_fiemap(inode, fieinfo, start, len,
989 ext3_get_block);
990}
991
984/* 992/*
985 * `handle' can be NULL if create is zero 993 * `handle' can be NULL if create is zero
986 */ 994 */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f38a5afc39a1..399a96a6c556 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -760,7 +760,7 @@ enum {
760 Opt_grpquota 760 Opt_grpquota
761}; 761};
762 762
763static match_table_t tokens = { 763static const match_table_t tokens = {
764 {Opt_bsd_df, "bsddf"}, 764 {Opt_bsd_df, "bsddf"},
765 {Opt_minix_df, "minixdf"}, 765 {Opt_minix_df, "minixdf"},
766 {Opt_grpid, "grpid"}, 766 {Opt_grpid, "grpid"},
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2f..a8ff003a00f7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
2# Makefile for the linux ext4-filesystem routines. 2# Makefile for the linux ext4-filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o 5obj-$(CONFIG_EXT4_FS) += ext4.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
13ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o 13ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d6..cb45257a246e 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
51 } 51 }
52} 52}
53 53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl 56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */ 57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1) 58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59 59
60/* acl.c */ 60/* acl.c */
61extern int ext4_permission (struct inode *, int); 61extern int ext4_permission(struct inode *, int);
62extern int ext4_acl_chmod (struct inode *); 62extern int ext4_acl_chmod(struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); 63extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
64 64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 65#else /* CONFIG_EXT4_FS_POSIX_ACL */
66#include <linux/sched.h> 66#include <linux/sched.h>
67#define ext4_permission NULL 67#define ext4_permission NULL
68 68
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{ 77{
78 return 0; 78 return 0;
79} 79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ 80#endif /* CONFIG_EXT4_FS_POSIX_ACL */
81 81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ae5004e93fc..b9821be709bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
83 } 83 }
84 return used_blocks; 84 return used_blocks;
85} 85}
86
86/* Initializes an uninitialized block bitmap if given, and returns the 87/* Initializes an uninitialized block bitmap if given, and returns the
87 * number of blocks free in the group. */ 88 * number of blocks free in the group. */
88unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, 89unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
132 */ 133 */
133 group_blocks = ext4_blocks_count(sbi->s_es) - 134 group_blocks = ext4_blocks_count(sbi->s_es) -
134 le32_to_cpu(sbi->s_es->s_first_data_block) - 135 le32_to_cpu(sbi->s_es->s_first_data_block) -
135 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); 136 (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
136 } else { 137 } else {
137 group_blocks = EXT4_BLOCKS_PER_GROUP(sb); 138 group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
138 } 139 }
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
200 * @bh: pointer to the buffer head to store the block 201 * @bh: pointer to the buffer head to store the block
201 * group descriptor 202 * group descriptor
202 */ 203 */
203struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 204struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
204 ext4_group_t block_group, 205 ext4_group_t block_group,
205 struct buffer_head ** bh) 206 struct buffer_head **bh)
206{ 207{
207 unsigned long group_desc; 208 unsigned long group_desc;
208 unsigned long offset; 209 unsigned long offset;
209 struct ext4_group_desc * desc; 210 struct ext4_group_desc *desc;
210 struct ext4_sb_info *sbi = EXT4_SB(sb); 211 struct ext4_sb_info *sbi = EXT4_SB(sb);
211 212
212 if (block_group >= sbi->s_groups_count) { 213 if (block_group >= sbi->s_groups_count) {
213 ext4_error (sb, "ext4_get_group_desc", 214 ext4_error(sb, "ext4_get_group_desc",
214 "block_group >= groups_count - " 215 "block_group >= groups_count - "
215 "block_group = %lu, groups_count = %lu", 216 "block_group = %lu, groups_count = %lu",
216 block_group, sbi->s_groups_count); 217 block_group, sbi->s_groups_count);
217 218
218 return NULL; 219 return NULL;
219 } 220 }
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
222 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); 223 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
223 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); 224 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
224 if (!sbi->s_group_desc[group_desc]) { 225 if (!sbi->s_group_desc[group_desc]) {
225 ext4_error (sb, "ext4_get_group_desc", 226 ext4_error(sb, "ext4_get_group_desc",
226 "Group descriptor not loaded - " 227 "Group descriptor not loaded - "
227 "block_group = %lu, group_desc = %lu, desc = %lu", 228 "block_group = %lu, group_desc = %lu, desc = %lu",
228 block_group, group_desc, offset); 229 block_group, group_desc, offset);
229 return NULL; 230 return NULL;
230 } 231 }
231 232
@@ -302,8 +303,8 @@ err_out:
302struct buffer_head * 303struct buffer_head *
303ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) 304ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
304{ 305{
305 struct ext4_group_desc * desc; 306 struct ext4_group_desc *desc;
306 struct buffer_head * bh = NULL; 307 struct buffer_head *bh = NULL;
307 ext4_fsblk_t bitmap_blk; 308 ext4_fsblk_t bitmap_blk;
308 309
309 desc = ext4_get_group_desc(sb, block_group, NULL); 310 desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
318 block_group, bitmap_blk); 319 block_group, bitmap_blk);
319 return NULL; 320 return NULL;
320 } 321 }
321 if (bh_uptodate_or_lock(bh)) 322 if (buffer_uptodate(bh) &&
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
322 return bh; 324 return bh;
323 325
326 lock_buffer(bh);
324 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
325 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
326 ext4_init_block_bitmap(sb, bh, block_group, desc); 329 ext4_init_block_bitmap(sb, bh, block_group, desc);
@@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
345 */ 348 */
346 return bh; 349 return bh;
347} 350}
348/*
349 * The reservation window structure operations
350 * --------------------------------------------
351 * Operations include:
352 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
353 *
354 * We use a red-black tree to represent per-filesystem reservation
355 * windows.
356 *
357 */
358
359/**
360 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
361 * @rb_root: root of per-filesystem reservation rb tree
362 * @verbose: verbose mode
363 * @fn: function which wishes to dump the reservation map
364 *
365 * If verbose is turned on, it will print the whole block reservation
366 * windows(start, end). Otherwise, it will only print out the "bad" windows,
367 * those windows that overlap with their immediate neighbors.
368 */
369#if 1
370static void __rsv_window_dump(struct rb_root *root, int verbose,
371 const char *fn)
372{
373 struct rb_node *n;
374 struct ext4_reserve_window_node *rsv, *prev;
375 int bad;
376
377restart:
378 n = rb_first(root);
379 bad = 0;
380 prev = NULL;
381
382 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
383 while (n) {
384 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
385 if (verbose)
386 printk("reservation window 0x%p "
387 "start: %llu, end: %llu\n",
388 rsv, rsv->rsv_start, rsv->rsv_end);
389 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
390 printk("Bad reservation %p (start >= end)\n",
391 rsv);
392 bad = 1;
393 }
394 if (prev && prev->rsv_end >= rsv->rsv_start) {
395 printk("Bad reservation %p (prev->end >= start)\n",
396 rsv);
397 bad = 1;
398 }
399 if (bad) {
400 if (!verbose) {
401 printk("Restarting reservation walk in verbose mode\n");
402 verbose = 1;
403 goto restart;
404 }
405 }
406 n = rb_next(n);
407 prev = rsv;
408 }
409 printk("Window map complete.\n");
410 BUG_ON(bad);
411}
412#define rsv_window_dump(root, verbose) \
413 __rsv_window_dump((root), (verbose), __func__)
414#else
415#define rsv_window_dump(root, verbose) do {} while (0)
416#endif
417
418/**
419 * goal_in_my_reservation()
420 * @rsv: inode's reservation window
421 * @grp_goal: given goal block relative to the allocation block group
422 * @group: the current allocation block group
423 * @sb: filesystem super block
424 *
425 * Test if the given goal block (group relative) is within the file's
426 * own block reservation window range.
427 *
428 * If the reservation window is outside the goal allocation group, return 0;
429 * grp_goal (given goal block) could be -1, which means no specific
430 * goal block. In this case, always return 1.
431 * If the goal block is within the reservation window, return 1;
432 * otherwise, return 0;
433 */
434static int
435goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
436 ext4_group_t group, struct super_block *sb)
437{
438 ext4_fsblk_t group_first_block, group_last_block;
439
440 group_first_block = ext4_group_first_block_no(sb, group);
441 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
442
443 if ((rsv->_rsv_start > group_last_block) ||
444 (rsv->_rsv_end < group_first_block))
445 return 0;
446 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
447 || (grp_goal + group_first_block > rsv->_rsv_end)))
448 return 0;
449 return 1;
450}
451
452/**
453 * search_reserve_window()
454 * @rb_root: root of reservation tree
455 * @goal: target allocation block
456 *
457 * Find the reserved window which includes the goal, or the previous one
458 * if the goal is not in any window.
459 * Returns NULL if there are no windows or if all windows start after the goal.
460 */
461static struct ext4_reserve_window_node *
462search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
463{
464 struct rb_node *n = root->rb_node;
465 struct ext4_reserve_window_node *rsv;
466
467 if (!n)
468 return NULL;
469
470 do {
471 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
472
473 if (goal < rsv->rsv_start)
474 n = n->rb_left;
475 else if (goal > rsv->rsv_end)
476 n = n->rb_right;
477 else
478 return rsv;
479 } while (n);
480 /*
481 * We've fallen off the end of the tree: the goal wasn't inside
482 * any particular node. OK, the previous node must be to one
483 * side of the interval containing the goal. If it's the RHS,
484 * we need to back up one.
485 */
486 if (rsv->rsv_start > goal) {
487 n = rb_prev(&rsv->rsv_node);
488 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
489 }
490 return rsv;
491}
492
493/**
494 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
495 * @sb: super block
496 * @rsv: reservation window to add
497 *
498 * Must be called with rsv_lock hold.
499 */
500void ext4_rsv_window_add(struct super_block *sb,
501 struct ext4_reserve_window_node *rsv)
502{
503 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
504 struct rb_node *node = &rsv->rsv_node;
505 ext4_fsblk_t start = rsv->rsv_start;
506
507 struct rb_node ** p = &root->rb_node;
508 struct rb_node * parent = NULL;
509 struct ext4_reserve_window_node *this;
510
511 while (*p)
512 {
513 parent = *p;
514 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
515
516 if (start < this->rsv_start)
517 p = &(*p)->rb_left;
518 else if (start > this->rsv_end)
519 p = &(*p)->rb_right;
520 else {
521 rsv_window_dump(root, 1);
522 BUG();
523 }
524 }
525
526 rb_link_node(node, parent, p);
527 rb_insert_color(node, root);
528}
529
530/**
531 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
532 * @sb: super block
533 * @rsv: reservation window to remove
534 *
535 * Mark the block reservation window as not allocated, and unlink it
536 * from the filesystem reservation window rb tree. Must be called with
537 * rsv_lock hold.
538 */
539static void rsv_window_remove(struct super_block *sb,
540 struct ext4_reserve_window_node *rsv)
541{
542 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
543 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
544 rsv->rsv_alloc_hit = 0;
545 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
546}
547
548/*
549 * rsv_is_empty() -- Check if the reservation window is allocated.
550 * @rsv: given reservation window to check
551 *
552 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
553 */
554static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
555{
556 /* a valid reservation end block could not be 0 */
557 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
558}
559
560/**
561 * ext4_init_block_alloc_info()
562 * @inode: file inode structure
563 *
564 * Allocate and initialize the reservation window structure, and
565 * link the window to the ext4 inode structure at last
566 *
567 * The reservation window structure is only dynamically allocated
568 * and linked to ext4 inode the first time the open file
569 * needs a new block. So, before every ext4_new_block(s) call, for
570 * regular files, we should check whether the reservation window
571 * structure exists or not. In the latter case, this function is called.
572 * Fail to do so will result in block reservation being turned off for that
573 * open file.
574 *
575 * This function is called from ext4_get_blocks_handle(), also called
576 * when setting the reservation window size through ioctl before the file
577 * is open for write (needs block allocation).
578 *
579 * Needs down_write(i_data_sem) protection prior to call this function.
580 */
581void ext4_init_block_alloc_info(struct inode *inode)
582{
583 struct ext4_inode_info *ei = EXT4_I(inode);
584 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
585 struct super_block *sb = inode->i_sb;
586
587 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
588 if (block_i) {
589 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
590
591 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
592 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
593
594 /*
595 * if filesystem is mounted with NORESERVATION, the goal
596 * reservation window size is set to zero to indicate
597 * block reservation is off
598 */
599 if (!test_opt(sb, RESERVATION))
600 rsv->rsv_goal_size = 0;
601 else
602 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
603 rsv->rsv_alloc_hit = 0;
604 block_i->last_alloc_logical_block = 0;
605 block_i->last_alloc_physical_block = 0;
606 }
607 ei->i_block_alloc_info = block_i;
608}
609
610/**
611 * ext4_discard_reservation()
612 * @inode: inode
613 *
614 * Discard(free) block reservation window on last file close, or truncate
615 * or at last iput().
616 *
617 * It is being called in three cases:
618 * ext4_release_file(): last writer close the file
619 * ext4_clear_inode(): last iput(), when nobody link to this file.
620 * ext4_truncate(): when the block indirect map is about to change.
621 *
622 */
623void ext4_discard_reservation(struct inode *inode)
624{
625 struct ext4_inode_info *ei = EXT4_I(inode);
626 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
627 struct ext4_reserve_window_node *rsv;
628 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
629
630 ext4_mb_discard_inode_preallocations(inode);
631
632 if (!block_i)
633 return;
634
635 rsv = &block_i->rsv_window_node;
636 if (!rsv_is_empty(&rsv->rsv_window)) {
637 spin_lock(rsv_lock);
638 if (!rsv_is_empty(&rsv->rsv_window))
639 rsv_window_remove(inode->i_sb, rsv);
640 spin_unlock(rsv_lock);
641 }
642}
643 351
644/** 352/**
645 * ext4_free_blocks_sb() -- Free given blocks and update quota 353 * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
648 * @block: start physcial block to free 356 * @block: start physcial block to free
649 * @count: number of blocks to free 357 * @count: number of blocks to free
650 * @pdquot_freed_blocks: pointer to quota 358 * @pdquot_freed_blocks: pointer to quota
359 *
360 * XXX This function is only used by the on-line resizing code, which
361 * should probably be fixed up to call the mballoc variant. There
362 * this needs to be cleaned up later; in fact, I'm not convinced this
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
651 */ 366 */
652void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
653 ext4_fsblk_t block, unsigned long count, 368 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
659 ext4_grpblk_t bit; 374 ext4_grpblk_t bit;
660 unsigned long i; 375 unsigned long i;
661 unsigned long overflow; 376 unsigned long overflow;
662 struct ext4_group_desc * desc; 377 struct ext4_group_desc *desc;
663 struct ext4_super_block * es; 378 struct ext4_super_block *es;
664 struct ext4_sb_info *sbi; 379 struct ext4_sb_info *sbi;
665 int err = 0, ret; 380 int err = 0, ret;
666 ext4_grpblk_t group_freed; 381 ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
671 if (block < le32_to_cpu(es->s_first_data_block) || 386 if (block < le32_to_cpu(es->s_first_data_block) ||
672 block + count < block || 387 block + count < block ||
673 block + count > ext4_blocks_count(es)) { 388 block + count > ext4_blocks_count(es)) {
674 ext4_error (sb, "ext4_free_blocks", 389 ext4_error(sb, "ext4_free_blocks",
675 "Freeing blocks not in datazone - " 390 "Freeing blocks not in datazone - "
676 "block = %llu, count = %lu", block, count); 391 "block = %llu, count = %lu", block, count);
677 goto error_return; 392 goto error_return;
678 } 393 }
679 394
680 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); 395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
681 396
682do_more: 397do_more:
683 overflow = 0; 398 overflow = 0;
@@ -694,7 +409,7 @@ do_more:
694 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 409 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
695 if (!bitmap_bh) 410 if (!bitmap_bh)
696 goto error_return; 411 goto error_return;
697 desc = ext4_get_group_desc (sb, block_group, &gd_bh); 412 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
698 if (!desc) 413 if (!desc)
699 goto error_return; 414 goto error_return;
700 415
@@ -703,10 +418,10 @@ do_more:
703 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
704 in_range(block + count - 1, ext4_inode_table(sb, desc), 419 in_range(block + count - 1, ext4_inode_table(sb, desc),
705 sbi->s_itb_per_group)) { 420 sbi->s_itb_per_group)) {
706 ext4_error (sb, "ext4_free_blocks", 421 ext4_error(sb, "ext4_free_blocks",
707 "Freeing blocks in system zones - " 422 "Freeing blocks in system zones - "
708 "Block = %llu, count = %lu", 423 "Block = %llu, count = %lu",
709 block, count); 424 block, count);
710 goto error_return; 425 goto error_return;
711 } 426 }
712 427
@@ -848,759 +563,71 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
848 ext4_fsblk_t block, unsigned long count, 563 ext4_fsblk_t block, unsigned long count,
849 int metadata) 564 int metadata)
850{ 565{
851 struct super_block * sb; 566 struct super_block *sb;
852 unsigned long dquot_freed_blocks; 567 unsigned long dquot_freed_blocks;
853 568
854 /* this isn't the right place to decide whether block is metadata 569 /* this isn't the right place to decide whether block is metadata
855 * inode.c/extents.c knows better, but for safety ... */ 570 * inode.c/extents.c knows better, but for safety ... */
856 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || 571 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
857 ext4_should_journal_data(inode)) 572 metadata = 1;
573
574 /* We need to make sure we don't reuse
575 * block released untill the transaction commit.
576 * writeback mode have weak data consistency so
577 * don't force data as metadata when freeing block
578 * for writeback mode.
579 */
580 if (metadata == 0 && !ext4_should_writeback_data(inode))
858 metadata = 1; 581 metadata = 1;
859 582
860 sb = inode->i_sb; 583 sb = inode->i_sb;
861 584
862 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) 585 ext4_mb_free_blocks(handle, inode, block, count,
863 ext4_free_blocks_sb(handle, sb, block, count, 586 metadata, &dquot_freed_blocks);
864 &dquot_freed_blocks);
865 else
866 ext4_mb_free_blocks(handle, inode, block, count,
867 metadata, &dquot_freed_blocks);
868 if (dquot_freed_blocks) 587 if (dquot_freed_blocks)
869 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 588 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
870 return; 589 return;
871} 590}
872 591
873/** 592int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
874 * ext4_test_allocatable() 593 s64 nblocks)
875 * @nr: given allocation block group
876 * @bh: bufferhead contains the bitmap of the given block group
877 *
878 * For ext4 allocations, we must not reuse any blocks which are
879 * allocated in the bitmap buffer's "last committed data" copy. This
880 * prevents deletes from freeing up the page for reuse until we have
881 * committed the delete transaction.
882 *
883 * If we didn't do this, then deleting something and reallocating it as
884 * data would allow the old block to be overwritten before the
885 * transaction committed (because we force data to disk before commit).
886 * This would lead to corruption if we crashed between overwriting the
887 * data and committing the delete.
888 *
889 * @@@ We may want to make this allocation behaviour conditional on
890 * data-writes at some point, and disable it for metadata allocations or
891 * sync-data inodes.
892 */
893static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
894{ 594{
895 int ret; 595 s64 free_blocks, dirty_blocks;
896 struct journal_head *jh = bh2jh(bh); 596 s64 root_blocks = 0;
897 597 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
898 if (ext4_test_bit(nr, bh->b_data)) 598 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
899 return 0;
900
901 jbd_lock_bh_state(bh);
902 if (!jh->b_committed_data)
903 ret = 1;
904 else
905 ret = !ext4_test_bit(nr, jh->b_committed_data);
906 jbd_unlock_bh_state(bh);
907 return ret;
908}
909 599
910/** 600 free_blocks = percpu_counter_read_positive(fbc);
911 * bitmap_search_next_usable_block() 601 dirty_blocks = percpu_counter_read_positive(dbc);
912 * @start: the starting block (group relative) of the search
913 * @bh: bufferhead contains the block group bitmap
914 * @maxblocks: the ending block (group relative) of the reservation
915 *
916 * The bitmap search --- search forward alternately through the actual
917 * bitmap on disk and the last-committed copy in journal, until we find a
918 * bit free in both bitmaps.
919 */
920static ext4_grpblk_t
921bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
922 ext4_grpblk_t maxblocks)
923{
924 ext4_grpblk_t next;
925 struct journal_head *jh = bh2jh(bh);
926
927 while (start < maxblocks) {
928 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
929 if (next >= maxblocks)
930 return -1;
931 if (ext4_test_allocatable(next, bh))
932 return next;
933 jbd_lock_bh_state(bh);
934 if (jh->b_committed_data)
935 start = ext4_find_next_zero_bit(jh->b_committed_data,
936 maxblocks, next);
937 jbd_unlock_bh_state(bh);
938 }
939 return -1;
940}
941 602
942/** 603 if (!capable(CAP_SYS_RESOURCE) &&
943 * find_next_usable_block() 604 sbi->s_resuid != current->fsuid &&
944 * @start: the starting block (group relative) to find next 605 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
945 * allocatable block in bitmap. 606 root_blocks = ext4_r_blocks_count(sbi->s_es);
946 * @bh: bufferhead contains the block group bitmap
947 * @maxblocks: the ending block (group relative) for the search
948 *
949 * Find an allocatable block in a bitmap. We honor both the bitmap and
950 * its last-committed copy (if that exists), and perform the "most
951 * appropriate allocation" algorithm of looking for a free block near
952 * the initial goal; then for a free byte somewhere in the bitmap; then
953 * for any free bit in the bitmap.
954 */
955static ext4_grpblk_t
956find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
957 ext4_grpblk_t maxblocks)
958{
959 ext4_grpblk_t here, next;
960 char *p, *r;
961
962 if (start > 0) {
963 /*
964 * The goal was occupied; search forward for a free
965 * block within the next XX blocks.
966 *
967 * end_goal is more or less random, but it has to be
968 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
969 * next 64-bit boundary is simple..
970 */
971 ext4_grpblk_t end_goal = (start + 63) & ~63;
972 if (end_goal > maxblocks)
973 end_goal = maxblocks;
974 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
975 if (here < end_goal && ext4_test_allocatable(here, bh))
976 return here;
977 ext4_debug("Bit not found near goal\n");
978 }
979
980 here = start;
981 if (here < 0)
982 here = 0;
983
984 p = ((char *)bh->b_data) + (here >> 3);
985 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
986 next = (r - ((char *)bh->b_data)) << 3;
987
988 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
989 return next;
990
991 /*
992 * The bitmap search --- search forward alternately through the actual
993 * bitmap and the last-committed copy until we find a bit free in
994 * both
995 */
996 here = bitmap_search_next_usable_block(here, bh, maxblocks);
997 return here;
998}
999
1000/**
1001 * claim_block()
1002 * @block: the free block (group relative) to allocate
1003 * @bh: the bufferhead containts the block group bitmap
1004 *
1005 * We think we can allocate this block in this bitmap. Try to set the bit.
1006 * If that succeeds then check that nobody has allocated and then freed the
1007 * block since we saw that is was not marked in b_committed_data. If it _was_
1008 * allocated and freed then clear the bit in the bitmap again and return
1009 * zero (failure).
1010 */
1011static inline int
1012claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
1013{
1014 struct journal_head *jh = bh2jh(bh);
1015 int ret;
1016
1017 if (ext4_set_bit_atomic(lock, block, bh->b_data))
1018 return 0;
1019 jbd_lock_bh_state(bh);
1020 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
1021 ext4_clear_bit_atomic(lock, block, bh->b_data);
1022 ret = 0;
1023 } else {
1024 ret = 1;
1025 }
1026 jbd_unlock_bh_state(bh);
1027 return ret;
1028}
1029
1030/**
1031 * ext4_try_to_allocate()
1032 * @sb: superblock
1033 * @handle: handle to this transaction
1034 * @group: given allocation block group
1035 * @bitmap_bh: bufferhead holds the block bitmap
1036 * @grp_goal: given target block within the group
1037 * @count: target number of blocks to allocate
1038 * @my_rsv: reservation window
1039 *
1040 * Attempt to allocate blocks within a give range. Set the range of allocation
1041 * first, then find the first free bit(s) from the bitmap (within the range),
1042 * and at last, allocate the blocks by claiming the found free bit as allocated.
1043 *
1044 * To set the range of this allocation:
1045 * if there is a reservation window, only try to allocate block(s) from the
1046 * file's own reservation window;
1047 * Otherwise, the allocation range starts from the give goal block, ends at
1048 * the block group's last block.
1049 *
1050 * If we failed to allocate the desired block then we may end up crossing to a
1051 * new bitmap. In that case we must release write access to the old one via
1052 * ext4_journal_release_buffer(), else we'll run out of credits.
1053 */
1054static ext4_grpblk_t
1055ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
1056 ext4_group_t group, struct buffer_head *bitmap_bh,
1057 ext4_grpblk_t grp_goal, unsigned long *count,
1058 struct ext4_reserve_window *my_rsv)
1059{
1060 ext4_fsblk_t group_first_block;
1061 ext4_grpblk_t start, end;
1062 unsigned long num = 0;
1063
1064 /* we do allocation within the reservation window if we have a window */
1065 if (my_rsv) {
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 if (my_rsv->_rsv_start >= group_first_block)
1068 start = my_rsv->_rsv_start - group_first_block;
1069 else
1070 /* reservation window cross group boundary */
1071 start = 0;
1072 end = my_rsv->_rsv_end - group_first_block + 1;
1073 if (end > EXT4_BLOCKS_PER_GROUP(sb))
1074 /* reservation window crosses group boundary */
1075 end = EXT4_BLOCKS_PER_GROUP(sb);
1076 if ((start <= grp_goal) && (grp_goal < end))
1077 start = grp_goal;
1078 else
1079 grp_goal = -1;
1080 } else {
1081 if (grp_goal > 0)
1082 start = grp_goal;
1083 else
1084 start = 0;
1085 end = EXT4_BLOCKS_PER_GROUP(sb);
1086 }
1087
1088 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
1089
1090repeat:
1091 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
1092 grp_goal = find_next_usable_block(start, bitmap_bh, end);
1093 if (grp_goal < 0)
1094 goto fail_access;
1095 if (!my_rsv) {
1096 int i;
1097
1098 for (i = 0; i < 7 && grp_goal > start &&
1099 ext4_test_allocatable(grp_goal - 1,
1100 bitmap_bh);
1101 i++, grp_goal--)
1102 ;
1103 }
1104 }
1105 start = grp_goal;
1106
1107 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1108 grp_goal, bitmap_bh)) {
1109 /*
1110 * The block was allocated by another thread, or it was
1111 * allocated and then freed by another thread
1112 */
1113 start++;
1114 grp_goal++;
1115 if (start >= end)
1116 goto fail_access;
1117 goto repeat;
1118 }
1119 num++;
1120 grp_goal++;
1121 while (num < *count && grp_goal < end
1122 && ext4_test_allocatable(grp_goal, bitmap_bh)
1123 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
1124 grp_goal, bitmap_bh)) {
1125 num++;
1126 grp_goal++;
1127 }
1128 *count = num;
1129 return grp_goal - num;
1130fail_access:
1131 *count = num;
1132 return -1;
1133}
1134
1135/**
1136 * find_next_reservable_window():
1137 * find a reservable space within the given range.
1138 * It does not allocate the reservation window for now:
1139 * alloc_new_reservation() will do the work later.
1140 *
1141 * @search_head: the head of the searching list;
1142 * This is not necessarily the list head of the whole filesystem
1143 *
1144 * We have both head and start_block to assist the search
1145 * for the reservable space. The list starts from head,
1146 * but we will shift to the place where start_block is,
1147 * then start from there, when looking for a reservable space.
1148 *
1149 * @size: the target new reservation window size
1150 *
1151 * @group_first_block: the first block we consider to start
1152 * the real search from
1153 *
1154 * @last_block:
1155 * the maximum block number that our goal reservable space
1156 * could start from. This is normally the last block in this
1157 * group. The search will end when we found the start of next
1158 * possible reservable space is out of this boundary.
1159 * This could handle the cross boundary reservation window
1160 * request.
1161 *
1162 * basically we search from the given range, rather than the whole
1163 * reservation double linked list, (start_block, last_block)
1164 * to find a free region that is of my size and has not
1165 * been reserved.
1166 *
1167 */
1168static int find_next_reservable_window(
1169 struct ext4_reserve_window_node *search_head,
1170 struct ext4_reserve_window_node *my_rsv,
1171 struct super_block * sb,
1172 ext4_fsblk_t start_block,
1173 ext4_fsblk_t last_block)
1174{
1175 struct rb_node *next;
1176 struct ext4_reserve_window_node *rsv, *prev;
1177 ext4_fsblk_t cur;
1178 int size = my_rsv->rsv_goal_size;
1179
1180 /* TODO: make the start of the reservation window byte-aligned */
1181 /* cur = *start_block & ~7;*/
1182 cur = start_block;
1183 rsv = search_head;
1184 if (!rsv)
1185 return -1;
1186
1187 while (1) {
1188 if (cur <= rsv->rsv_end)
1189 cur = rsv->rsv_end + 1;
1190
1191 /* TODO?
1192 * in the case we could not find a reservable space
1193 * that is what is expected, during the re-search, we could
1194 * remember what's the largest reservable space we could have
1195 * and return that one.
1196 *
1197 * For now it will fail if we could not find the reservable
1198 * space with expected-size (or more)...
1199 */
1200 if (cur > last_block)
1201 return -1; /* fail */
1202
1203 prev = rsv;
1204 next = rb_next(&rsv->rsv_node);
1205 rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
1206 607
1207 /* 608 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1208 * Reached the last reservation, we can just append to the 609 EXT4_FREEBLOCKS_WATERMARK) {
1209 * previous one. 610 free_blocks = percpu_counter_sum(fbc);
1210 */ 611 dirty_blocks = percpu_counter_sum(dbc);
1211 if (!next) 612 if (dirty_blocks < 0) {
1212 break; 613 printk(KERN_CRIT "Dirty block accounting "
1213 614 "went wrong %lld\n",
1214 if (cur + size <= rsv->rsv_start) { 615 dirty_blocks);
1215 /*
1216 * Found a reserveable space big enough. We could
1217 * have a reservation across the group boundary here
1218 */
1219 break;
1220 } 616 }
1221 } 617 }
1222 /* 618 /* Check whether we have space after
1223 * we come here either : 619 * accounting for current dirty blocks
1224 * when we reach the end of the whole list,
1225 * and there is empty reservable space after last entry in the list.
1226 * append it to the end of the list.
1227 *
1228 * or we found one reservable space in the middle of the list,
1229 * return the reservation window that we could append to.
1230 * succeed.
1231 */ 620 */
621 if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
622 /* we don't have free space */
623 return -ENOSPC;
1232 624
1233 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) 625 /* Add the blocks to nblocks */
1234 rsv_window_remove(sb, my_rsv); 626 percpu_counter_add(dbc, nblocks);
1235
1236 /*
1237 * Let's book the whole avaliable window for now. We will check the
1238 * disk bitmap later and then, if there are free blocks then we adjust
1239 * the window size if it's larger than requested.
1240 * Otherwise, we will remove this node from the tree next time
1241 * call find_next_reservable_window.
1242 */
1243 my_rsv->rsv_start = cur;
1244 my_rsv->rsv_end = cur + size - 1;
1245 my_rsv->rsv_alloc_hit = 0;
1246
1247 if (prev != my_rsv)
1248 ext4_rsv_window_add(sb, my_rsv);
1249
1250 return 0; 627 return 0;
1251} 628}
1252 629
1253/** 630/**
1254 * alloc_new_reservation()--allocate a new reservation window
1255 *
1256 * To make a new reservation, we search part of the filesystem
1257 * reservation list (the list that inside the group). We try to
1258 * allocate a new reservation window near the allocation goal,
1259 * or the beginning of the group, if there is no goal.
1260 *
1261 * We first find a reservable space after the goal, then from
1262 * there, we check the bitmap for the first free block after
1263 * it. If there is no free block until the end of group, then the
1264 * whole group is full, we failed. Otherwise, check if the free
1265 * block is inside the expected reservable space, if so, we
1266 * succeed.
1267 * If the first free block is outside the reservable space, then
1268 * start from the first free block, we search for next available
1269 * space, and go on.
1270 *
1271 * on succeed, a new reservation will be found and inserted into the list
1272 * It contains at least one free block, and it does not overlap with other
1273 * reservation windows.
1274 *
1275 * failed: we failed to find a reservation window in this group
1276 *
1277 * @rsv: the reservation
1278 *
1279 * @grp_goal: The goal (group-relative). It is where the search for a
1280 * free reservable space should start from.
1281 * if we have a grp_goal(grp_goal >0 ), then start from there,
1282 * no grp_goal(grp_goal = -1), we start from the first block
1283 * of the group.
1284 *
1285 * @sb: the super block
1286 * @group: the group we are trying to allocate in
1287 * @bitmap_bh: the block group block bitmap
1288 *
1289 */
1290static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1291 ext4_grpblk_t grp_goal, struct super_block *sb,
1292 ext4_group_t group, struct buffer_head *bitmap_bh)
1293{
1294 struct ext4_reserve_window_node *search_head;
1295 ext4_fsblk_t group_first_block, group_end_block, start_block;
1296 ext4_grpblk_t first_free_block;
1297 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1298 unsigned long size;
1299 int ret;
1300 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1301
1302 group_first_block = ext4_group_first_block_no(sb, group);
1303 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1304
1305 if (grp_goal < 0)
1306 start_block = group_first_block;
1307 else
1308 start_block = grp_goal + group_first_block;
1309
1310 size = my_rsv->rsv_goal_size;
1311
1312 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1313 /*
1314 * if the old reservation is cross group boundary
1315 * and if the goal is inside the old reservation window,
1316 * we will come here when we just failed to allocate from
1317 * the first part of the window. We still have another part
1318 * that belongs to the next group. In this case, there is no
1319 * point to discard our window and try to allocate a new one
1320 * in this group(which will fail). we should
1321 * keep the reservation window, just simply move on.
1322 *
1323 * Maybe we could shift the start block of the reservation
1324 * window to the first block of next group.
1325 */
1326
1327 if ((my_rsv->rsv_start <= group_end_block) &&
1328 (my_rsv->rsv_end > group_end_block) &&
1329 (start_block >= my_rsv->rsv_start))
1330 return -1;
1331
1332 if ((my_rsv->rsv_alloc_hit >
1333 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1334 /*
1335 * if the previously allocation hit ratio is
1336 * greater than 1/2, then we double the size of
1337 * the reservation window the next time,
1338 * otherwise we keep the same size window
1339 */
1340 size = size * 2;
1341 if (size > EXT4_MAX_RESERVE_BLOCKS)
1342 size = EXT4_MAX_RESERVE_BLOCKS;
1343 my_rsv->rsv_goal_size= size;
1344 }
1345 }
1346
1347 spin_lock(rsv_lock);
1348 /*
1349 * shift the search start to the window near the goal block
1350 */
1351 search_head = search_reserve_window(fs_rsv_root, start_block);
1352
1353 /*
1354 * find_next_reservable_window() simply finds a reservable window
1355 * inside the given range(start_block, group_end_block).
1356 *
1357 * To make sure the reservation window has a free bit inside it, we
1358 * need to check the bitmap after we found a reservable window.
1359 */
1360retry:
1361 ret = find_next_reservable_window(search_head, my_rsv, sb,
1362 start_block, group_end_block);
1363
1364 if (ret == -1) {
1365 if (!rsv_is_empty(&my_rsv->rsv_window))
1366 rsv_window_remove(sb, my_rsv);
1367 spin_unlock(rsv_lock);
1368 return -1;
1369 }
1370
1371 /*
1372 * On success, find_next_reservable_window() returns the
1373 * reservation window where there is a reservable space after it.
1374 * Before we reserve this reservable space, we need
1375 * to make sure there is at least a free block inside this region.
1376 *
1377 * searching the first free bit on the block bitmap and copy of
1378 * last committed bitmap alternatively, until we found a allocatable
1379 * block. Search start from the start block of the reservable space
1380 * we just found.
1381 */
1382 spin_unlock(rsv_lock);
1383 first_free_block = bitmap_search_next_usable_block(
1384 my_rsv->rsv_start - group_first_block,
1385 bitmap_bh, group_end_block - group_first_block + 1);
1386
1387 if (first_free_block < 0) {
1388 /*
1389 * no free block left on the bitmap, no point
1390 * to reserve the space. return failed.
1391 */
1392 spin_lock(rsv_lock);
1393 if (!rsv_is_empty(&my_rsv->rsv_window))
1394 rsv_window_remove(sb, my_rsv);
1395 spin_unlock(rsv_lock);
1396 return -1; /* failed */
1397 }
1398
1399 start_block = first_free_block + group_first_block;
1400 /*
1401 * check if the first free block is within the
1402 * free space we just reserved
1403 */
1404 if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1405 return 0; /* success */
1406 /*
1407 * if the first free bit we found is out of the reservable space
1408 * continue search for next reservable space,
1409 * start from where the free block is,
1410 * we also shift the list head to where we stopped last time
1411 */
1412 search_head = my_rsv;
1413 spin_lock(rsv_lock);
1414 goto retry;
1415}
1416
1417/**
1418 * try_to_extend_reservation()
1419 * @my_rsv: given reservation window
1420 * @sb: super block
1421 * @size: the delta to extend
1422 *
1423 * Attempt to expand the reservation window large enough to have
1424 * required number of free blocks
1425 *
1426 * Since ext4_try_to_allocate() will always allocate blocks within
1427 * the reservation window range, if the window size is too small,
1428 * multiple blocks allocation has to stop at the end of the reservation
1429 * window. To make this more efficient, given the total number of
1430 * blocks needed and the current size of the window, we try to
1431 * expand the reservation window size if necessary on a best-effort
1432 * basis before ext4_new_blocks() tries to allocate blocks,
1433 */
1434static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1435 struct super_block *sb, int size)
1436{
1437 struct ext4_reserve_window_node *next_rsv;
1438 struct rb_node *next;
1439 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1440
1441 if (!spin_trylock(rsv_lock))
1442 return;
1443
1444 next = rb_next(&my_rsv->rsv_node);
1445
1446 if (!next)
1447 my_rsv->rsv_end += size;
1448 else {
1449 next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
1450
1451 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1452 my_rsv->rsv_end += size;
1453 else
1454 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1455 }
1456 spin_unlock(rsv_lock);
1457}
1458
1459/**
1460 * ext4_try_to_allocate_with_rsv()
1461 * @sb: superblock
1462 * @handle: handle to this transaction
1463 * @group: given allocation block group
1464 * @bitmap_bh: bufferhead holds the block bitmap
1465 * @grp_goal: given target block within the group
1466 * @count: target number of blocks to allocate
1467 * @my_rsv: reservation window
1468 * @errp: pointer to store the error code
1469 *
1470 * This is the main function used to allocate a new block and its reservation
1471 * window.
1472 *
1473 * Each time when a new block allocation is need, first try to allocate from
1474 * its own reservation. If it does not have a reservation window, instead of
1475 * looking for a free bit on bitmap first, then look up the reservation list to
1476 * see if it is inside somebody else's reservation window, we try to allocate a
1477 * reservation window for it starting from the goal first. Then do the block
1478 * allocation within the reservation window.
1479 *
1480 * This will avoid keeping on searching the reservation list again and
1481 * again when somebody is looking for a free block (without
1482 * reservation), and there are lots of free blocks, but they are all
1483 * being reserved.
1484 *
1485 * We use a red-black tree for the per-filesystem reservation list.
1486 *
1487 */
1488static ext4_grpblk_t
1489ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1490 ext4_group_t group, struct buffer_head *bitmap_bh,
1491 ext4_grpblk_t grp_goal,
1492 struct ext4_reserve_window_node * my_rsv,
1493 unsigned long *count, int *errp)
1494{
1495 ext4_fsblk_t group_first_block, group_last_block;
1496 ext4_grpblk_t ret = 0;
1497 int fatal;
1498 unsigned long num = *count;
1499
1500 *errp = 0;
1501
1502 /*
1503 * Make sure we use undo access for the bitmap, because it is critical
1504 * that we do the frozen_data COW on bitmap buffers in all cases even
1505 * if the buffer is in BJ_Forget state in the committing transaction.
1506 */
1507 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1508 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1509 if (fatal) {
1510 *errp = fatal;
1511 return -1;
1512 }
1513
1514 /*
1515 * we don't deal with reservation when
1516 * filesystem is mounted without reservation
1517 * or the file is not a regular file
1518 * or last attempt to allocate a block with reservation turned on failed
1519 */
1520 if (my_rsv == NULL ) {
1521 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1522 grp_goal, count, NULL);
1523 goto out;
1524 }
1525 /*
1526 * grp_goal is a group relative block number (if there is a goal)
1527 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
1528 * first block is a filesystem wide block number
1529 * first block is the block number of the first block in this group
1530 */
1531 group_first_block = ext4_group_first_block_no(sb, group);
1532 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1533
1534 /*
1535 * Basically we will allocate a new block from inode's reservation
1536 * window.
1537 *
1538 * We need to allocate a new reservation window, if:
1539 * a) inode does not have a reservation window; or
1540 * b) last attempt to allocate a block from existing reservation
1541 * failed; or
1542 * c) we come here with a goal and with a reservation window
1543 *
1544 * We do not need to allocate a new reservation window if we come here
1545 * at the beginning with a goal and the goal is inside the window, or
1546 * we don't have a goal but already have a reservation window.
1547 * then we could go to allocate from the reservation window directly.
1548 */
1549 while (1) {
1550 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1551 !goal_in_my_reservation(&my_rsv->rsv_window,
1552 grp_goal, group, sb)) {
1553 if (my_rsv->rsv_goal_size < *count)
1554 my_rsv->rsv_goal_size = *count;
1555 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1556 group, bitmap_bh);
1557 if (ret < 0)
1558 break; /* failed */
1559
1560 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1561 grp_goal, group, sb))
1562 grp_goal = -1;
1563 } else if (grp_goal >= 0) {
1564 int curr = my_rsv->rsv_end -
1565 (grp_goal + group_first_block) + 1;
1566
1567 if (curr < *count)
1568 try_to_extend_reservation(my_rsv, sb,
1569 *count - curr);
1570 }
1571
1572 if ((my_rsv->rsv_start > group_last_block) ||
1573 (my_rsv->rsv_end < group_first_block)) {
1574 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1575 BUG();
1576 }
1577 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1578 grp_goal, &num, &my_rsv->rsv_window);
1579 if (ret >= 0) {
1580 my_rsv->rsv_alloc_hit += num;
1581 *count = num;
1582 break; /* succeed */
1583 }
1584 num = *count;
1585 }
1586out:
1587 if (ret >= 0) {
1588 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1589 "bitmap block");
1590 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1591 if (fatal) {
1592 *errp = fatal;
1593 return -1;
1594 }
1595 return ret;
1596 }
1597
1598 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1599 ext4_journal_release_buffer(handle, bitmap_bh);
1600 return ret;
1601}
1602
1603/**
1604 * ext4_has_free_blocks() 631 * ext4_has_free_blocks()
1605 * @sbi: in-core super block structure. 632 * @sbi: in-core super block structure.
1606 * @nblocks: number of neeed blocks 633 * @nblocks: number of neeed blocks
@@ -1610,26 +637,34 @@ out:
1610 * On success, return nblocks 637 * On success, return nblocks
1611 */ 638 */
1612ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 639ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1613 ext4_fsblk_t nblocks) 640 s64 nblocks)
1614{ 641{
1615 ext4_fsblk_t free_blocks; 642 s64 free_blocks, dirty_blocks;
1616 ext4_fsblk_t root_blocks = 0; 643 s64 root_blocks = 0;
644 struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
645 struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
1617 646
1618 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 647 free_blocks = percpu_counter_read_positive(fbc);
648 dirty_blocks = percpu_counter_read_positive(dbc);
1619 649
1620 if (!capable(CAP_SYS_RESOURCE) && 650 if (!capable(CAP_SYS_RESOURCE) &&
1621 sbi->s_resuid != current->fsuid && 651 sbi->s_resuid != current->fsuid &&
1622 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) 652 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
1623 root_blocks = ext4_r_blocks_count(sbi->s_es); 653 root_blocks = ext4_r_blocks_count(sbi->s_es);
1624#ifdef CONFIG_SMP 654
1625 if (free_blocks - root_blocks < FBC_BATCH) 655 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
1626 free_blocks = 656 EXT4_FREEBLOCKS_WATERMARK) {
1627 percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); 657 free_blocks = percpu_counter_sum(fbc);
1628#endif 658 dirty_blocks = percpu_counter_sum(dbc);
1629 if (free_blocks - root_blocks < nblocks) 659 }
1630 return free_blocks - root_blocks; 660 if (free_blocks <= (root_blocks + dirty_blocks))
661 /* we don't have free space */
662 return 0;
663
664 if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
665 return free_blocks - (root_blocks + dirty_blocks);
1631 return nblocks; 666 return nblocks;
1632 } 667}
1633 668
1634 669
1635/** 670/**
@@ -1654,303 +689,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1654 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 689 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1655} 690}
1656 691
1657/**
1658 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1659 *
1660 * @handle: handle to this transaction
1661 * @inode: file inode
1662 * @goal: given target block(filesystem wide)
1663 * @count: target number of blocks to allocate
1664 * @errp: error code
1665 *
1666 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1667 * the block bitmap directly to do block allocation. It tries to
1668 * allocate block(s) from the block group contains the goal block first. If
1669 * that fails, it will try to allocate block(s) from other block groups
1670 * without any specific goal block.
1671 *
1672 * This function is called when -o nomballoc mount option is enabled
1673 *
1674 */
1675ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1676 ext4_fsblk_t goal, unsigned long *count, int *errp)
1677{
1678 struct buffer_head *bitmap_bh = NULL;
1679 struct buffer_head *gdp_bh;
1680 ext4_group_t group_no;
1681 ext4_group_t goal_group;
1682 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1683 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1684 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1685 ext4_group_t bgi; /* blockgroup iteration index */
1686 int fatal = 0, err;
1687 int performed_allocation = 0;
1688 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1689 struct super_block *sb;
1690 struct ext4_group_desc *gdp;
1691 struct ext4_super_block *es;
1692 struct ext4_sb_info *sbi;
1693 struct ext4_reserve_window_node *my_rsv = NULL;
1694 struct ext4_block_alloc_info *block_i;
1695 unsigned short windowsz = 0;
1696 ext4_group_t ngroups;
1697 unsigned long num = *count;
1698
1699 sb = inode->i_sb;
1700 if (!sb) {
1701 *errp = -ENODEV;
1702 printk("ext4_new_block: nonexistent device");
1703 return 0;
1704 }
1705
1706 sbi = EXT4_SB(sb);
1707 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1708 /*
1709 * With delalloc we already reserved the blocks
1710 */
1711 *count = ext4_has_free_blocks(sbi, *count);
1712 }
1713 if (*count == 0) {
1714 *errp = -ENOSPC;
1715 return 0; /*return with ENOSPC error */
1716 }
1717 num = *count;
1718
1719 /*
1720 * Check quota for allocation of this block.
1721 */
1722 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1723 *errp = -EDQUOT;
1724 return 0;
1725 }
1726
1727 sbi = EXT4_SB(sb);
1728 es = EXT4_SB(sb)->s_es;
1729 ext4_debug("goal=%llu.\n", goal);
1730 /*
1731 * Allocate a block from reservation only when
1732 * filesystem is mounted with reservation(default,-o reservation), and
1733 * it's a regular file, and
1734 * the desired window size is greater than 0 (One could use ioctl
1735 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1736 * reservation on that particular file)
1737 */
1738 block_i = EXT4_I(inode)->i_block_alloc_info;
1739 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1740 my_rsv = &block_i->rsv_window_node;
1741
1742 /*
1743 * First, test whether the goal block is free.
1744 */
1745 if (goal < le32_to_cpu(es->s_first_data_block) ||
1746 goal >= ext4_blocks_count(es))
1747 goal = le32_to_cpu(es->s_first_data_block);
1748 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1749 goal_group = group_no;
1750retry_alloc:
1751 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1752 if (!gdp)
1753 goto io_error;
1754
1755 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1756 /*
1757 * if there is not enough free blocks to make a new resevation
1758 * turn off reservation for this allocation
1759 */
1760 if (my_rsv && (free_blocks < windowsz)
1761 && (rsv_is_empty(&my_rsv->rsv_window)))
1762 my_rsv = NULL;
1763
1764 if (free_blocks > 0) {
1765 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1766 if (!bitmap_bh)
1767 goto io_error;
1768 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1769 group_no, bitmap_bh, grp_target_blk,
1770 my_rsv, &num, &fatal);
1771 if (fatal)
1772 goto out;
1773 if (grp_alloc_blk >= 0)
1774 goto allocated;
1775 }
1776
1777 ngroups = EXT4_SB(sb)->s_groups_count;
1778 smp_rmb();
1779
1780 /*
1781 * Now search the rest of the groups. We assume that
1782 * group_no and gdp correctly point to the last group visited.
1783 */
1784 for (bgi = 0; bgi < ngroups; bgi++) {
1785 group_no++;
1786 if (group_no >= ngroups)
1787 group_no = 0;
1788 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1789 if (!gdp)
1790 goto io_error;
1791 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1792 /*
1793 * skip this group if the number of
1794 * free blocks is less than half of the reservation
1795 * window size.
1796 */
1797 if (free_blocks <= (windowsz/2))
1798 continue;
1799
1800 brelse(bitmap_bh);
1801 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1802 if (!bitmap_bh)
1803 goto io_error;
1804 /*
1805 * try to allocate block(s) from this group, without a goal(-1).
1806 */
1807 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1808 group_no, bitmap_bh, -1, my_rsv,
1809 &num, &fatal);
1810 if (fatal)
1811 goto out;
1812 if (grp_alloc_blk >= 0)
1813 goto allocated;
1814 }
1815 /*
1816 * We may end up a bogus ealier ENOSPC error due to
1817 * filesystem is "full" of reservations, but
1818 * there maybe indeed free blocks avaliable on disk
1819 * In this case, we just forget about the reservations
1820 * just do block allocation as without reservations.
1821 */
1822 if (my_rsv) {
1823 my_rsv = NULL;
1824 windowsz = 0;
1825 group_no = goal_group;
1826 goto retry_alloc;
1827 }
1828 /* No space left on the device */
1829 *errp = -ENOSPC;
1830 goto out;
1831
1832allocated:
1833
1834 ext4_debug("using block group %lu(%d)\n",
1835 group_no, gdp->bg_free_blocks_count);
1836
1837 BUFFER_TRACE(gdp_bh, "get_write_access");
1838 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1839 if (fatal)
1840 goto out;
1841
1842 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1843
1844 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1845 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1846 in_range(ret_block, ext4_inode_table(sb, gdp),
1847 EXT4_SB(sb)->s_itb_per_group) ||
1848 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1849 EXT4_SB(sb)->s_itb_per_group)) {
1850 ext4_error(sb, "ext4_new_block",
1851 "Allocating block in system zone - "
1852 "blocks from %llu, length %lu",
1853 ret_block, num);
1854 /*
1855 * claim_block marked the blocks we allocated
1856 * as in use. So we may want to selectively
1857 * mark some of the blocks as free
1858 */
1859 goto retry_alloc;
1860 }
1861
1862 performed_allocation = 1;
1863
1864#ifdef CONFIG_JBD2_DEBUG
1865 {
1866 struct buffer_head *debug_bh;
1867
1868 /* Record bitmap buffer state in the newly allocated block */
1869 debug_bh = sb_find_get_block(sb, ret_block);
1870 if (debug_bh) {
1871 BUFFER_TRACE(debug_bh, "state when allocated");
1872 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1873 brelse(debug_bh);
1874 }
1875 }
1876 jbd_lock_bh_state(bitmap_bh);
1877 spin_lock(sb_bgl_lock(sbi, group_no));
1878 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1879 int i;
1880
1881 for (i = 0; i < num; i++) {
1882 if (ext4_test_bit(grp_alloc_blk+i,
1883 bh2jh(bitmap_bh)->b_committed_data)) {
1884 printk("%s: block was unexpectedly set in "
1885 "b_committed_data\n", __func__);
1886 }
1887 }
1888 }
1889 ext4_debug("found bit %d\n", grp_alloc_blk);
1890 spin_unlock(sb_bgl_lock(sbi, group_no));
1891 jbd_unlock_bh_state(bitmap_bh);
1892#endif
1893
1894 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1895 ext4_error(sb, "ext4_new_block",
1896 "block(%llu) >= blocks count(%llu) - "
1897 "block_group = %lu, es == %p ", ret_block,
1898 ext4_blocks_count(es), group_no, es);
1899 goto out;
1900 }
1901
1902 /*
1903 * It is up to the caller to add the new buffer to a journal
1904 * list of some description. We don't know in advance whether
1905 * the caller wants to use it as metadata or data.
1906 */
1907 spin_lock(sb_bgl_lock(sbi, group_no));
1908 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1909 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1910 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1911 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1912 spin_unlock(sb_bgl_lock(sbi, group_no));
1913 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1914 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1915
1916 if (sbi->s_log_groups_per_flex) {
1917 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1918 spin_lock(sb_bgl_lock(sbi, flex_group));
1919 sbi->s_flex_groups[flex_group].free_blocks -= num;
1920 spin_unlock(sb_bgl_lock(sbi, flex_group));
1921 }
1922
1923 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1924 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1925 if (!fatal)
1926 fatal = err;
1927
1928 sb->s_dirt = 1;
1929 if (fatal)
1930 goto out;
1931
1932 *errp = 0;
1933 brelse(bitmap_bh);
1934 DQUOT_FREE_BLOCK(inode, *count-num);
1935 *count = num;
1936 return ret_block;
1937
1938io_error:
1939 *errp = -EIO;
1940out:
1941 if (fatal) {
1942 *errp = fatal;
1943 ext4_std_error(sb, fatal);
1944 }
1945 /*
1946 * Undo the block allocation
1947 */
1948 if (!performed_allocation)
1949 DQUOT_FREE_BLOCK(inode, *count);
1950 brelse(bitmap_bh);
1951 return 0;
1952}
1953
1954#define EXT4_META_BLOCK 0x1 692#define EXT4_META_BLOCK 0x1
1955 693
1956static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, 694static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1960,10 +698,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1960 struct ext4_allocation_request ar; 698 struct ext4_allocation_request ar;
1961 ext4_fsblk_t ret; 699 ext4_fsblk_t ret;
1962 700
1963 if (!test_opt(inode->i_sb, MBALLOC)) {
1964 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1965 }
1966
1967 memset(&ar, 0, sizeof(ar)); 701 memset(&ar, 0, sizeof(ar));
1968 /* Fill with neighbour allocated blocks */ 702 /* Fill with neighbour allocated blocks */
1969 703
@@ -2005,7 +739,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2005 /* 739 /*
2006 * Account for the allocated meta blocks 740 * Account for the allocated meta blocks
2007 */ 741 */
2008 if (!(*errp)) { 742 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
2009 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 743 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2010 EXT4_I(inode)->i_allocated_meta_blocks += *count; 744 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2011 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 745 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2090,10 +824,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2090 bitmap_count += x; 824 bitmap_count += x;
2091 } 825 }
2092 brelse(bitmap_bh); 826 brelse(bitmap_bh);
2093 printk("ext4_count_free_blocks: stored = %llu" 827 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
2094 ", computed = %llu, %llu\n", 828 ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
2095 ext4_free_blocks_count(es), 829 desc_count, bitmap_count);
2096 desc_count, bitmap_count);
2097 return bitmap_count; 830 return bitmap_count;
2098#else 831#else
2099 desc_count = 0; 832 desc_count = 0;
@@ -2180,8 +913,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2180 913
2181 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || 914 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2182 metagroup < first_meta_bg) 915 metagroup < first_meta_bg)
2183 return ext4_bg_num_gdb_nometa(sb,group); 916 return ext4_bg_num_gdb_nometa(sb, group);
2184 917
2185 return ext4_bg_num_gdb_meta(sb,group); 918 return ext4_bg_num_gdb_meta(sb,group);
2186 919
2187} 920}
921
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea6750454..0a7a6663c190 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return 0;
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum); 28 return sum;
29} 29}
30 30
31#endif /* EXT4FS_DEBUG */ 31#endif /* EXT4FS_DEBUG */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d3d23d73c08b..3ca6a2b7632d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
33}; 33};
34 34
35static int ext4_readdir(struct file *, void *, filldir_t); 35static int ext4_readdir(struct file *, void *, filldir_t);
36static int ext4_dx_readdir(struct file * filp, 36static int ext4_dx_readdir(struct file *filp,
37 void * dirent, filldir_t filldir); 37 void *dirent, filldir_t filldir);
38static int ext4_release_dir (struct inode * inode, 38static int ext4_release_dir(struct inode *inode,
39 struct file * filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry (const char * function, struct inode * dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 * de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head * bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error (dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
93 93
94static int ext4_readdir(struct file * filp, 94static int ext4_readdir(struct file *filp,
95 void * dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
102 int err; 102 int err;
103 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0;
105 106
106 sb = inode->i_sb; 107 sb = inode->i_sb;
107 108
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
148 * of recovering data when there's a bad sector 149 * of recovering data when there's a bad sector
149 */ 150 */
150 if (!bh) { 151 if (!bh) {
151 ext4_error (sb, "ext4_readdir", 152 if (!dir_has_error) {
152 "directory #%lu contains a hole at offset %lu", 153 ext4_error(sb, __func__, "directory #%lu "
153 inode->i_ino, (unsigned long)filp->f_pos); 154 "contains a hole at offset %Lu",
155 inode->i_ino,
156 (unsigned long long) filp->f_pos);
157 dir_has_error = 1;
158 }
154 /* corrupt size? Maybe no more blocks to read */ 159 /* corrupt size? Maybe no more blocks to read */
155 if (filp->f_pos > inode->i_blocks << 9) 160 if (filp->f_pos > inode->i_blocks << 9)
156 break; 161 break;
@@ -187,14 +192,14 @@ revalidate:
187 while (!error && filp->f_pos < inode->i_size 192 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) { 193 && offset < sb->s_blocksize) {
189 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 194 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext4_check_dir_entry ("ext4_readdir", inode, de, 195 if (!ext4_check_dir_entry("ext4_readdir", inode, de,
191 bh, offset)) { 196 bh, offset)) {
192 /* 197 /*
193 * On error, skip the f_pos to the next block 198 * On error, skip the f_pos to the next block
194 */ 199 */
195 filp->f_pos = (filp->f_pos | 200 filp->f_pos = (filp->f_pos |
196 (sb->s_blocksize - 1)) + 1; 201 (sb->s_blocksize - 1)) + 1;
197 brelse (bh); 202 brelse(bh);
198 ret = stored; 203 ret = stored;
199 goto out; 204 goto out;
200 } 205 }
@@ -218,12 +223,12 @@ revalidate:
218 break; 223 break;
219 if (version != filp->f_version) 224 if (version != filp->f_version)
220 goto revalidate; 225 goto revalidate;
221 stored ++; 226 stored++;
222 } 227 }
223 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
224 } 229 }
225 offset = 0; 230 offset = 0;
226 brelse (bh); 231 brelse(bh);
227 } 232 }
228out: 233out:
229 return ret; 234 return ret;
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
290 parent = rb_parent(n); 295 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash); 296 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) { 297 while (fname) {
293 struct fname * old = fname; 298 struct fname *old = fname;
294 fname = fname->next; 299 fname = fname->next;
295 kfree (old); 300 kfree(old);
296 } 301 }
297 if (!parent) 302 if (!parent)
298 root->rb_node = NULL; 303 root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
331 struct ext4_dir_entry_2 *dirent) 336 struct ext4_dir_entry_2 *dirent)
332{ 337{
333 struct rb_node **p, *parent = NULL; 338 struct rb_node **p, *parent = NULL;
334 struct fname * fname, *new_fn; 339 struct fname *fname, *new_fn;
335 struct dir_private_info *info; 340 struct dir_private_info *info;
336 int len; 341 int len;
337 342
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
388 * for all entres on the fname linked list. (Normally there is only 393 * for all entres on the fname linked list. (Normally there is only
389 * one entry on the linked list, unless there are 62 bit hash collisions.) 394 * one entry on the linked list, unless there are 62 bit hash collisions.)
390 */ 395 */
391static int call_filldir(struct file * filp, void * dirent, 396static int call_filldir(struct file *filp, void *dirent,
392 filldir_t filldir, struct fname *fname) 397 filldir_t filldir, struct fname *fname)
393{ 398{
394 struct dir_private_info *info = filp->private_data; 399 struct dir_private_info *info = filp->private_data;
395 loff_t curr_pos; 400 loff_t curr_pos;
396 struct inode *inode = filp->f_path.dentry->d_inode; 401 struct inode *inode = filp->f_path.dentry->d_inode;
397 struct super_block * sb; 402 struct super_block *sb;
398 int error; 403 int error;
399 404
400 sb = inode->i_sb; 405 sb = inode->i_sb;
401 406
402 if (!fname) { 407 if (!fname) {
403 printk("call_filldir: called with null fname?!?\n"); 408 printk(KERN_ERR "ext4: call_filldir: called with "
409 "null fname?!?\n");
404 return 0; 410 return 0;
405 } 411 }
406 curr_pos = hash2pos(fname->hash, fname->minor_hash); 412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -411,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent,
411 get_dtype(sb, fname->file_type)); 417 get_dtype(sb, fname->file_type));
412 if (error) { 418 if (error) {
413 filp->f_pos = curr_pos; 419 filp->f_pos = curr_pos;
414 info->extra_fname = fname->next; 420 info->extra_fname = fname;
415 return error; 421 return error;
416 } 422 }
417 fname = fname->next; 423 fname = fname->next;
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
419 return 0; 425 return 0;
420} 426}
421 427
422static int ext4_dx_readdir(struct file * filp, 428static int ext4_dx_readdir(struct file *filp,
423 void * dirent, filldir_t filldir) 429 void *dirent, filldir_t filldir)
424{ 430{
425 struct dir_private_info *info = filp->private_data; 431 struct dir_private_info *info = filp->private_data;
426 struct inode *inode = filp->f_path.dentry->d_inode; 432 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -450,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp,
450 * If there are any leftover names on the hash collision 456 * If there are any leftover names on the hash collision
451 * chain, return them first. 457 * chain, return them first.
452 */ 458 */
453 if (info->extra_fname && 459 if (info->extra_fname) {
454 call_filldir(filp, dirent, filldir, info->extra_fname)) 460 if (call_filldir(filp, dirent, filldir, info->extra_fname))
455 goto finished; 461 goto finished;
456 462
457 if (!info->curr_node) 463 info->extra_fname = NULL;
464 info->curr_node = rb_next(info->curr_node);
465 if (!info->curr_node) {
466 if (info->next_hash == ~0) {
467 filp->f_pos = EXT4_HTREE_EOF;
468 goto finished;
469 }
470 info->curr_hash = info->next_hash;
471 info->curr_minor_hash = 0;
472 }
473 } else if (!info->curr_node)
458 info->curr_node = rb_first(&info->root); 474 info->curr_node = rb_first(&info->root);
459 475
460 while (1) { 476 while (1) {
@@ -501,7 +517,7 @@ finished:
501 return 0; 517 return 0;
502} 518}
503 519
504static int ext4_release_dir (struct inode * inode, struct file * filp) 520static int ext4_release_dir(struct inode *inode, struct file *filp)
505{ 521{
506 if (filp->private_data) 522 if (filp->private_data)
507 ext4_htree_free_dir_info(filp->private_data); 523 ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c7924d9e358..4880cc3e6727 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
44#ifdef EXT4FS_DEBUG 44#ifdef EXT4FS_DEBUG
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __func__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk(KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
52#define ext4_debug(f, a...) do {} while (0) 52#define ext4_debug(f, a...) do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
128#else 128#else
129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
130#endif 130#endif
131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) 131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
134#else 134#else
@@ -245,7 +245,7 @@ struct flex_groups {
245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
246 246
247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
248#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 248#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
249 249
250/* 250/*
251 * Inode dynamic state flags 251 * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS 291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
292#define EXT4_IOC_GETVERSION _IOR('f', 3, long) 292#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
293#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 293#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
294#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
295#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
296#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 294#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
297#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 295#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
298#ifdef CONFIG_JBD2_DEBUG 296#ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
300#endif 298#endif
301#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 299#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
302#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 300#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
303#define EXT4_IOC_MIGRATE _IO('f', 7) 301#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
302#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
303#define EXT4_IOC_MIGRATE _IO('f', 9)
304 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
304 305
305/* 306/*
306 * ioctl commands in 32 bit emulation 307 * ioctl commands in 32 bit emulation
@@ -510,7 +511,6 @@ do { \
510/* 511/*
511 * Mount flags 512 * Mount flags
512 */ 513 */
513#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
514#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ 514#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
515#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ 515#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
516#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ 516#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
@@ -538,8 +538,9 @@ do { \
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 541#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
542#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
543
543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 544/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
544#ifndef _LINUX_EXT2_FS_H 545#ifndef _LINUX_EXT2_FS_H
545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 546#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -667,7 +668,7 @@ struct ext4_super_block {
667}; 668};
668 669
669#ifdef __KERNEL__ 670#ifdef __KERNEL__
670static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) 671static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
671{ 672{
672 return sb->s_fs_info; 673 return sb->s_fs_info;
673} 674}
@@ -725,11 +726,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
725 */ 726 */
726 727
727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
728 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) 729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
730 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) 731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
732 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) 733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -789,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
789#define EXT4_DEF_RESUID 0 790#define EXT4_DEF_RESUID 0
790#define EXT4_DEF_RESGID 0 791#define EXT4_DEF_RESGID 0
791 792
793#define EXT4_DEF_INODE_READAHEAD_BLKS 32
794
792/* 795/*
793 * Default mount options 796 * Default mount options
794 */ 797 */
@@ -954,6 +957,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
954void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
955 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
956 959
960extern struct proc_dir_entry *ext4_proc_root;
961
962#ifdef CONFIG_PROC_FS
963extern const struct file_operations ext4_ui_proc_fops;
964
965#define EXT4_PROC_HANDLER(name, var) \
966do { \
967 proc = proc_create_data(name, mode, sbi->s_proc, \
968 &ext4_ui_proc_fops, &sbi->s_##var); \
969 if (proc == NULL) { \
970 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
971 goto err_out; \
972 } \
973} while (0)
974#else
975#define EXT4_PROC_HANDLER(name, var)
976#endif
977
957/* 978/*
958 * Function prototypes 979 * Function prototypes
959 */ 980 */
@@ -981,23 +1002,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal, 1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp); 1004 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, 1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 1006extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks); 1007 s64 nblocks);
988extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 1008extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
989 ext4_fsblk_t block, unsigned long count, int metadata); 1009 ext4_fsblk_t block, unsigned long count, int metadata);
990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 1010extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
991 ext4_fsblk_t block, unsigned long count, 1011 ext4_fsblk_t block, unsigned long count,
992 unsigned long *pdquot_freed_blocks); 1012 unsigned long *pdquot_freed_blocks);
993extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); 1013extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
994extern void ext4_check_blocks_bitmap (struct super_block *); 1014extern void ext4_check_blocks_bitmap(struct super_block *);
995extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1015extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
996 ext4_group_t block_group, 1016 ext4_group_t block_group,
997 struct buffer_head ** bh); 1017 struct buffer_head ** bh);
998extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1018extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
999extern void ext4_init_block_alloc_info(struct inode *);
1000extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
1001 1019
1002/* dir.c */ 1020/* dir.c */
1003extern int ext4_check_dir_entry(const char *, struct inode *, 1021extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1027,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1009extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1027extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1010 1028
1011/* fsync.c */ 1029/* fsync.c */
1012extern int ext4_sync_file (struct file *, struct dentry *, int); 1030extern int ext4_sync_file(struct file *, struct dentry *, int);
1013 1031
1014/* hash.c */ 1032/* hash.c */
1015extern int ext4fs_dirhash(const char *name, int len, struct 1033extern int ext4fs_dirhash(const char *name, int len, struct
1016 dx_hash_info *hinfo); 1034 dx_hash_info *hinfo);
1017 1035
1018/* ialloc.c */ 1036/* ialloc.c */
1019extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); 1037extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
1020extern void ext4_free_inode (handle_t *, struct inode *); 1038extern void ext4_free_inode(handle_t *, struct inode *);
1021extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); 1039extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1022extern unsigned long ext4_count_free_inodes (struct super_block *); 1040extern unsigned long ext4_count_free_inodes(struct super_block *);
1023extern unsigned long ext4_count_dirs (struct super_block *); 1041extern unsigned long ext4_count_dirs(struct super_block *);
1024extern void ext4_check_inodes_bitmap (struct super_block *); 1042extern void ext4_check_inodes_bitmap(struct super_block *);
1025extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 1043extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1026 1044
1027/* mballoc.c */ 1045/* mballoc.c */
1028extern long ext4_mb_stats; 1046extern long ext4_mb_stats;
@@ -1032,7 +1050,7 @@ extern int ext4_mb_release(struct super_block *);
1032extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1050extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1033 struct ext4_allocation_request *, int *); 1051 struct ext4_allocation_request *, int *);
1034extern int ext4_mb_reserve_blocks(struct super_block *, int); 1052extern int ext4_mb_reserve_blocks(struct super_block *, int);
1035extern void ext4_mb_discard_inode_preallocations(struct inode *); 1053extern void ext4_discard_preallocations(struct inode *);
1036extern int __init init_ext4_mballoc(void); 1054extern int __init init_ext4_mballoc(void);
1037extern void exit_ext4_mballoc(void); 1055extern void exit_ext4_mballoc(void);
1038extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1056extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,39 +1068,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1050 ext4_lblk_t, int, int *); 1068 ext4_lblk_t, int, int *);
1051struct buffer_head *ext4_bread(handle_t *, struct inode *, 1069struct buffer_head *ext4_bread(handle_t *, struct inode *,
1052 ext4_lblk_t, int, int *); 1070 ext4_lblk_t, int, int *);
1071int ext4_get_block(struct inode *inode, sector_t iblock,
1072 struct buffer_head *bh_result, int create);
1053int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 1073int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1054 ext4_lblk_t iblock, unsigned long maxblocks, 1074 ext4_lblk_t iblock, unsigned long maxblocks,
1055 struct buffer_head *bh_result, 1075 struct buffer_head *bh_result,
1056 int create, int extend_disksize); 1076 int create, int extend_disksize);
1057 1077
1058extern struct inode *ext4_iget(struct super_block *, unsigned long); 1078extern struct inode *ext4_iget(struct super_block *, unsigned long);
1059extern int ext4_write_inode (struct inode *, int); 1079extern int ext4_write_inode(struct inode *, int);
1060extern int ext4_setattr (struct dentry *, struct iattr *); 1080extern int ext4_setattr(struct dentry *, struct iattr *);
1061extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1081extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1062 struct kstat *stat); 1082 struct kstat *stat);
1063extern void ext4_delete_inode (struct inode *); 1083extern void ext4_delete_inode(struct inode *);
1064extern int ext4_sync_inode (handle_t *, struct inode *); 1084extern int ext4_sync_inode(handle_t *, struct inode *);
1065extern void ext4_discard_reservation (struct inode *);
1066extern void ext4_dirty_inode(struct inode *); 1085extern void ext4_dirty_inode(struct inode *);
1067extern int ext4_change_inode_journal_flag(struct inode *, int); 1086extern int ext4_change_inode_journal_flag(struct inode *, int);
1068extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1087extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1069extern int ext4_can_truncate(struct inode *inode); 1088extern int ext4_can_truncate(struct inode *inode);
1070extern void ext4_truncate (struct inode *); 1089extern void ext4_truncate(struct inode *);
1071extern void ext4_set_inode_flags(struct inode *); 1090extern void ext4_set_inode_flags(struct inode *);
1072extern void ext4_get_inode_flags(struct ext4_inode_info *); 1091extern void ext4_get_inode_flags(struct ext4_inode_info *);
1073extern void ext4_set_aops(struct inode *inode); 1092extern void ext4_set_aops(struct inode *inode);
1074extern int ext4_writepage_trans_blocks(struct inode *); 1093extern int ext4_writepage_trans_blocks(struct inode *);
1094extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1095extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1075extern int ext4_block_truncate_page(handle_t *handle, 1096extern int ext4_block_truncate_page(handle_t *handle,
1076 struct address_space *mapping, loff_t from); 1097 struct address_space *mapping, loff_t from);
1077extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); 1098extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1078 1099
1079/* ioctl.c */ 1100/* ioctl.c */
1080extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1101extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1081extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); 1102extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1082 1103
1083/* migrate.c */ 1104/* migrate.c */
1084extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, 1105extern int ext4_ext_migrate(struct inode *);
1085 unsigned long);
1086/* namei.c */ 1106/* namei.c */
1087extern int ext4_orphan_add(handle_t *, struct inode *); 1107extern int ext4_orphan_add(handle_t *, struct inode *);
1088extern int ext4_orphan_del(handle_t *, struct inode *); 1108extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1097,14 +1117,14 @@ extern int ext4_group_extend(struct super_block *sb,
1097 ext4_fsblk_t n_blocks_count); 1117 ext4_fsblk_t n_blocks_count);
1098 1118
1099/* super.c */ 1119/* super.c */
1100extern void ext4_error (struct super_block *, const char *, const char *, ...) 1120extern void ext4_error(struct super_block *, const char *, const char *, ...)
1101 __attribute__ ((format (printf, 3, 4))); 1121 __attribute__ ((format (printf, 3, 4)));
1102extern void __ext4_std_error (struct super_block *, const char *, int); 1122extern void __ext4_std_error(struct super_block *, const char *, int);
1103extern void ext4_abort (struct super_block *, const char *, const char *, ...) 1123extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1104 __attribute__ ((format (printf, 3, 4))); 1124 __attribute__ ((format (printf, 3, 4)));
1105extern void ext4_warning (struct super_block *, const char *, const char *, ...) 1125extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1106 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1107extern void ext4_update_dynamic_rev (struct super_block *sb); 1127extern void ext4_update_dynamic_rev(struct super_block *sb);
1108extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1128extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1109 __u32 compat); 1129 __u32 compat);
1110extern int ext4_update_rocompat_feature(handle_t *handle, 1130extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1177,7 +1197,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1177 1197
1178static inline 1198static inline
1179struct ext4_group_info *ext4_get_group_info(struct super_block *sb, 1199struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1180 ext4_group_t group) 1200 ext4_group_t group)
1181{ 1201{
1182 struct ext4_group_info ***grp_info; 1202 struct ext4_group_info ***grp_info;
1183 long indexv, indexh; 1203 long indexv, indexh;
@@ -1205,6 +1225,28 @@ do { \
1205 __ext4_std_error((sb), __func__, (errno)); \ 1225 __ext4_std_error((sb), __func__, (errno)); \
1206} while (0) 1226} while (0)
1207 1227
1228#ifdef CONFIG_SMP
1229/* Each CPU can accumulate FBC_BATCH blocks in their local
1230 * counters. So we need to make sure we have free blocks more
1231 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
1232 */
1233#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
1234#else
1235#define EXT4_FREEBLOCKS_WATERMARK 0
1236#endif
1237
1238static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1239{
1240 /*
1241 * XXX: replace with spinlock if seen contended -bzzz
1242 */
1243 down_write(&EXT4_I(inode)->i_data_sem);
1244 if (newsize > EXT4_I(inode)->i_disksize)
1245 EXT4_I(inode)->i_disksize = newsize;
1246 up_write(&EXT4_I(inode)->i_data_sem);
1247 return ;
1248}
1249
1208/* 1250/*
1209 * Inodes and files operations 1251 * Inodes and files operations
1210 */ 1252 */
@@ -1227,6 +1269,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1227/* extents.c */ 1269/* extents.c */
1228extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1270extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1229extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1271extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1272extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1273 int chunk);
1230extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1274extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1231 ext4_lblk_t iblock, 1275 ext4_lblk_t iblock,
1232 unsigned long max_blocks, struct buffer_head *bh_result, 1276 unsigned long max_blocks, struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6c166c0a54b7..bec7ce59fc0d 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
124#define EXT4_EXT_CACHE_GAP 1 124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2 125#define EXT4_EXT_CACHE_EXTENT 2
126 126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 struct ext4_extent *, void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
127 140
128#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
129 142
@@ -216,12 +229,16 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
216extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 229extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
217extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 230extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
218extern int ext4_extent_tree_init(handle_t *, struct inode *); 231extern int ext4_extent_tree_init(handle_t *, struct inode *);
219extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); 232extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
233 int num,
234 struct ext4_ext_path *path);
220extern int ext4_ext_try_to_merge(struct inode *inode, 235extern int ext4_ext_try_to_merge(struct inode *inode,
221 struct ext4_ext_path *path, 236 struct ext4_ext_path *path,
222 struct ext4_extent *); 237 struct ext4_extent *);
223extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 238extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
224extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 239extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
240extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
241 ext_prepare_callback, void *);
225extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 242extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
226 struct ext4_ext_path *); 243 struct ext4_ext_path *);
227extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, 244extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e4..5c124c0ac6d3 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned long ext4_group_t;
35 35
36struct ext4_reserve_window {
37 ext4_fsblk_t _rsv_start; /* First byte reserved */
38 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
39};
40
41struct ext4_reserve_window_node {
42 struct rb_node rsv_node;
43 __u32 rsv_goal_size;
44 __u32 rsv_alloc_hit;
45 struct ext4_reserve_window rsv_window;
46};
47
48struct ext4_block_alloc_info {
49 /* information about reservation window */
50 struct ext4_reserve_window_node rsv_window_node;
51 /*
52 * was i_next_alloc_block in ext4_inode_info
53 * is the logical (file-relative) number of the
54 * most-recently-allocated block in this file.
55 * We use this for detecting linearly ascending allocation requests.
56 */
57 ext4_lblk_t last_alloc_logical_block;
58 /*
59 * Was i_next_alloc_goal in ext4_inode_info
60 * is the *physical* companion to i_next_alloc_block.
61 * it the physical block number of the block which was most-recentl
62 * allocated to this file. This give us the goal (target) for the next
63 * allocation when we detect linearly ascending requests.
64 */
65 ext4_fsblk_t last_alloc_physical_block;
66};
67
68#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
69#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
70 38
@@ -97,11 +65,8 @@ struct ext4_inode_info {
97 ext4_group_t i_block_group; 65 ext4_group_t i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */ 66 __u32 i_state; /* Dynamic state flags for ext4 */
99 67
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 ext4_lblk_t i_dir_start_lookup; 68 ext4_lblk_t i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR 69#ifdef CONFIG_EXT4_FS_XATTR
105 /* 70 /*
106 * Extended attributes can be read independently of the main file 71 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention 72 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
111 */ 76 */
112 struct rw_semaphore xattr_sem; 77 struct rw_semaphore xattr_sem;
113#endif 78#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 79#ifdef CONFIG_EXT4_FS_POSIX_ACL
115 struct posix_acl *i_acl; 80 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl; 81 struct posix_acl *i_default_acl;
117#endif 82#endif
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index eb8bc3afe6e9..b455c685a98b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
53 53
54/*
55 * Define the number of metadata blocks we need to account to modify data.
56 *
57 * This includes the super block, inode block, quota blocks and xattr blocks
58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
61
54/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
55 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
56 * generous. We can grow the delete transaction later if necessary. */ 64 * generous. We can grow the delete transaction later if necessary. */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d5531..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
40 unsigned long s_blocks_last; /* Last seen block count */ 40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */ 42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ 43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc; 44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt; 45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block; 46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid; 47 uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
59 struct percpu_counter s_freeblocks_counter; 60 struct percpu_counter s_freeblocks_counter;
60 struct percpu_counter s_freeinodes_counter; 61 struct percpu_counter s_freeinodes_counter;
61 struct percpu_counter s_dirs_counter; 62 struct percpu_counter s_dirs_counter;
63 struct percpu_counter s_dirtyblocks_counter;
62 struct blockgroup_lock s_blockgroup_lock; 64 struct blockgroup_lock s_blockgroup_lock;
65 struct proc_dir_entry *s_proc;
63 66
64 /* root of the per fs reservation window tree */ 67 /* root of the per fs reservation window tree */
65 spinlock_t s_rsv_window_lock; 68 spinlock_t s_rsv_window_lock;
66 struct rb_root s_rsv_window_root; 69 struct rb_root s_rsv_window_root;
67 struct ext4_reserve_window_node s_rsv_window_head;
68 70
69 /* Journaling */ 71 /* Journaling */
70 struct inode * s_journal_inode; 72 struct inode *s_journal_inode;
71 struct journal_s * s_journal; 73 struct journal_s *s_journal;
72 struct list_head s_orphan; 74 struct list_head s_orphan;
73 unsigned long s_commit_interval; 75 unsigned long s_commit_interval;
74 struct block_device *journal_bdev; 76 struct block_device *journal_bdev;
@@ -97,21 +99,18 @@ struct ext4_sb_info {
97 struct inode *s_buddy_cache; 99 struct inode *s_buddy_cache;
98 long s_blocks_reserved; 100 long s_blocks_reserved;
99 spinlock_t s_reserve_lock; 101 spinlock_t s_reserve_lock;
100 struct list_head s_active_transaction;
101 struct list_head s_closed_transaction;
102 struct list_head s_committed_transaction;
103 spinlock_t s_md_lock; 102 spinlock_t s_md_lock;
104 tid_t s_last_transaction; 103 tid_t s_last_transaction;
105 unsigned short *s_mb_offsets, *s_mb_maxs; 104 unsigned short *s_mb_offsets, *s_mb_maxs;
106 105
107 /* tunables */ 106 /* tunables */
108 unsigned long s_stripe; 107 unsigned long s_stripe;
109 unsigned long s_mb_stream_request; 108 unsigned int s_mb_stream_request;
110 unsigned long s_mb_max_to_scan; 109 unsigned int s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan; 110 unsigned int s_mb_min_to_scan;
112 unsigned long s_mb_stats; 111 unsigned int s_mb_stats;
113 unsigned long s_mb_order2_reqs; 112 unsigned int s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc; 113 unsigned int s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */ 114 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group; 115 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start; 116 unsigned long s_mb_last_start;
@@ -121,7 +120,6 @@ struct ext4_sb_info {
121 int s_mb_history_cur; 120 int s_mb_history_cur;
122 int s_mb_history_max; 121 int s_mb_history_max;
123 int s_mb_history_num; 122 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock; 123 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter; 124 int s_mb_history_filter;
127 125
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 612c3d2c3824..ea2ce3c0ae66 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/falloc.h> 41#include <linux/falloc.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "ext4_extents.h" 45#include "ext4_extents.h"
45 46
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
383 ext_debug("\n"); 384 ext_debug("\n");
384} 385}
385#else 386#else
386#define ext4_ext_show_path(inode,path) 387#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode,path) 388#define ext4_ext_show_leaf(inode, path)
388#endif 389#endif
389 390
390void ext4_ext_drop_refs(struct ext4_ext_path *path) 391void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
440 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 441 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441 if (k != 0 && 442 if (k != 0 &&
442 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 443 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443 printk("k=%d, ix=0x%p, first=0x%p\n", k, 444 printk(KERN_DEBUG "k=%d, ix=0x%p, "
444 ix, EXT_FIRST_INDEX(eh)); 445 "first=0x%p\n", k,
445 printk("%u <= %u\n", 446 ix, EXT_FIRST_INDEX(eh));
447 printk(KERN_DEBUG "%u <= %u\n",
446 le32_to_cpu(ix->ei_block), 448 le32_to_cpu(ix->ei_block),
447 le32_to_cpu(ix[-1].ei_block)); 449 le32_to_cpu(ix[-1].ei_block));
448 } 450 }
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1475 struct ext4_ext_path *path, 1477 struct ext4_ext_path *path,
1476 struct ext4_extent *newext) 1478 struct ext4_extent *newext)
1477{ 1479{
1478 struct ext4_extent_header * eh; 1480 struct ext4_extent_header *eh;
1479 struct ext4_extent *ex, *fex; 1481 struct ext4_extent *ex, *fex;
1480 struct ext4_extent *nearex; /* nearest extent */ 1482 struct ext4_extent *nearex; /* nearest extent */
1481 struct ext4_ext_path *npath = NULL; 1483 struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
1625 return err; 1627 return err;
1626} 1628}
1627 1629
1630int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1631 ext4_lblk_t num, ext_prepare_callback func,
1632 void *cbdata)
1633{
1634 struct ext4_ext_path *path = NULL;
1635 struct ext4_ext_cache cbex;
1636 struct ext4_extent *ex;
1637 ext4_lblk_t next, start = 0, end = 0;
1638 ext4_lblk_t last = block + num;
1639 int depth, exists, err = 0;
1640
1641 BUG_ON(func == NULL);
1642 BUG_ON(inode == NULL);
1643
1644 while (block < last && block != EXT_MAX_BLOCK) {
1645 num = last - block;
1646 /* find extent for this block */
1647 path = ext4_ext_find_extent(inode, block, path);
1648 if (IS_ERR(path)) {
1649 err = PTR_ERR(path);
1650 path = NULL;
1651 break;
1652 }
1653
1654 depth = ext_depth(inode);
1655 BUG_ON(path[depth].p_hdr == NULL);
1656 ex = path[depth].p_ext;
1657 next = ext4_ext_next_allocated_block(path);
1658
1659 exists = 0;
1660 if (!ex) {
1661 /* there is no extent yet, so try to allocate
1662 * all requested space */
1663 start = block;
1664 end = block + num;
1665 } else if (le32_to_cpu(ex->ee_block) > block) {
1666 /* need to allocate space before found extent */
1667 start = block;
1668 end = le32_to_cpu(ex->ee_block);
1669 if (block + num < end)
1670 end = block + num;
1671 } else if (block >= le32_to_cpu(ex->ee_block)
1672 + ext4_ext_get_actual_len(ex)) {
1673 /* need to allocate space after found extent */
1674 start = block;
1675 end = block + num;
1676 if (end >= next)
1677 end = next;
1678 } else if (block >= le32_to_cpu(ex->ee_block)) {
1679 /*
1680 * some part of requested space is covered
1681 * by found extent
1682 */
1683 start = block;
1684 end = le32_to_cpu(ex->ee_block)
1685 + ext4_ext_get_actual_len(ex);
1686 if (block + num < end)
1687 end = block + num;
1688 exists = 1;
1689 } else {
1690 BUG();
1691 }
1692 BUG_ON(end <= start);
1693
1694 if (!exists) {
1695 cbex.ec_block = start;
1696 cbex.ec_len = end - start;
1697 cbex.ec_start = 0;
1698 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1699 } else {
1700 cbex.ec_block = le32_to_cpu(ex->ee_block);
1701 cbex.ec_len = ext4_ext_get_actual_len(ex);
1702 cbex.ec_start = ext_pblock(ex);
1703 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1704 }
1705
1706 BUG_ON(cbex.ec_len == 0);
1707 err = func(inode, path, &cbex, ex, cbdata);
1708 ext4_ext_drop_refs(path);
1709
1710 if (err < 0)
1711 break;
1712
1713 if (err == EXT_REPEAT)
1714 continue;
1715 else if (err == EXT_BREAK) {
1716 err = 0;
1717 break;
1718 }
1719
1720 if (ext_depth(inode) != depth) {
1721 /* depth was changed. we have to realloc path */
1722 kfree(path);
1723 path = NULL;
1724 }
1725
1726 block = cbex.ec_block + cbex.ec_len;
1727 }
1728
1729 if (path) {
1730 ext4_ext_drop_refs(path);
1731 kfree(path);
1732 }
1733
1734 return err;
1735}
1736
1628static void 1737static void
1629ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1738ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1630 __u32 len, ext4_fsblk_t start, int type) 1739 __u32 len, ext4_fsblk_t start, int type)
@@ -1747,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1747} 1856}
1748 1857
1749/* 1858/*
1750 * ext4_ext_calc_credits_for_insert: 1859 * ext4_ext_calc_credits_for_single_extent:
1751 * This routine returns max. credits that the extent tree can consume. 1860 * This routine returns max. credits that needed to insert an extent
1752 * It should be OK for low-performance paths like ->writepage() 1861 * to the extent tree.
1753 * To allow many writing processes to fit into a single transaction, 1862 * When pass the actual path, the caller should calculate credits
1754 * the caller should calculate credits under i_data_sem and 1863 * under i_data_sem.
1755 * pass the actual path.
1756 */ 1864 */
1757int ext4_ext_calc_credits_for_insert(struct inode *inode, 1865int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1758 struct ext4_ext_path *path) 1866 struct ext4_ext_path *path)
1759{ 1867{
1760 int depth, needed;
1761
1762 if (path) { 1868 if (path) {
1869 int depth = ext_depth(inode);
1870 int ret = 0;
1871
1763 /* probably there is space in leaf? */ 1872 /* probably there is space in leaf? */
1764 depth = ext_depth(inode);
1765 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 1873 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1766 < le16_to_cpu(path[depth].p_hdr->eh_max)) 1874 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1767 return 1;
1768 }
1769
1770 /*
1771 * given 32-bit logical block (4294967296 blocks), max. tree
1772 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1773 * Let's also add one more level for imbalance.
1774 */
1775 depth = 5;
1776 1875
1777 /* allocation of new data block(s) */ 1876 /*
1778 needed = 2; 1877 * There are some space in the leaf tree, no
1878 * need to account for leaf block credit
1879 *
1880 * bitmaps and block group descriptor blocks
1881 * and other metadat blocks still need to be
1882 * accounted.
1883 */
1884 /* 1 bitmap, 1 block group descriptor */
1885 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1886 }
1887 }
1779 1888
1780 /* 1889 return ext4_chunk_trans_blocks(inode, nrblocks);
1781 * tree can be full, so it would need to grow in depth: 1890}
1782 * we need one credit to modify old root, credits for
1783 * new root will be added in split accounting
1784 */
1785 needed += 1;
1786 1891
1787 /* 1892/*
1788 * Index split can happen, we would need: 1893 * How many index/leaf blocks need to change/allocate to modify nrblocks?
1789 * allocate intermediate indexes (bitmap + group) 1894 *
1790 * + change two blocks at each level, but root (already included) 1895 * if nrblocks are fit in a single extent (chunk flag is 1), then
1791 */ 1896 * in the worse case, each tree level index/leaf need to be changed
1792 needed += (depth * 2) + (depth * 2); 1897 * if the tree split due to insert a new extent, then the old tree
1898 * index/leaf need to be updated too
1899 *
1900 * If the nrblocks are discontiguous, they could cause
1901 * the whole tree split more than once, but this is really rare.
1902 */
1903int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1904{
1905 int index;
1906 int depth = ext_depth(inode);
1793 1907
1794 /* any allocation modifies superblock */ 1908 if (chunk)
1795 needed += 1; 1909 index = depth * 2;
1910 else
1911 index = depth * 3;
1796 1912
1797 return needed; 1913 return index;
1798} 1914}
1799 1915
1800static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1916static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
@@ -1921,9 +2037,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1921 correct_index = 1; 2037 correct_index = 1;
1922 credits += (ext_depth(inode)) + 1; 2038 credits += (ext_depth(inode)) + 1;
1923 } 2039 }
1924#ifdef CONFIG_QUOTA
1925 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2040 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1926#endif
1927 2041
1928 err = ext4_ext_journal_restart(handle, credits); 2042 err = ext4_ext_journal_restart(handle, credits);
1929 if (err) 2043 if (err)
@@ -2137,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
2137 */ 2251 */
2138 2252
2139 if (test_opt(sb, EXTENTS)) { 2253 if (test_opt(sb, EXTENTS)) {
2140 printk("EXT4-fs: file extents enabled"); 2254 printk(KERN_INFO "EXT4-fs: file extents enabled");
2141#ifdef AGGRESSIVE_TEST 2255#ifdef AGGRESSIVE_TEST
2142 printk(", aggressive tests"); 2256 printk(", aggressive tests");
2143#endif 2257#endif
@@ -2691,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2691 goto out2; 2805 goto out2;
2692 } 2806 }
2693 /* 2807 /*
2694 * Okay, we need to do block allocation. Lazily initialize the block 2808 * Okay, we need to do block allocation.
2695 * allocation info here if necessary.
2696 */ 2809 */
2697 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2698 ext4_init_block_alloc_info(inode);
2699 2810
2700 /* find neighbour allocated blocks */ 2811 /* find neighbour allocated blocks */
2701 ar.lleft = iblock; 2812 ar.lleft = iblock;
@@ -2755,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2755 /* free data blocks we just allocated */ 2866 /* free data blocks we just allocated */
2756 /* not a good idea to call discard here directly, 2867 /* not a good idea to call discard here directly,
2757 * but otherwise we'd need to call it every free() */ 2868 * but otherwise we'd need to call it every free() */
2758 ext4_mb_discard_inode_preallocations(inode); 2869 ext4_discard_preallocations(inode);
2759 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2870 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2760 ext4_ext_get_actual_len(&newex), 0); 2871 ext4_ext_get_actual_len(&newex), 0);
2761 goto out2; 2872 goto out2;
@@ -2805,7 +2916,7 @@ void ext4_ext_truncate(struct inode *inode)
2805 /* 2916 /*
2806 * probably first extent we're gonna free will be last in block 2917 * probably first extent we're gonna free will be last in block
2807 */ 2918 */
2808 err = ext4_writepage_trans_blocks(inode) + 3; 2919 err = ext4_writepage_trans_blocks(inode);
2809 handle = ext4_journal_start(inode, err); 2920 handle = ext4_journal_start(inode, err);
2810 if (IS_ERR(handle)) 2921 if (IS_ERR(handle))
2811 return; 2922 return;
@@ -2819,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
2819 down_write(&EXT4_I(inode)->i_data_sem); 2930 down_write(&EXT4_I(inode)->i_data_sem);
2820 ext4_ext_invalidate_cache(inode); 2931 ext4_ext_invalidate_cache(inode);
2821 2932
2822 ext4_mb_discard_inode_preallocations(inode); 2933 ext4_discard_preallocations(inode);
2823 2934
2824 /* 2935 /*
2825 * TODO: optimization is possible here. 2936 * TODO: optimization is possible here.
@@ -2858,27 +2969,6 @@ out_stop:
2858 ext4_journal_stop(handle); 2969 ext4_journal_stop(handle);
2859} 2970}
2860 2971
2861/*
2862 * ext4_ext_writepage_trans_blocks:
2863 * calculate max number of blocks we could modify
2864 * in order to allocate new block for an inode
2865 */
2866int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2867{
2868 int needed;
2869
2870 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2871
2872 /* caller wants to allocate num blocks, but note it includes sb */
2873 needed = needed * num - (num - 1);
2874
2875#ifdef CONFIG_QUOTA
2876 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2877#endif
2878
2879 return needed;
2880}
2881
2882static void ext4_falloc_update_inode(struct inode *inode, 2972static void ext4_falloc_update_inode(struct inode *inode,
2883 int mode, loff_t new_size, int update_ctime) 2973 int mode, loff_t new_size, int update_ctime)
2884{ 2974{
@@ -2893,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
2893 * Update only when preallocation was requested beyond 2983 * Update only when preallocation was requested beyond
2894 * the file size. 2984 * the file size.
2895 */ 2985 */
2896 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2986 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2897 new_size > i_size_read(inode)) { 2987 if (new_size > i_size_read(inode))
2898 i_size_write(inode, new_size); 2988 i_size_write(inode, new_size);
2899 EXT4_I(inode)->i_disksize = new_size; 2989 if (new_size > EXT4_I(inode)->i_disksize)
2990 ext4_update_i_disksize(inode, new_size);
2900 } 2991 }
2901 2992
2902} 2993}
@@ -2939,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2939 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3030 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2940 - block; 3031 - block;
2941 /* 3032 /*
2942 * credits to insert 1 extent into extent tree + buffers to be able to 3033 * credits to insert 1 extent into extent tree
2943 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2944 */ 3034 */
2945 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 3035 credits = ext4_chunk_trans_blocks(inode, max_blocks);
2946 mutex_lock(&inode->i_mutex); 3036 mutex_lock(&inode->i_mutex);
2947retry: 3037retry:
2948 while (ret >= 0 && ret < max_blocks) { 3038 while (ret >= 0 && ret < max_blocks) {
@@ -2989,3 +3079,143 @@ retry:
2989 mutex_unlock(&inode->i_mutex); 3079 mutex_unlock(&inode->i_mutex);
2990 return ret > 0 ? ret2 : ret; 3080 return ret > 0 ? ret2 : ret;
2991} 3081}
3082
3083/*
3084 * Callback function called for each extent to gather FIEMAP information.
3085 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data)
3089{
3090 struct fiemap_extent_info *fieinfo = data;
3091 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3092 __u64 logical;
3093 __u64 physical;
3094 __u64 length;
3095 __u32 flags = 0;
3096 int error;
3097
3098 logical = (__u64)newex->ec_block << blksize_bits;
3099
3100 if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3101 pgoff_t offset;
3102 struct page *page;
3103 struct buffer_head *bh = NULL;
3104
3105 offset = logical >> PAGE_SHIFT;
3106 page = find_get_page(inode->i_mapping, offset);
3107 if (!page || !page_has_buffers(page))
3108 return EXT_CONTINUE;
3109
3110 bh = page_buffers(page);
3111
3112 if (!bh)
3113 return EXT_CONTINUE;
3114
3115 if (buffer_delay(bh)) {
3116 flags |= FIEMAP_EXTENT_DELALLOC;
3117 page_cache_release(page);
3118 } else {
3119 page_cache_release(page);
3120 return EXT_CONTINUE;
3121 }
3122 }
3123
3124 physical = (__u64)newex->ec_start << blksize_bits;
3125 length = (__u64)newex->ec_len << blksize_bits;
3126
3127 if (ex && ext4_ext_is_uninitialized(ex))
3128 flags |= FIEMAP_EXTENT_UNWRITTEN;
3129
3130 /*
3131 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3132 *
3133 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3134 * this also indicates no more allocated blocks.
3135 *
3136 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3137 */
3138 if (logical + length - 1 == EXT_MAX_BLOCK ||
3139 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3140 flags |= FIEMAP_EXTENT_LAST;
3141
3142 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3143 length, flags);
3144 if (error < 0)
3145 return error;
3146 if (error == 1)
3147 return EXT_BREAK;
3148
3149 return EXT_CONTINUE;
3150}
3151
3152/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
3156{
3157 __u64 physical = 0;
3158 __u64 length;
3159 __u32 flags = FIEMAP_EXTENT_LAST;
3160 int blockbits = inode->i_sb->s_blocksize_bits;
3161 int error = 0;
3162
3163 /* in-inode? */
3164 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3165 struct ext4_iloc iloc;
3166 int offset; /* offset of xattr in inode */
3167
3168 error = ext4_get_inode_loc(inode, &iloc);
3169 if (error)
3170 return error;
3171 physical = iloc.bh->b_blocknr << blockbits;
3172 offset = EXT4_GOOD_OLD_INODE_SIZE +
3173 EXT4_I(inode)->i_extra_isize;
3174 physical += offset;
3175 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3176 flags |= FIEMAP_EXTENT_DATA_INLINE;
3177 } else { /* external block */
3178 physical = EXT4_I(inode)->i_file_acl << blockbits;
3179 length = inode->i_sb->s_blocksize;
3180 }
3181
3182 if (physical)
3183 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3184 length, flags);
3185 return (error < 0 ? error : 0);
3186}
3187
3188int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3189 __u64 start, __u64 len)
3190{
3191 ext4_lblk_t start_blk;
3192 ext4_lblk_t len_blks;
3193 int error = 0;
3194
3195 /* fallback to generic here if not in extents fmt */
3196 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3197 return generic_block_fiemap(inode, fieinfo, start, len,
3198 ext4_get_block);
3199
3200 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3201 return -EBADR;
3202
3203 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3204 error = ext4_xattr_fiemap(inode, fieinfo);
3205 } else {
3206 start_blk = start >> inode->i_sb->s_blocksize_bits;
3207 len_blks = len >> inode->i_sb->s_blocksize_bits;
3208
3209 /*
3210 * Walk the extent tree gathering extent information.
3211 * ext4_ext_fiemap_cb will push extents back to user.
3212 */
3213 down_write(&EXT4_I(inode)->i_data_sem);
3214 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3215 ext4_ext_fiemap_cb, fieinfo);
3216 up_write(&EXT4_I(inode)->i_data_sem);
3217 }
3218
3219 return error;
3220}
3221
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db4..6bd11fba71f7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
31 * from ext4_file_open: open gets called at every open, but release 31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed. 32 * gets called only when /all/ the files are closed.
33 */ 33 */
34static int ext4_release_file (struct inode * inode, struct file * filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 /* if we are the last writer on the inode, drop the block reservation */ 36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 down_write(&EXT4_I(inode)->i_data_sem); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_preallocations(inode);
42 up_write(&EXT4_I(inode)->i_data_sem); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
143const struct file_operations ext4_file_operations = { 146const struct file_operations ext4_file_operations = {
144 .llseek = generic_file_llseek, 147 .llseek = generic_file_llseek,
145 .read = do_sync_read, 148 .read = do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
162 .truncate = ext4_truncate, 165 .truncate = ext4_truncate,
163 .setattr = ext4_setattr, 166 .setattr = ext4_setattr,
164 .getattr = ext4_getattr, 167 .getattr = ext4_getattr,
165#ifdef CONFIG_EXT4DEV_FS_XATTR 168#ifdef CONFIG_EXT4_FS_XATTR
166 .setxattr = generic_setxattr, 169 .setxattr = generic_setxattr,
167 .getxattr = generic_getxattr, 170 .getxattr = generic_getxattr,
168 .listxattr = ext4_listxattr, 171 .listxattr = ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
170#endif 173#endif
171 .permission = ext4_permission, 174 .permission = ext4_permission,
172 .fallocate = ext4_fallocate, 175 .fallocate = ext4_fallocate,
176 .fiemap = ext4_fiemap,
173}; 177};
174 178
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad31..5afe4370840b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33 34
@@ -43,7 +44,7 @@
43 * inode to disk. 44 * inode to disk.
44 */ 45 */
45 46
46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 47int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
47{ 48{
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
51 52
52 J_ASSERT(ext4_journal_current_handle() == NULL); 53 J_ASSERT(ext4_journal_current_handle() == NULL);
53 54
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58
54 /* 59 /*
55 * data=writeback: 60 * data=writeback:
56 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe390..556ca8eba3db 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
27 sum += DELTA; 27 sum += DELTA;
28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
30 } while(--n); 30 } while (--n);
31 31
32 buf[0] += b0; 32 buf[0] += b0;
33 buf[1] += b1; 33 buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 while (len--) { 41 while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
59 val = pad; 59 val = pad;
60 if (len > num*4) 60 if (len > num*4)
61 len = num * 4; 61 len = num * 4;
62 for (i=0; i < len; i++) { 62 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 63 if ((i % 4) == 0)
64 val = pad; 64 val = pad;
65 val = msg[i] + (val << 8); 65 val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
104 104
105 /* Check to see if the seed is all zero's */ 105 /* Check to see if the seed is all zero's */
106 if (hinfo->seed) { 106 if (hinfo->seed) {
107 for (i=0; i < 4; i++) { 107 for (i = 0; i < 4; i++) {
108 if (hinfo->seed[i]) 108 if (hinfo->seed[i])
109 break; 109 break;
110 } 110 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 655e760212b8..fe34d74cfb19 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (bh_uptodate_or_lock(bh)) 118 if (buffer_uptodate(bh) &&
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
119 return bh; 120 return bh;
120 121
122 lock_buffer(bh);
121 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
122 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
123 ext4_init_inode_bitmap(sb, bh, block_group, desc); 125 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 * though), and then we'd have two inodes sharing the 156 * though), and then we'd have two inodes sharing the
155 * same inode number and space on the harddisk. 157 * same inode number and space on the harddisk.
156 */ 158 */
157void ext4_free_inode (handle_t *handle, struct inode * inode) 159void ext4_free_inode(handle_t *handle, struct inode *inode)
158{ 160{
159 struct super_block * sb = inode->i_sb; 161 struct super_block *sb = inode->i_sb;
160 int is_directory; 162 int is_directory;
161 unsigned long ino; 163 unsigned long ino;
162 struct buffer_head *bitmap_bh = NULL; 164 struct buffer_head *bitmap_bh = NULL;
163 struct buffer_head *bh2; 165 struct buffer_head *bh2;
164 ext4_group_t block_group; 166 ext4_group_t block_group;
165 unsigned long bit; 167 unsigned long bit;
166 struct ext4_group_desc * gdp; 168 struct ext4_group_desc *gdp;
167 struct ext4_super_block * es; 169 struct ext4_super_block *es;
168 struct ext4_sb_info *sbi; 170 struct ext4_sb_info *sbi;
169 int fatal = 0, err; 171 int fatal = 0, err;
170 ext4_group_t flex_group; 172 ext4_group_t flex_group;
171 173
172 if (atomic_read(&inode->i_count) > 1) { 174 if (atomic_read(&inode->i_count) > 1) {
173 printk ("ext4_free_inode: inode has count=%d\n", 175 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
174 atomic_read(&inode->i_count)); 176 atomic_read(&inode->i_count));
175 return; 177 return;
176 } 178 }
177 if (inode->i_nlink) { 179 if (inode->i_nlink) {
178 printk ("ext4_free_inode: inode has nlink=%d\n", 180 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
179 inode->i_nlink); 181 inode->i_nlink);
180 return; 182 return;
181 } 183 }
182 if (!sb) { 184 if (!sb) {
183 printk("ext4_free_inode: inode on nonexistent device\n"); 185 printk(KERN_ERR "ext4_free_inode: inode on "
186 "nonexistent device\n");
184 return; 187 return;
185 } 188 }
186 sbi = EXT4_SB(sb); 189 sbi = EXT4_SB(sb);
187 190
188 ino = inode->i_ino; 191 ino = inode->i_ino;
189 ext4_debug ("freeing inode %lu\n", ino); 192 ext4_debug("freeing inode %lu\n", ino);
190 193
191 /* 194 /*
192 * Note: we must free any quota before locking the superblock, 195 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
200 is_directory = S_ISDIR(inode->i_mode); 203 is_directory = S_ISDIR(inode->i_mode);
201 204
202 /* Do this BEFORE marking the inode not in use or returning an error */ 205 /* Do this BEFORE marking the inode not in use or returning an error */
203 clear_inode (inode); 206 clear_inode(inode);
204 207
205 es = EXT4_SB(sb)->s_es; 208 es = EXT4_SB(sb)->s_es;
206 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 209 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
207 ext4_error (sb, "ext4_free_inode", 210 ext4_error(sb, "ext4_free_inode",
208 "reserved or nonexistent inode %lu", ino); 211 "reserved or nonexistent inode %lu", ino);
209 goto error_return; 212 goto error_return;
210 } 213 }
211 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 214 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
222 /* Ok, now we can actually update the inode bitmaps.. */ 225 /* Ok, now we can actually update the inode bitmaps.. */
223 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 226 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
224 bit, bitmap_bh->b_data)) 227 bit, bitmap_bh->b_data))
225 ext4_error (sb, "ext4_free_inode", 228 ext4_error(sb, "ext4_free_inode",
226 "bit already cleared for inode %lu", ino); 229 "bit already cleared for inode %lu", ino);
227 else { 230 else {
228 gdp = ext4_get_group_desc (sb, block_group, &bh2); 231 gdp = ext4_get_group_desc(sb, block_group, &bh2);
229 232
230 BUFFER_TRACE(bh2, "get_write_access"); 233 BUFFER_TRACE(bh2, "get_write_access");
231 fatal = ext4_journal_get_write_access(handle, bh2); 234 fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
287 avefreei = freei / ngroups; 290 avefreei = freei / ngroups;
288 291
289 for (group = 0; group < ngroups; group++) { 292 for (group = 0; group < ngroups; group++) {
290 desc = ext4_get_group_desc (sb, group, NULL); 293 desc = ext4_get_group_desc(sb, group, NULL);
291 if (!desc || !desc->bg_free_inodes_count) 294 if (!desc || !desc->bg_free_inodes_count)
292 continue; 295 continue;
293 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -351,7 +354,7 @@ find_close_to_parent:
351 goto found_flexbg; 354 goto found_flexbg;
352 } 355 }
353 356
354 if (best_flex < 0 || 357 if (flex_group[best_flex].free_inodes == 0 ||
355 (flex_group[i].free_blocks > 358 (flex_group[i].free_blocks >
356 flex_group[best_flex].free_blocks && 359 flex_group[best_flex].free_blocks &&
357 flex_group[i].free_inodes)) 360 flex_group[i].free_inodes))
@@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
576 * For other inodes, search forward from the parent directory's block 579 * For other inodes, search forward from the parent directory's block
577 * group to find a free inode. 580 * group to find a free inode.
578 */ 581 */
579struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) 582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
580{ 583{
581 struct super_block *sb; 584 struct super_block *sb;
582 struct buffer_head *bitmap_bh = NULL; 585 struct buffer_head *bitmap_bh = NULL;
583 struct buffer_head *bh2; 586 struct buffer_head *bh2;
584 ext4_group_t group = 0; 587 ext4_group_t group = 0;
585 unsigned long ino = 0; 588 unsigned long ino = 0;
586 struct inode * inode; 589 struct inode *inode;
587 struct ext4_group_desc * gdp = NULL; 590 struct ext4_group_desc *gdp = NULL;
588 struct ext4_super_block * es; 591 struct ext4_super_block *es;
589 struct ext4_inode_info *ei; 592 struct ext4_inode_info *ei;
590 struct ext4_sb_info *sbi; 593 struct ext4_sb_info *sbi;
591 int ret2, err = 0; 594 int ret2, err = 0;
@@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
613 } 616 }
614 617
615 if (S_ISDIR(mode)) { 618 if (S_ISDIR(mode)) {
616 if (test_opt (sb, OLDALLOC)) 619 if (test_opt(sb, OLDALLOC))
617 ret2 = find_group_dir(sb, dir, &group); 620 ret2 = find_group_dir(sb, dir, &group);
618 else 621 else
619 ret2 = find_group_orlov(sb, dir, &group); 622 ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@ got:
783 } 786 }
784 787
785 inode->i_uid = current->fsuid; 788 inode->i_uid = current->fsuid;
786 if (test_opt (sb, GRPID)) 789 if (test_opt(sb, GRPID))
787 inode->i_gid = dir->i_gid; 790 inode->i_gid = dir->i_gid;
788 else if (dir->i_mode & S_ISGID) { 791 else if (dir->i_mode & S_ISGID) {
789 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@ got:
816 ei->i_flags &= ~EXT4_DIRSYNC_FL; 819 ei->i_flags &= ~EXT4_DIRSYNC_FL;
817 ei->i_file_acl = 0; 820 ei->i_file_acl = 0;
818 ei->i_dtime = 0; 821 ei->i_dtime = 0;
819 ei->i_block_alloc_info = NULL;
820 ei->i_block_group = group; 822 ei->i_block_group = group;
821 823
822 ext4_set_inode_flags(inode); 824 ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@ got:
832 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 834 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
833 835
834 ret = inode; 836 ret = inode;
835 if(DQUOT_ALLOC_INODE(inode)) { 837 if (DQUOT_ALLOC_INODE(inode)) {
836 err = -EDQUOT; 838 err = -EDQUOT;
837 goto fail_drop; 839 goto fail_drop;
838 } 840 }
@@ -841,7 +843,7 @@ got:
841 if (err) 843 if (err)
842 goto fail_free_drop; 844 goto fail_free_drop;
843 845
844 err = ext4_init_security(handle,inode, dir); 846 err = ext4_init_security(handle, inode, dir);
845 if (err) 847 if (err)
846 goto fail_free_drop; 848 goto fail_free_drop;
847 849
@@ -959,7 +961,7 @@ error:
959 return ERR_PTR(err); 961 return ERR_PTR(err);
960} 962}
961 963
962unsigned long ext4_count_free_inodes (struct super_block * sb) 964unsigned long ext4_count_free_inodes(struct super_block *sb)
963{ 965{
964 unsigned long desc_count; 966 unsigned long desc_count;
965 struct ext4_group_desc *gdp; 967 struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
974 bitmap_count = 0; 976 bitmap_count = 0;
975 gdp = NULL; 977 gdp = NULL;
976 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 978 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
977 gdp = ext4_get_group_desc (sb, i, NULL); 979 gdp = ext4_get_group_desc(sb, i, NULL);
978 if (!gdp) 980 if (!gdp)
979 continue; 981 continue;
980 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 982 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
989 bitmap_count += x; 991 bitmap_count += x;
990 } 992 }
991 brelse(bitmap_bh); 993 brelse(bitmap_bh);
992 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", 994 printk(KERN_DEBUG "ext4_count_free_inodes: "
993 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); 995 "stored = %u, computed = %lu, %lu\n",
996 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
994 return desc_count; 997 return desc_count;
995#else 998#else
996 desc_count = 0; 999 desc_count = 0;
997 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1000 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
998 gdp = ext4_get_group_desc (sb, i, NULL); 1001 gdp = ext4_get_group_desc(sb, i, NULL);
999 if (!gdp) 1002 if (!gdp)
1000 continue; 1003 continue;
1001 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1004 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
1006} 1009}
1007 1010
1008/* Called at mount-time, super-block is locked */ 1011/* Called at mount-time, super-block is locked */
1009unsigned long ext4_count_dirs (struct super_block * sb) 1012unsigned long ext4_count_dirs(struct super_block * sb)
1010{ 1013{
1011 unsigned long count = 0; 1014 unsigned long count = 0;
1012 ext4_group_t i; 1015 ext4_group_t i;
1013 1016
1014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1017 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
1015 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 1018 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1016 if (!gdp) 1019 if (!gdp)
1017 continue; 1020 continue;
1018 count += le16_to_cpu(gdp->bg_used_dirs_count); 1021 count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59fbbe899acc..8dbf6953845b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
41#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h" 42#include "ext4_extents.h"
43 43
44#define MPAGE_DA_EXTENT_TAIL 0x01
45
44static inline int ext4_begin_ordered_truncate(struct inode *inode, 46static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size) 47 loff_t new_size)
46{ 48{
@@ -188,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
188/* 190/*
189 * Called at the last iput() if i_nlink is zero. 191 * Called at the last iput() if i_nlink is zero.
190 */ 192 */
191void ext4_delete_inode (struct inode * inode) 193void ext4_delete_inode(struct inode *inode)
192{ 194{
193 handle_t *handle; 195 handle_t *handle;
194 int err; 196 int err;
@@ -328,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
328 int final = 0; 330 int final = 0;
329 331
330 if (i_block < 0) { 332 if (i_block < 0) {
331 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 333 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
332 } else if (i_block < direct_blocks) { 334 } else if (i_block < direct_blocks) {
333 offsets[n++] = i_block; 335 offsets[n++] = i_block;
334 final = direct_blocks; 336 final = direct_blocks;
335 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 337 } else if ((i_block -= direct_blocks) < indirect_blocks) {
336 offsets[n++] = EXT4_IND_BLOCK; 338 offsets[n++] = EXT4_IND_BLOCK;
337 offsets[n++] = i_block; 339 offsets[n++] = i_block;
338 final = ptrs; 340 final = ptrs;
@@ -398,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
398 400
399 *err = 0; 401 *err = 0;
400 /* i_data is not going away, no lock needed */ 402 /* i_data is not going away, no lock needed */
401 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 403 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
402 if (!p->key) 404 if (!p->key)
403 goto no_block; 405 goto no_block;
404 while (--depth) { 406 while (--depth) {
405 bh = sb_bread(sb, le32_to_cpu(p->key)); 407 bh = sb_bread(sb, le32_to_cpu(p->key));
406 if (!bh) 408 if (!bh)
407 goto failure; 409 goto failure;
408 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 410 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
409 /* Reader: end */ 411 /* Reader: end */
410 if (!p->key) 412 if (!p->key)
411 goto no_block; 413 goto no_block;
@@ -441,7 +443,7 @@ no_block:
441static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
442{ 444{
443 struct ext4_inode_info *ei = EXT4_I(inode); 445 struct ext4_inode_info *ei = EXT4_I(inode);
444 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 446 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
445 __le32 *p; 447 __le32 *p;
446 ext4_fsblk_t bg_start; 448 ext4_fsblk_t bg_start;
447 ext4_fsblk_t last_block; 449 ext4_fsblk_t last_block;
@@ -484,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
484static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
485 Indirect *partial) 487 Indirect *partial)
486{ 488{
487 struct ext4_block_alloc_info *block_i;
488
489 block_i = EXT4_I(inode)->i_block_alloc_info;
490
491 /* 489 /*
492 * try the heuristic for sequential allocation, 490 * XXX need to get goal block from mballoc's data structures
493 * failing that at least try to get decent locality.
494 */ 491 */
495 if (block_i && (block == block_i->last_alloc_logical_block + 1)
496 && (block_i->last_alloc_physical_block != 0)) {
497 return block_i->last_alloc_physical_block + 1;
498 }
499 492
500 return ext4_find_near(inode, partial); 493 return ext4_find_near(inode, partial);
501} 494}
@@ -628,7 +621,7 @@ allocated:
628 *err = 0; 621 *err = 0;
629 return ret; 622 return ret;
630failed_out: 623failed_out:
631 for (i = 0; i <index; i++) 624 for (i = 0; i < index; i++)
632 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 625 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
633 return ret; 626 return ret;
634} 627}
@@ -701,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
701 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 694 branch[n].p = (__le32 *) bh->b_data + offsets[n];
702 branch[n].key = cpu_to_le32(new_blocks[n]); 695 branch[n].key = cpu_to_le32(new_blocks[n]);
703 *branch[n].p = branch[n].key; 696 *branch[n].p = branch[n].key;
704 if ( n == indirect_blks) { 697 if (n == indirect_blks) {
705 current_block = new_blocks[n]; 698 current_block = new_blocks[n];
706 /* 699 /*
707 * End of chain, update the last new metablock of 700 * End of chain, update the last new metablock of
@@ -728,7 +721,7 @@ failed:
728 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 721 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
729 ext4_journal_forget(handle, branch[i].bh); 722 ext4_journal_forget(handle, branch[i].bh);
730 } 723 }
731 for (i = 0; i <indirect_blks; i++) 724 for (i = 0; i < indirect_blks; i++)
732 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 725 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
733 726
734 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 727 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -755,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
755{ 748{
756 int i; 749 int i;
757 int err = 0; 750 int err = 0;
758 struct ext4_block_alloc_info *block_i;
759 ext4_fsblk_t current_block; 751 ext4_fsblk_t current_block;
760 752
761 block_i = EXT4_I(inode)->i_block_alloc_info;
762 /* 753 /*
763 * If we're splicing into a [td]indirect block (as opposed to the 754 * If we're splicing into a [td]indirect block (as opposed to the
764 * inode) then we need to get write access to the [td]indirect block 755 * inode) then we need to get write access to the [td]indirect block
@@ -781,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
781 if (num == 0 && blks > 1) { 772 if (num == 0 && blks > 1) {
782 current_block = le32_to_cpu(where->key) + 1; 773 current_block = le32_to_cpu(where->key) + 1;
783 for (i = 1; i < blks; i++) 774 for (i = 1; i < blks; i++)
784 *(where->p + i ) = cpu_to_le32(current_block++); 775 *(where->p + i) = cpu_to_le32(current_block++);
785 }
786
787 /*
788 * update the most recently allocated logical & physical block
789 * in i_block_alloc_info, to assist find the proper goal block for next
790 * allocation
791 */
792 if (block_i) {
793 block_i->last_alloc_logical_block = block + blks - 1;
794 block_i->last_alloc_physical_block =
795 le32_to_cpu(where[num].key) + blks - 1;
796 } 776 }
797 777
798 /* We are done with atomic stuff, now do the rest of housekeeping */ 778 /* We are done with atomic stuff, now do the rest of housekeeping */
@@ -912,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
912 goto cleanup; 892 goto cleanup;
913 893
914 /* 894 /*
915 * Okay, we need to do block allocation. Lazily initialize the block 895 * Okay, we need to do block allocation.
916 * allocation info here if necessary
917 */ 896 */
918 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
919 ext4_init_block_alloc_info(inode);
920
921 goal = ext4_find_goal(inode, iblock, partial); 897 goal = ext4_find_goal(inode, iblock, partial);
922 898
923 /* the number of blocks need to allocate for [d,t]indirect blocks */ 899 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1005,6 +981,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1005 */ 981 */
1006static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 982static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1007{ 983{
984 if (!blocks)
985 return 0;
986
1008 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 987 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1009 return ext4_ext_calc_metadata_amount(inode, blocks); 988 return ext4_ext_calc_metadata_amount(inode, blocks);
1010 989
@@ -1025,34 +1004,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1025 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1004 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1026 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1005 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1027 1006
1028 /* Account for allocated meta_blocks */ 1007 if (mdb_free) {
1029 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1008 /* Account for allocated meta_blocks */
1009 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1030 1010
1031 /* update fs free blocks counter for truncate case */ 1011 /* update fs dirty blocks counter */
1032 percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); 1012 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1013 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1014 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1015 }
1033 1016
1034 /* update per-inode reservations */ 1017 /* update per-inode reservations */
1035 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1018 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1036 EXT4_I(inode)->i_reserved_data_blocks -= used; 1019 EXT4_I(inode)->i_reserved_data_blocks -= used;
1037 1020
1038 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1039 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1040 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1041 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1021 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1042} 1022}
1043 1023
1044/* Maximum number of blocks we map for direct IO at once. */
1045#define DIO_MAX_BLOCKS 4096
1046/*
1047 * Number of credits we need for writing DIO_MAX_BLOCKS:
1048 * We need sb + group descriptor + bitmap + inode -> 4
1049 * For B blocks with A block pointers per block we need:
1050 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
1051 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
1052 */
1053#define DIO_CREDITS 25
1054
1055
1056/* 1024/*
1057 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1025 * The ext4_get_blocks_wrap() function try to look up the requested blocks,
1058 * and returns if the blocks are already mapped. 1026 * and returns if the blocks are already mapped.
@@ -1164,19 +1132,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1164 return retval; 1132 return retval;
1165} 1133}
1166 1134
1167static int ext4_get_block(struct inode *inode, sector_t iblock, 1135/* Maximum number of blocks we map for direct IO at once. */
1168 struct buffer_head *bh_result, int create) 1136#define DIO_MAX_BLOCKS 4096
1137
1138int ext4_get_block(struct inode *inode, sector_t iblock,
1139 struct buffer_head *bh_result, int create)
1169{ 1140{
1170 handle_t *handle = ext4_journal_current_handle(); 1141 handle_t *handle = ext4_journal_current_handle();
1171 int ret = 0, started = 0; 1142 int ret = 0, started = 0;
1172 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1143 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1144 int dio_credits;
1173 1145
1174 if (create && !handle) { 1146 if (create && !handle) {
1175 /* Direct IO write... */ 1147 /* Direct IO write... */
1176 if (max_blocks > DIO_MAX_BLOCKS) 1148 if (max_blocks > DIO_MAX_BLOCKS)
1177 max_blocks = DIO_MAX_BLOCKS; 1149 max_blocks = DIO_MAX_BLOCKS;
1178 handle = ext4_journal_start(inode, DIO_CREDITS + 1150 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1179 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 1151 handle = ext4_journal_start(inode, dio_credits);
1180 if (IS_ERR(handle)) { 1152 if (IS_ERR(handle)) {
1181 ret = PTR_ERR(handle); 1153 ret = PTR_ERR(handle);
1182 goto out; 1154 goto out;
@@ -1244,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1244 BUFFER_TRACE(bh, "call get_create_access"); 1216 BUFFER_TRACE(bh, "call get_create_access");
1245 fatal = ext4_journal_get_create_access(handle, bh); 1217 fatal = ext4_journal_get_create_access(handle, bh);
1246 if (!fatal && !buffer_uptodate(bh)) { 1218 if (!fatal && !buffer_uptodate(bh)) {
1247 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1219 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1248 set_buffer_uptodate(bh); 1220 set_buffer_uptodate(bh);
1249 } 1221 }
1250 unlock_buffer(bh); 1222 unlock_buffer(bh);
@@ -1269,7 +1241,7 @@ err:
1269struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1241struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1270 ext4_lblk_t block, int create, int *err) 1242 ext4_lblk_t block, int create, int *err)
1271{ 1243{
1272 struct buffer_head * bh; 1244 struct buffer_head *bh;
1273 1245
1274 bh = ext4_getblk(handle, inode, block, create, err); 1246 bh = ext4_getblk(handle, inode, block, create, err);
1275 if (!bh) 1247 if (!bh)
@@ -1285,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1285 return NULL; 1257 return NULL;
1286} 1258}
1287 1259
1288static int walk_page_buffers( handle_t *handle, 1260static int walk_page_buffers(handle_t *handle,
1289 struct buffer_head *head, 1261 struct buffer_head *head,
1290 unsigned from, 1262 unsigned from,
1291 unsigned to, 1263 unsigned to,
1292 int *partial, 1264 int *partial,
1293 int (*fn)( handle_t *handle, 1265 int (*fn)(handle_t *handle,
1294 struct buffer_head *bh)) 1266 struct buffer_head *bh))
1295{ 1267{
1296 struct buffer_head *bh; 1268 struct buffer_head *bh;
1297 unsigned block_start, block_end; 1269 unsigned block_start, block_end;
@@ -1299,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
1299 int err, ret = 0; 1271 int err, ret = 0;
1300 struct buffer_head *next; 1272 struct buffer_head *next;
1301 1273
1302 for ( bh = head, block_start = 0; 1274 for (bh = head, block_start = 0;
1303 ret == 0 && (bh != head || !block_start); 1275 ret == 0 && (bh != head || !block_start);
1304 block_start = block_end, bh = next) 1276 block_start = block_end, bh = next)
1305 { 1277 {
1306 next = bh->b_this_page; 1278 next = bh->b_this_page;
1307 block_end = block_start + blocksize; 1279 block_end = block_start + blocksize;
@@ -1354,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1354 loff_t pos, unsigned len, unsigned flags, 1326 loff_t pos, unsigned len, unsigned flags,
1355 struct page **pagep, void **fsdata) 1327 struct page **pagep, void **fsdata)
1356{ 1328{
1357 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1358 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1330 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1359 handle_t *handle; 1331 handle_t *handle;
1360 int retries = 0; 1332 int retries = 0;
1361 struct page *page; 1333 struct page *page;
1362 pgoff_t index; 1334 pgoff_t index;
1363 unsigned from, to; 1335 unsigned from, to;
1364 1336
1365 index = pos >> PAGE_CACHE_SHIFT; 1337 index = pos >> PAGE_CACHE_SHIFT;
1366 from = pos & (PAGE_CACHE_SIZE - 1); 1338 from = pos & (PAGE_CACHE_SIZE - 1);
1367 to = from + len; 1339 to = from + len;
1368 1340
1369retry: 1341retry:
1370 handle = ext4_journal_start(inode, needed_blocks); 1342 handle = ext4_journal_start(inode, needed_blocks);
1371 if (IS_ERR(handle)) { 1343 if (IS_ERR(handle)) {
1372 ret = PTR_ERR(handle); 1344 ret = PTR_ERR(handle);
1373 goto out; 1345 goto out;
1374 } 1346 }
1375 1347
1376 page = __grab_cache_page(mapping, index); 1348 page = __grab_cache_page(mapping, index);
@@ -1390,9 +1362,16 @@ retry:
1390 } 1362 }
1391 1363
1392 if (ret) { 1364 if (ret) {
1393 unlock_page(page); 1365 unlock_page(page);
1394 ext4_journal_stop(handle); 1366 ext4_journal_stop(handle);
1395 page_cache_release(page); 1367 page_cache_release(page);
1368 /*
1369 * block_write_begin may have instantiated a few blocks
1370 * outside i_size. Trim these off again. Don't need
1371 * i_size_read because we hold i_mutex.
1372 */
1373 if (pos + len > inode->i_size)
1374 vmtruncate(inode, inode->i_size);
1396 } 1375 }
1397 1376
1398 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1429,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file,
1429 ret = ext4_jbd2_file_inode(handle, inode); 1408 ret = ext4_jbd2_file_inode(handle, inode);
1430 1409
1431 if (ret == 0) { 1410 if (ret == 0) {
1432 /*
1433 * generic_write_end() will run mark_inode_dirty() if i_size
1434 * changes. So let's piggyback the i_disksize mark_inode_dirty
1435 * into that.
1436 */
1437 loff_t new_i_size; 1411 loff_t new_i_size;
1438 1412
1439 new_i_size = pos + copied; 1413 new_i_size = pos + copied;
1440 if (new_i_size > EXT4_I(inode)->i_disksize) 1414 if (new_i_size > EXT4_I(inode)->i_disksize) {
1441 EXT4_I(inode)->i_disksize = new_i_size; 1415 ext4_update_i_disksize(inode, new_i_size);
1416 /* We need to mark inode dirty even if
1417 * new_i_size is less that inode->i_size
1418 * bu greater than i_disksize.(hint delalloc)
1419 */
1420 ext4_mark_inode_dirty(handle, inode);
1421 }
1422
1442 ret2 = generic_write_end(file, mapping, pos, len, copied, 1423 ret2 = generic_write_end(file, mapping, pos, len, copied,
1443 page, fsdata); 1424 page, fsdata);
1444 copied = ret2; 1425 copied = ret2;
@@ -1463,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
1463 loff_t new_i_size; 1444 loff_t new_i_size;
1464 1445
1465 new_i_size = pos + copied; 1446 new_i_size = pos + copied;
1466 if (new_i_size > EXT4_I(inode)->i_disksize) 1447 if (new_i_size > EXT4_I(inode)->i_disksize) {
1467 EXT4_I(inode)->i_disksize = new_i_size; 1448 ext4_update_i_disksize(inode, new_i_size);
1449 /* We need to mark inode dirty even if
1450 * new_i_size is less that inode->i_size
1451 * bu greater than i_disksize.(hint delalloc)
1452 */
1453 ext4_mark_inode_dirty(handle, inode);
1454 }
1468 1455
1469 ret2 = generic_write_end(file, mapping, pos, len, copied, 1456 ret2 = generic_write_end(file, mapping, pos, len, copied,
1470 page, fsdata); 1457 page, fsdata);
@@ -1489,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
1489 int ret = 0, ret2; 1476 int ret = 0, ret2;
1490 int partial = 0; 1477 int partial = 0;
1491 unsigned from, to; 1478 unsigned from, to;
1479 loff_t new_i_size;
1492 1480
1493 from = pos & (PAGE_CACHE_SIZE - 1); 1481 from = pos & (PAGE_CACHE_SIZE - 1);
1494 to = from + len; 1482 to = from + len;
@@ -1503,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
1503 to, &partial, write_end_fn); 1491 to, &partial, write_end_fn);
1504 if (!partial) 1492 if (!partial)
1505 SetPageUptodate(page); 1493 SetPageUptodate(page);
1506 if (pos+copied > inode->i_size) 1494 new_i_size = pos + copied;
1495 if (new_i_size > inode->i_size)
1507 i_size_write(inode, pos+copied); 1496 i_size_write(inode, pos+copied);
1508 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1497 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1509 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1498 if (new_i_size > EXT4_I(inode)->i_disksize) {
1510 EXT4_I(inode)->i_disksize = inode->i_size; 1499 ext4_update_i_disksize(inode, new_i_size);
1511 ret2 = ext4_mark_inode_dirty(handle, inode); 1500 ret2 = ext4_mark_inode_dirty(handle, inode);
1512 if (!ret) 1501 if (!ret)
1513 ret = ret2; 1502 ret = ret2;
@@ -1524,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file,
1524 1513
1525static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1514static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1526{ 1515{
1516 int retries = 0;
1527 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1528 unsigned long md_needed, mdblocks, total = 0; 1518 unsigned long md_needed, mdblocks, total = 0;
1529 1519
@@ -1532,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1532 * in order to allocate nrblocks 1522 * in order to allocate nrblocks
1533 * worse case is one extent per block 1523 * worse case is one extent per block
1534 */ 1524 */
1525repeat:
1535 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1526 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1536 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1527 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1537 mdblocks = ext4_calc_metadata_amount(inode, total); 1528 mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1540,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1540 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1531 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1541 total = md_needed + nrblocks; 1532 total = md_needed + nrblocks;
1542 1533
1543 if (ext4_has_free_blocks(sbi, total) < total) { 1534 if (ext4_claim_free_blocks(sbi, total)) {
1544 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1535 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1536 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1537 yield();
1538 goto repeat;
1539 }
1545 return -ENOSPC; 1540 return -ENOSPC;
1546 } 1541 }
1547 /* reduce fs free blocks counter */
1548 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1549
1550 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1542 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1551 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1543 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1552 1544
@@ -1559,7 +1551,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1559 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1551 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1560 int total, mdb, mdb_free, release; 1552 int total, mdb, mdb_free, release;
1561 1553
1554 if (!to_free)
1555 return; /* Nothing to release, exit */
1556
1562 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1557 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1558
1559 if (!EXT4_I(inode)->i_reserved_data_blocks) {
1560 /*
1561 * if there is no reserved blocks, but we try to free some
1562 * then the counter is messed up somewhere.
1563 * but since this function is called from invalidate
1564 * page, it's harmless to return without any action
1565 */
1566 printk(KERN_INFO "ext4 delalloc try to release %d reserved "
1567 "blocks for inode %lu, but there is no reserved "
1568 "data blocks\n", to_free, inode->i_ino);
1569 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1570 return;
1571 }
1572
1563 /* recalculate the number of metablocks still need to be reserved */ 1573 /* recalculate the number of metablocks still need to be reserved */
1564 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1574 total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
1565 mdb = ext4_calc_metadata_amount(inode, total); 1575 mdb = ext4_calc_metadata_amount(inode, total);
@@ -1570,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1570 1580
1571 release = to_free + mdb_free; 1581 release = to_free + mdb_free;
1572 1582
1573 /* update fs free blocks counter for truncate case */ 1583 /* update fs dirty blocks counter for truncate case */
1574 percpu_counter_add(&sbi->s_freeblocks_counter, release); 1584 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1575 1585
1576 /* update per-inode reservations */ 1586 /* update per-inode reservations */
1577 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1587 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1613,11 +1623,14 @@ struct mpage_da_data {
1613 unsigned long first_page, next_page; /* extent of pages */ 1623 unsigned long first_page, next_page; /* extent of pages */
1614 get_block_t *get_block; 1624 get_block_t *get_block;
1615 struct writeback_control *wbc; 1625 struct writeback_control *wbc;
1626 int io_done;
1627 long pages_written;
1628 int retval;
1616}; 1629};
1617 1630
1618/* 1631/*
1619 * mpage_da_submit_io - walks through extent of pages and try to write 1632 * mpage_da_submit_io - walks through extent of pages and try to write
1620 * them with __mpage_writepage() 1633 * them with writepage() call back
1621 * 1634 *
1622 * @mpd->inode: inode 1635 * @mpd->inode: inode
1623 * @mpd->first_page: first page of the extent 1636 * @mpd->first_page: first page of the extent
@@ -1632,37 +1645,42 @@ struct mpage_da_data {
1632static int mpage_da_submit_io(struct mpage_da_data *mpd) 1645static int mpage_da_submit_io(struct mpage_da_data *mpd)
1633{ 1646{
1634 struct address_space *mapping = mpd->inode->i_mapping; 1647 struct address_space *mapping = mpd->inode->i_mapping;
1635 struct mpage_data mpd_pp = {
1636 .bio = NULL,
1637 .last_block_in_bio = 0,
1638 .get_block = mpd->get_block,
1639 .use_writepage = 1,
1640 };
1641 int ret = 0, err, nr_pages, i; 1648 int ret = 0, err, nr_pages, i;
1642 unsigned long index, end; 1649 unsigned long index, end;
1643 struct pagevec pvec; 1650 struct pagevec pvec;
1651 long pages_skipped;
1644 1652
1645 BUG_ON(mpd->next_page <= mpd->first_page); 1653 BUG_ON(mpd->next_page <= mpd->first_page);
1646
1647 pagevec_init(&pvec, 0); 1654 pagevec_init(&pvec, 0);
1648 index = mpd->first_page; 1655 index = mpd->first_page;
1649 end = mpd->next_page - 1; 1656 end = mpd->next_page - 1;
1650 1657
1651 while (index <= end) { 1658 while (index <= end) {
1652 /* XXX: optimize tail */ 1659 /*
1653 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1660 * We can use PAGECACHE_TAG_DIRTY lookup here because
1661 * even though we have cleared the dirty flag on the page
1662 * We still keep the page in the radix tree with tag
1663 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1664 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1665 * which is called via the below writepage callback.
1666 */
1667 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1668 PAGECACHE_TAG_DIRTY,
1669 min(end - index,
1670 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1654 if (nr_pages == 0) 1671 if (nr_pages == 0)
1655 break; 1672 break;
1656 for (i = 0; i < nr_pages; i++) { 1673 for (i = 0; i < nr_pages; i++) {
1657 struct page *page = pvec.pages[i]; 1674 struct page *page = pvec.pages[i];
1658 1675
1659 index = page->index; 1676 pages_skipped = mpd->wbc->pages_skipped;
1660 if (index > end) 1677 err = mapping->a_ops->writepage(page, mpd->wbc);
1661 break; 1678 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
1662 index++; 1679 /*
1663 1680 * have successfully written the page
1664 err = __mpage_writepage(page, mpd->wbc, &mpd_pp); 1681 * without skipping the same
1665 1682 */
1683 mpd->pages_written++;
1666 /* 1684 /*
1667 * In error case, we have to continue because 1685 * In error case, we have to continue because
1668 * remaining pages are still locked 1686 * remaining pages are still locked
@@ -1673,9 +1691,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1673 } 1691 }
1674 pagevec_release(&pvec); 1692 pagevec_release(&pvec);
1675 } 1693 }
1676 if (mpd_pp.bio)
1677 mpage_bio_submit(WRITE, mpd_pp.bio);
1678
1679 return ret; 1694 return ret;
1680} 1695}
1681 1696
@@ -1698,7 +1713,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1698 int blocks = exbh->b_size >> inode->i_blkbits; 1713 int blocks = exbh->b_size >> inode->i_blkbits;
1699 sector_t pblock = exbh->b_blocknr, cur_logical; 1714 sector_t pblock = exbh->b_blocknr, cur_logical;
1700 struct buffer_head *head, *bh; 1715 struct buffer_head *head, *bh;
1701 unsigned long index, end; 1716 pgoff_t index, end;
1702 struct pagevec pvec; 1717 struct pagevec pvec;
1703 int nr_pages, i; 1718 int nr_pages, i;
1704 1719
@@ -1741,6 +1756,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1741 if (buffer_delay(bh)) { 1756 if (buffer_delay(bh)) {
1742 bh->b_blocknr = pblock; 1757 bh->b_blocknr = pblock;
1743 clear_buffer_delay(bh); 1758 clear_buffer_delay(bh);
1759 bh->b_bdev = inode->i_sb->s_bdev;
1760 } else if (buffer_unwritten(bh)) {
1761 bh->b_blocknr = pblock;
1762 clear_buffer_unwritten(bh);
1763 set_buffer_mapped(bh);
1764 set_buffer_new(bh);
1765 bh->b_bdev = inode->i_sb->s_bdev;
1744 } else if (buffer_mapped(bh)) 1766 } else if (buffer_mapped(bh))
1745 BUG_ON(bh->b_blocknr != pblock); 1767 BUG_ON(bh->b_blocknr != pblock);
1746 1768
@@ -1768,6 +1790,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1768 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1790 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1769} 1791}
1770 1792
1793static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1794 sector_t logical, long blk_cnt)
1795{
1796 int nr_pages, i;
1797 pgoff_t index, end;
1798 struct pagevec pvec;
1799 struct inode *inode = mpd->inode;
1800 struct address_space *mapping = inode->i_mapping;
1801
1802 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1803 end = (logical + blk_cnt - 1) >>
1804 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1805 while (index <= end) {
1806 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1807 if (nr_pages == 0)
1808 break;
1809 for (i = 0; i < nr_pages; i++) {
1810 struct page *page = pvec.pages[i];
1811 index = page->index;
1812 if (index > end)
1813 break;
1814 index++;
1815
1816 BUG_ON(!PageLocked(page));
1817 BUG_ON(PageWriteback(page));
1818 block_invalidatepage(page, 0);
1819 ClearPageUptodate(page);
1820 unlock_page(page);
1821 }
1822 }
1823 return;
1824}
1825
1826static void ext4_print_free_blocks(struct inode *inode)
1827{
1828 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1829 printk(KERN_EMERG "Total free blocks count %lld\n",
1830 ext4_count_free_blocks(inode->i_sb));
1831 printk(KERN_EMERG "Free/Dirty block details\n");
1832 printk(KERN_EMERG "free_blocks=%lld\n",
1833 percpu_counter_sum(&sbi->s_freeblocks_counter));
1834 printk(KERN_EMERG "dirty_blocks=%lld\n",
1835 percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1836 printk(KERN_EMERG "Block reservation details\n");
1837 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
1838 EXT4_I(inode)->i_reserved_data_blocks);
1839 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
1840 EXT4_I(inode)->i_reserved_meta_blocks);
1841 return;
1842}
1843
1771/* 1844/*
1772 * mpage_da_map_blocks - go through given space 1845 * mpage_da_map_blocks - go through given space
1773 * 1846 *
@@ -1776,54 +1849,87 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1776 * 1849 *
1777 * The function skips space we know is already mapped to disk blocks. 1850 * The function skips space we know is already mapped to disk blocks.
1778 * 1851 *
1779 * The function ignores errors ->get_block() returns, thus real
1780 * error handling is postponed to __mpage_writepage()
1781 */ 1852 */
1782static void mpage_da_map_blocks(struct mpage_da_data *mpd) 1853static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1783{ 1854{
1784 struct buffer_head *lbh = &mpd->lbh; 1855 int err = 0;
1785 int err = 0, remain = lbh->b_size;
1786 sector_t next = lbh->b_blocknr;
1787 struct buffer_head new; 1856 struct buffer_head new;
1857 struct buffer_head *lbh = &mpd->lbh;
1858 sector_t next;
1788 1859
1789 /* 1860 /*
1790 * We consider only non-mapped and non-allocated blocks 1861 * We consider only non-mapped and non-allocated blocks
1791 */ 1862 */
1792 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1863 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1793 return; 1864 return 0;
1865 new.b_state = lbh->b_state;
1866 new.b_blocknr = 0;
1867 new.b_size = lbh->b_size;
1868 next = lbh->b_blocknr;
1869 /*
1870 * If we didn't accumulate anything
1871 * to write simply return
1872 */
1873 if (!new.b_size)
1874 return 0;
1875 err = mpd->get_block(mpd->inode, next, &new, 1);
1876 if (err) {
1794 1877
1795 while (remain) { 1878 /* If get block returns with error
1796 new.b_state = lbh->b_state; 1879 * we simply return. Later writepage
1797 new.b_blocknr = 0; 1880 * will redirty the page and writepages
1798 new.b_size = remain; 1881 * will find the dirty page again
1799 err = mpd->get_block(mpd->inode, next, &new, 1); 1882 */
1800 if (err) { 1883 if (err == -EAGAIN)
1801 /* 1884 return 0;
1802 * Rather than implement own error handling
1803 * here, we just leave remaining blocks
1804 * unallocated and try again with ->writepage()
1805 */
1806 break;
1807 }
1808 BUG_ON(new.b_size == 0);
1809 1885
1810 if (buffer_new(&new)) 1886 if (err == -ENOSPC &&
1811 __unmap_underlying_blocks(mpd->inode, &new); 1887 ext4_count_free_blocks(mpd->inode->i_sb)) {
1888 mpd->retval = err;
1889 return 0;
1890 }
1812 1891
1813 /* 1892 /*
1814 * If blocks are delayed marked, we need to 1893 * get block failure will cause us
1815 * put actual blocknr and drop delayed bit 1894 * to loop in writepages. Because
1895 * a_ops->writepage won't be able to
1896 * make progress. The page will be redirtied
1897 * by writepage and writepages will again
1898 * try to write the same.
1816 */ 1899 */
1817 if (buffer_delay(lbh)) 1900 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1818 mpage_put_bnr_to_bhs(mpd, next, &new); 1901 "at logical offset %llu with max blocks "
1819 1902 "%zd with error %d\n",
1820 /* go for the remaining blocks */ 1903 __func__, mpd->inode->i_ino,
1821 next += new.b_size >> mpd->inode->i_blkbits; 1904 (unsigned long long)next,
1822 remain -= new.b_size; 1905 lbh->b_size >> mpd->inode->i_blkbits, err);
1906 printk(KERN_EMERG "This should not happen.!! "
1907 "Data will be lost\n");
1908 if (err == -ENOSPC) {
1909 ext4_print_free_blocks(mpd->inode);
1910 }
1911 /* invlaidate all the pages */
1912 ext4_da_block_invalidatepages(mpd, next,
1913 lbh->b_size >> mpd->inode->i_blkbits);
1914 return err;
1823 } 1915 }
1916 BUG_ON(new.b_size == 0);
1917
1918 if (buffer_new(&new))
1919 __unmap_underlying_blocks(mpd->inode, &new);
1920
1921 /*
1922 * If blocks are delayed marked, we need to
1923 * put actual blocknr and drop delayed bit
1924 */
1925 if (buffer_delay(lbh) || buffer_unwritten(lbh))
1926 mpage_put_bnr_to_bhs(mpd, next, &new);
1927
1928 return 0;
1824} 1929}
1825 1930
1826#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) 1931#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1932 (1 << BH_Delay) | (1 << BH_Unwritten))
1827 1933
1828/* 1934/*
1829 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1935 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1837,41 +1943,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1837static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1943static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1838 sector_t logical, struct buffer_head *bh) 1944 sector_t logical, struct buffer_head *bh)
1839{ 1945{
1840 struct buffer_head *lbh = &mpd->lbh;
1841 sector_t next; 1946 sector_t next;
1947 size_t b_size = bh->b_size;
1948 struct buffer_head *lbh = &mpd->lbh;
1949 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
1842 1950
1843 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); 1951 /* check if thereserved journal credits might overflow */
1844 1952 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
1953 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1954 /*
1955 * With non-extent format we are limited by the journal
1956 * credit available. Total credit needed to insert
1957 * nrblocks contiguous blocks is dependent on the
1958 * nrblocks. So limit nrblocks.
1959 */
1960 goto flush_it;
1961 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1962 EXT4_MAX_TRANS_DATA) {
1963 /*
1964 * Adding the new buffer_head would make it cross the
1965 * allowed limit for which we have journal credit
1966 * reserved. So limit the new bh->b_size
1967 */
1968 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1969 mpd->inode->i_blkbits;
1970 /* we will do mpage_da_submit_io in the next loop */
1971 }
1972 }
1845 /* 1973 /*
1846 * First block in the extent 1974 * First block in the extent
1847 */ 1975 */
1848 if (lbh->b_size == 0) { 1976 if (lbh->b_size == 0) {
1849 lbh->b_blocknr = logical; 1977 lbh->b_blocknr = logical;
1850 lbh->b_size = bh->b_size; 1978 lbh->b_size = b_size;
1851 lbh->b_state = bh->b_state & BH_FLAGS; 1979 lbh->b_state = bh->b_state & BH_FLAGS;
1852 return; 1980 return;
1853 } 1981 }
1854 1982
1983 next = lbh->b_blocknr + nrblocks;
1855 /* 1984 /*
1856 * Can we merge the block to our big extent? 1985 * Can we merge the block to our big extent?
1857 */ 1986 */
1858 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1987 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1859 lbh->b_size += bh->b_size; 1988 lbh->b_size += b_size;
1860 return; 1989 return;
1861 } 1990 }
1862 1991
1992flush_it:
1863 /* 1993 /*
1864 * We couldn't merge the block to our extent, so we 1994 * We couldn't merge the block to our extent, so we
1865 * need to flush current extent and start new one 1995 * need to flush current extent and start new one
1866 */ 1996 */
1867 mpage_da_map_blocks(mpd); 1997 if (mpage_da_map_blocks(mpd) == 0)
1868 1998 mpage_da_submit_io(mpd);
1869 /* 1999 mpd->io_done = 1;
1870 * Now start a new extent 2000 return;
1871 */
1872 lbh->b_size = bh->b_size;
1873 lbh->b_state = bh->b_state & BH_FLAGS;
1874 lbh->b_blocknr = logical;
1875} 2001}
1876 2002
1877/* 2003/*
@@ -1891,17 +2017,35 @@ static int __mpage_da_writepage(struct page *page,
1891 struct buffer_head *bh, *head, fake; 2017 struct buffer_head *bh, *head, fake;
1892 sector_t logical; 2018 sector_t logical;
1893 2019
2020 if (mpd->io_done) {
2021 /*
2022 * Rest of the page in the page_vec
2023 * redirty then and skip then. We will
2024 * try to to write them again after
2025 * starting a new transaction
2026 */
2027 redirty_page_for_writepage(wbc, page);
2028 unlock_page(page);
2029 return MPAGE_DA_EXTENT_TAIL;
2030 }
1894 /* 2031 /*
1895 * Can we merge this page to current extent? 2032 * Can we merge this page to current extent?
1896 */ 2033 */
1897 if (mpd->next_page != page->index) { 2034 if (mpd->next_page != page->index) {
1898 /* 2035 /*
1899 * Nope, we can't. So, we map non-allocated blocks 2036 * Nope, we can't. So, we map non-allocated blocks
1900 * and start IO on them using __mpage_writepage() 2037 * and start IO on them using writepage()
1901 */ 2038 */
1902 if (mpd->next_page != mpd->first_page) { 2039 if (mpd->next_page != mpd->first_page) {
1903 mpage_da_map_blocks(mpd); 2040 if (mpage_da_map_blocks(mpd) == 0)
1904 mpage_da_submit_io(mpd); 2041 mpage_da_submit_io(mpd);
2042 /*
2043 * skip rest of the page in the page_vec
2044 */
2045 mpd->io_done = 1;
2046 redirty_page_for_writepage(wbc, page);
2047 unlock_page(page);
2048 return MPAGE_DA_EXTENT_TAIL;
1905 } 2049 }
1906 2050
1907 /* 2051 /*
@@ -1932,6 +2076,8 @@ static int __mpage_da_writepage(struct page *page,
1932 set_buffer_dirty(bh); 2076 set_buffer_dirty(bh);
1933 set_buffer_uptodate(bh); 2077 set_buffer_uptodate(bh);
1934 mpage_add_bh_to_extent(mpd, logical, bh); 2078 mpage_add_bh_to_extent(mpd, logical, bh);
2079 if (mpd->io_done)
2080 return MPAGE_DA_EXTENT_TAIL;
1935 } else { 2081 } else {
1936 /* 2082 /*
1937 * Page with regular buffer heads, just add all dirty ones 2083 * Page with regular buffer heads, just add all dirty ones
@@ -1940,8 +2086,12 @@ static int __mpage_da_writepage(struct page *page,
1940 bh = head; 2086 bh = head;
1941 do { 2087 do {
1942 BUG_ON(buffer_locked(bh)); 2088 BUG_ON(buffer_locked(bh));
1943 if (buffer_dirty(bh)) 2089 if (buffer_dirty(bh) &&
2090 (!buffer_mapped(bh) || buffer_delay(bh))) {
1944 mpage_add_bh_to_extent(mpd, logical, bh); 2091 mpage_add_bh_to_extent(mpd, logical, bh);
2092 if (mpd->io_done)
2093 return MPAGE_DA_EXTENT_TAIL;
2094 }
1945 logical++; 2095 logical++;
1946 } while ((bh = bh->b_this_page) != head); 2096 } while ((bh = bh->b_this_page) != head);
1947 } 2097 }
@@ -1960,46 +2110,37 @@ static int __mpage_da_writepage(struct page *page,
1960 * 2110 *
1961 * This is a library function, which implements the writepages() 2111 * This is a library function, which implements the writepages()
1962 * address_space_operation. 2112 * address_space_operation.
1963 *
1964 * In order to avoid duplication of logic that deals with partial pages,
1965 * multiple bio per page, etc, we find non-allocated blocks, allocate
1966 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1967 *
1968 * It's important that we call __mpage_writepage() only once for each
1969 * involved page, otherwise we'd have to implement more complicated logic
1970 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1971 *
1972 * See comments to mpage_writepages()
1973 */ 2113 */
1974static int mpage_da_writepages(struct address_space *mapping, 2114static int mpage_da_writepages(struct address_space *mapping,
1975 struct writeback_control *wbc, 2115 struct writeback_control *wbc,
1976 get_block_t get_block) 2116 struct mpage_da_data *mpd)
1977{ 2117{
1978 struct mpage_da_data mpd;
1979 int ret; 2118 int ret;
1980 2119
1981 if (!get_block) 2120 if (!mpd->get_block)
1982 return generic_writepages(mapping, wbc); 2121 return generic_writepages(mapping, wbc);
1983 2122
1984 mpd.wbc = wbc; 2123 mpd->lbh.b_size = 0;
1985 mpd.inode = mapping->host; 2124 mpd->lbh.b_state = 0;
1986 mpd.lbh.b_size = 0; 2125 mpd->lbh.b_blocknr = 0;
1987 mpd.lbh.b_state = 0; 2126 mpd->first_page = 0;
1988 mpd.lbh.b_blocknr = 0; 2127 mpd->next_page = 0;
1989 mpd.first_page = 0; 2128 mpd->io_done = 0;
1990 mpd.next_page = 0; 2129 mpd->pages_written = 0;
1991 mpd.get_block = get_block; 2130 mpd->retval = 0;
1992
1993 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1994 2131
2132 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
1995 /* 2133 /*
1996 * Handle last extent of pages 2134 * Handle last extent of pages
1997 */ 2135 */
1998 if (mpd.next_page != mpd.first_page) { 2136 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
1999 mpage_da_map_blocks(&mpd); 2137 if (mpage_da_map_blocks(mpd) == 0)
2000 mpage_da_submit_io(&mpd); 2138 mpage_da_submit_io(mpd);
2001 }
2002 2139
2140 mpd->io_done = 1;
2141 ret = MPAGE_DA_EXTENT_TAIL;
2142 }
2143 wbc->nr_to_write -= mpd->pages_written;
2003 return ret; 2144 return ret;
2004} 2145}
2005 2146
@@ -2052,18 +2193,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2052 handle_t *handle = NULL; 2193 handle_t *handle = NULL;
2053 2194
2054 handle = ext4_journal_current_handle(); 2195 handle = ext4_journal_current_handle();
2055 if (!handle) { 2196 BUG_ON(!handle);
2056 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2197 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2057 bh_result, 0, 0, 0); 2198 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2058 BUG_ON(!ret);
2059 } else {
2060 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2061 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2062 }
2063
2064 if (ret > 0) { 2199 if (ret > 0) {
2200
2065 bh_result->b_size = (ret << inode->i_blkbits); 2201 bh_result->b_size = (ret << inode->i_blkbits);
2066 2202
2203 if (ext4_should_order_data(inode)) {
2204 int retval;
2205 retval = ext4_jbd2_file_inode(handle, inode);
2206 if (retval)
2207 /*
2208 * Failed to add inode for ordered
2209 * mode. Don't update file size
2210 */
2211 return retval;
2212 }
2213
2067 /* 2214 /*
2068 * Update on-disk size along with block allocation 2215 * Update on-disk size along with block allocation
2069 * we don't use 'extend_disksize' as size may change 2216 * we don't use 'extend_disksize' as size may change
@@ -2073,18 +2220,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2073 if (disksize > i_size_read(inode)) 2220 if (disksize > i_size_read(inode))
2074 disksize = i_size_read(inode); 2221 disksize = i_size_read(inode);
2075 if (disksize > EXT4_I(inode)->i_disksize) { 2222 if (disksize > EXT4_I(inode)->i_disksize) {
2076 /* 2223 ext4_update_i_disksize(inode, disksize);
2077 * XXX: replace with spinlock if seen contended -bzzz 2224 ret = ext4_mark_inode_dirty(handle, inode);
2078 */ 2225 return ret;
2079 down_write(&EXT4_I(inode)->i_data_sem);
2080 if (disksize > EXT4_I(inode)->i_disksize)
2081 EXT4_I(inode)->i_disksize = disksize;
2082 up_write(&EXT4_I(inode)->i_data_sem);
2083
2084 if (EXT4_I(inode)->i_disksize == disksize) {
2085 ret = ext4_mark_inode_dirty(handle, inode);
2086 return ret;
2087 }
2088 } 2226 }
2089 ret = 0; 2227 ret = 0;
2090 } 2228 }
@@ -2204,102 +2342,177 @@ static int ext4_da_writepage(struct page *page,
2204} 2342}
2205 2343
2206/* 2344/*
2207 * For now just follow the DIO way to estimate the max credits 2345 * This is called via ext4_da_writepages() to
2208 * needed to write out EXT4_MAX_WRITEBACK_PAGES. 2346 * calulate the total number of credits to reserve to fit
2209 * todo: need to calculate the max credits need for 2347 * a single extent allocation into a single transaction,
2210 * extent based files, currently the DIO credits is based on 2348 * ext4_da_writpeages() will loop calling this before
2211 * indirect-blocks mapping way. 2349 * the block allocation.
2212 *
2213 * Probably should have a generic way to calculate credits
2214 * for DIO, writepages, and truncate
2215 */ 2350 */
2216#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS 2351
2217#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS 2352static int ext4_da_writepages_trans_blocks(struct inode *inode)
2353{
2354 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2355
2356 /*
2357 * With non-extent format the journal credit needed to
2358 * insert nrblocks contiguous block is dependent on
2359 * number of contiguous block. So we will limit
2360 * number of contiguous block to a sane value
2361 */
2362 if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
2363 (max_blocks > EXT4_MAX_TRANS_DATA))
2364 max_blocks = EXT4_MAX_TRANS_DATA;
2365
2366 return ext4_chunk_trans_blocks(inode, max_blocks);
2367}
2218 2368
2219static int ext4_da_writepages(struct address_space *mapping, 2369static int ext4_da_writepages(struct address_space *mapping,
2220 struct writeback_control *wbc) 2370 struct writeback_control *wbc)
2221{ 2371{
2222 struct inode *inode = mapping->host; 2372 pgoff_t index;
2373 int range_whole = 0;
2223 handle_t *handle = NULL; 2374 handle_t *handle = NULL;
2224 int needed_blocks; 2375 struct mpage_da_data mpd;
2225 int ret = 0; 2376 struct inode *inode = mapping->host;
2226 long to_write; 2377 int no_nrwrite_index_update;
2227 loff_t range_start = 0; 2378 long pages_written = 0, pages_skipped;
2379 int needed_blocks, ret = 0, nr_to_writebump = 0;
2380 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2228 2381
2229 /* 2382 /*
2230 * No pages to write? This is mainly a kludge to avoid starting 2383 * No pages to write? This is mainly a kludge to avoid starting
2231 * a transaction for special inodes like journal inode on last iput() 2384 * a transaction for special inodes like journal inode on last iput()
2232 * because that could violate lock ordering on umount 2385 * because that could violate lock ordering on umount
2233 */ 2386 */
2234 if (!mapping->nrpages) 2387 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2235 return 0; 2388 return 0;
2389 /*
2390 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2391 * This make sure small files blocks are allocated in
2392 * single attempt. This ensure that small files
2393 * get less fragmented.
2394 */
2395 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2396 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2397 wbc->nr_to_write = sbi->s_mb_stream_request;
2398 }
2399 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2400 range_whole = 1;
2401
2402 if (wbc->range_cyclic)
2403 index = mapping->writeback_index;
2404 else
2405 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2406
2407 mpd.wbc = wbc;
2408 mpd.inode = mapping->host;
2236 2409
2237 /* 2410 /*
2238 * Estimate the worse case needed credits to write out 2411 * we don't want write_cache_pages to update
2239 * EXT4_MAX_BUF_BLOCKS pages 2412 * nr_to_write and writeback_index
2240 */ 2413 */
2241 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; 2414 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2415 wbc->no_nrwrite_index_update = 1;
2416 pages_skipped = wbc->pages_skipped;
2417
2418 while (!ret && wbc->nr_to_write > 0) {
2242 2419
2243 to_write = wbc->nr_to_write;
2244 if (!wbc->range_cyclic) {
2245 /* 2420 /*
2246 * If range_cyclic is not set force range_cont 2421 * we insert one extent at a time. So we need
2247 * and save the old writeback_index 2422 * credit needed for single extent allocation.
2423 * journalled mode is currently not supported
2424 * by delalloc
2248 */ 2425 */
2249 wbc->range_cont = 1; 2426 BUG_ON(ext4_should_journal_data(inode));
2250 range_start = wbc->range_start; 2427 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2251 }
2252 2428
2253 while (!ret && to_write) {
2254 /* start a new transaction*/ 2429 /* start a new transaction*/
2255 handle = ext4_journal_start(inode, needed_blocks); 2430 handle = ext4_journal_start(inode, needed_blocks);
2256 if (IS_ERR(handle)) { 2431 if (IS_ERR(handle)) {
2257 ret = PTR_ERR(handle); 2432 ret = PTR_ERR(handle);
2433 printk(KERN_EMERG "%s: jbd2_start: "
2434 "%ld pages, ino %lu; err %d\n", __func__,
2435 wbc->nr_to_write, inode->i_ino, ret);
2436 dump_stack();
2258 goto out_writepages; 2437 goto out_writepages;
2259 } 2438 }
2260 if (ext4_should_order_data(inode)) { 2439 mpd.get_block = ext4_da_get_block_write;
2261 /* 2440 ret = mpage_da_writepages(mapping, wbc, &mpd);
2262 * With ordered mode we need to add
2263 * the inode to the journal handle
2264 * when we do block allocation.
2265 */
2266 ret = ext4_jbd2_file_inode(handle, inode);
2267 if (ret) {
2268 ext4_journal_stop(handle);
2269 goto out_writepages;
2270 }
2271
2272 }
2273 /*
2274 * set the max dirty pages could be write at a time
2275 * to fit into the reserved transaction credits
2276 */
2277 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2278 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2279 2441
2280 to_write -= wbc->nr_to_write;
2281 ret = mpage_da_writepages(mapping, wbc,
2282 ext4_da_get_block_write);
2283 ext4_journal_stop(handle); 2442 ext4_journal_stop(handle);
2284 if (wbc->nr_to_write) { 2443
2444 if (mpd.retval == -ENOSPC) {
2445 /* commit the transaction which would
2446 * free blocks released in the transaction
2447 * and try again
2448 */
2449 jbd2_journal_force_commit_nested(sbi->s_journal);
2450 wbc->pages_skipped = pages_skipped;
2451 ret = 0;
2452 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2453 /*
2454 * got one extent now try with
2455 * rest of the pages
2456 */
2457 pages_written += mpd.pages_written;
2458 wbc->pages_skipped = pages_skipped;
2459 ret = 0;
2460 } else if (wbc->nr_to_write)
2285 /* 2461 /*
2286 * There is no more writeout needed 2462 * There is no more writeout needed
2287 * or we requested for a noblocking writeout 2463 * or we requested for a noblocking writeout
2288 * and we found the device congested 2464 * and we found the device congested
2289 */ 2465 */
2290 to_write += wbc->nr_to_write;
2291 break; 2466 break;
2292 }
2293 wbc->nr_to_write = to_write;
2294 } 2467 }
2468 if (pages_skipped != wbc->pages_skipped)
2469 printk(KERN_EMERG "This should not happen leaving %s "
2470 "with nr_to_write = %ld ret = %d\n",
2471 __func__, wbc->nr_to_write, ret);
2472
2473 /* Update index */
2474 index += pages_written;
2475 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2476 /*
2477 * set the writeback_index so that range_cyclic
2478 * mode will write it back later
2479 */
2480 mapping->writeback_index = index;
2295 2481
2296out_writepages: 2482out_writepages:
2297 wbc->nr_to_write = to_write; 2483 if (!no_nrwrite_index_update)
2298 if (range_start) 2484 wbc->no_nrwrite_index_update = 0;
2299 wbc->range_start = range_start; 2485 wbc->nr_to_write -= nr_to_writebump;
2300 return ret; 2486 return ret;
2301} 2487}
2302 2488
2489#define FALL_BACK_TO_NONDELALLOC 1
2490static int ext4_nonda_switch(struct super_block *sb)
2491{
2492 s64 free_blocks, dirty_blocks;
2493 struct ext4_sb_info *sbi = EXT4_SB(sb);
2494
2495 /*
2496 * switch to non delalloc mode if we are running low
2497 * on free block. The free block accounting via percpu
2498 * counters can get slightly wrong with FBC_BATCH getting
2499 * accumulated on each CPU without updating global counters
2500 * Delalloc need an accurate free block accounting. So switch
2501 * to non delalloc when we are near to error range.
2502 */
2503 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2504 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2505 if (2 * free_blocks < 3 * dirty_blocks ||
2506 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2507 /*
2508 * free block count is less that 150% of dirty blocks
2509 * or free blocks is less that watermark
2510 */
2511 return 1;
2512 }
2513 return 0;
2514}
2515
2303static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2516static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2304 loff_t pos, unsigned len, unsigned flags, 2517 loff_t pos, unsigned len, unsigned flags,
2305 struct page **pagep, void **fsdata) 2518 struct page **pagep, void **fsdata)
@@ -2315,6 +2528,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2315 from = pos & (PAGE_CACHE_SIZE - 1); 2528 from = pos & (PAGE_CACHE_SIZE - 1);
2316 to = from + len; 2529 to = from + len;
2317 2530
2531 if (ext4_nonda_switch(inode->i_sb)) {
2532 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2533 return ext4_write_begin(file, mapping, pos,
2534 len, flags, pagep, fsdata);
2535 }
2536 *fsdata = (void *)0;
2318retry: 2537retry:
2319 /* 2538 /*
2320 * With delayed allocation, we don't log the i_disksize update 2539 * With delayed allocation, we don't log the i_disksize update
@@ -2342,6 +2561,13 @@ retry:
2342 unlock_page(page); 2561 unlock_page(page);
2343 ext4_journal_stop(handle); 2562 ext4_journal_stop(handle);
2344 page_cache_release(page); 2563 page_cache_release(page);
2564 /*
2565 * block_write_begin may have instantiated a few blocks
2566 * outside i_size. Trim these off again. Don't need
2567 * i_size_read because we hold i_mutex.
2568 */
2569 if (pos + len > inode->i_size)
2570 vmtruncate(inode, inode->i_size);
2345 } 2571 }
2346 2572
2347 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2573 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2365,7 +2591,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2365 bh = page_buffers(page); 2591 bh = page_buffers(page);
2366 idx = offset >> inode->i_blkbits; 2592 idx = offset >> inode->i_blkbits;
2367 2593
2368 for (i=0; i < idx; i++) 2594 for (i = 0; i < idx; i++)
2369 bh = bh->b_this_page; 2595 bh = bh->b_this_page;
2370 2596
2371 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2597 if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2383,9 +2609,22 @@ static int ext4_da_write_end(struct file *file,
2383 handle_t *handle = ext4_journal_current_handle(); 2609 handle_t *handle = ext4_journal_current_handle();
2384 loff_t new_i_size; 2610 loff_t new_i_size;
2385 unsigned long start, end; 2611 unsigned long start, end;
2612 int write_mode = (int)(unsigned long)fsdata;
2613
2614 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2615 if (ext4_should_order_data(inode)) {
2616 return ext4_ordered_write_end(file, mapping, pos,
2617 len, copied, page, fsdata);
2618 } else if (ext4_should_writeback_data(inode)) {
2619 return ext4_writeback_write_end(file, mapping, pos,
2620 len, copied, page, fsdata);
2621 } else {
2622 BUG();
2623 }
2624 }
2386 2625
2387 start = pos & (PAGE_CACHE_SIZE - 1); 2626 start = pos & (PAGE_CACHE_SIZE - 1);
2388 end = start + copied -1; 2627 end = start + copied - 1;
2389 2628
2390 /* 2629 /*
2391 * generic_write_end() will run mark_inode_dirty() if i_size 2630 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2409,6 +2648,11 @@ static int ext4_da_write_end(struct file *file,
2409 EXT4_I(inode)->i_disksize = new_i_size; 2648 EXT4_I(inode)->i_disksize = new_i_size;
2410 } 2649 }
2411 up_write(&EXT4_I(inode)->i_data_sem); 2650 up_write(&EXT4_I(inode)->i_data_sem);
2651 /* We need to mark inode dirty even if
2652 * new_i_size is less that inode->i_size
2653 * bu greater than i_disksize.(hint delalloc)
2654 */
2655 ext4_mark_inode_dirty(handle, inode);
2412 } 2656 }
2413 } 2657 }
2414 ret2 = generic_write_end(file, mapping, pos, len, copied, 2658 ret2 = generic_write_end(file, mapping, pos, len, copied,
@@ -2500,7 +2744,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2500 return 0; 2744 return 0;
2501 } 2745 }
2502 2746
2503 return generic_block_bmap(mapping,block,ext4_get_block); 2747 return generic_block_bmap(mapping, block, ext4_get_block);
2504} 2748}
2505 2749
2506static int bget_one(handle_t *handle, struct buffer_head *bh) 2750static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3106,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3106 if (!partial->key && *partial->p) 3350 if (!partial->key && *partial->p)
3107 /* Writer: end */ 3351 /* Writer: end */
3108 goto no_top; 3352 goto no_top;
3109 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 3353 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3110 ; 3354 ;
3111 /* 3355 /*
3112 * OK, we've found the last block that must survive. The rest of our 3356 * OK, we've found the last block that must survive. The rest of our
@@ -3125,7 +3369,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3125 } 3369 }
3126 /* Writer: end */ 3370 /* Writer: end */
3127 3371
3128 while(partial > p) { 3372 while (partial > p) {
3129 brelse(partial->bh); 3373 brelse(partial->bh);
3130 partial--; 3374 partial--;
3131 } 3375 }
@@ -3317,9 +3561,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3317 /* This zaps the entire block. Bottom up. */ 3561 /* This zaps the entire block. Bottom up. */
3318 BUFFER_TRACE(bh, "free child branches"); 3562 BUFFER_TRACE(bh, "free child branches");
3319 ext4_free_branches(handle, inode, bh, 3563 ext4_free_branches(handle, inode, bh,
3320 (__le32*)bh->b_data, 3564 (__le32 *) bh->b_data,
3321 (__le32*)bh->b_data + addr_per_block, 3565 (__le32 *) bh->b_data + addr_per_block,
3322 depth); 3566 depth);
3323 3567
3324 /* 3568 /*
3325 * We've probably journalled the indirect block several 3569 * We've probably journalled the indirect block several
@@ -3486,6 +3730,9 @@ void ext4_truncate(struct inode *inode)
3486 * modify the block allocation tree. 3730 * modify the block allocation tree.
3487 */ 3731 */
3488 down_write(&ei->i_data_sem); 3732 down_write(&ei->i_data_sem);
3733
3734 ext4_discard_preallocations(inode);
3735
3489 /* 3736 /*
3490 * The orphan list entry will now protect us from any crash which 3737 * The orphan list entry will now protect us from any crash which
3491 * occurs before the truncate completes, so it is now safe to propagate 3738 * occurs before the truncate completes, so it is now safe to propagate
@@ -3555,8 +3802,6 @@ do_indirects:
3555 ; 3802 ;
3556 } 3803 }
3557 3804
3558 ext4_discard_reservation(inode);
3559
3560 up_write(&ei->i_data_sem); 3805 up_write(&ei->i_data_sem);
3561 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3806 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3562 ext4_mark_inode_dirty(handle, inode); 3807 ext4_mark_inode_dirty(handle, inode);
@@ -3581,41 +3826,6 @@ out_stop:
3581 ext4_journal_stop(handle); 3826 ext4_journal_stop(handle);
3582} 3827}
3583 3828
3584static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3585 unsigned long ino, struct ext4_iloc *iloc)
3586{
3587 ext4_group_t block_group;
3588 unsigned long offset;
3589 ext4_fsblk_t block;
3590 struct ext4_group_desc *gdp;
3591
3592 if (!ext4_valid_inum(sb, ino)) {
3593 /*
3594 * This error is already checked for in namei.c unless we are
3595 * looking at an NFS filehandle, in which case no error
3596 * report is needed
3597 */
3598 return 0;
3599 }
3600
3601 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3602 gdp = ext4_get_group_desc(sb, block_group, NULL);
3603 if (!gdp)
3604 return 0;
3605
3606 /*
3607 * Figure out the offset within the block group inode table
3608 */
3609 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3610 EXT4_INODE_SIZE(sb);
3611 block = ext4_inode_table(sb, gdp) +
3612 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3613
3614 iloc->block_group = block_group;
3615 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3616 return block;
3617}
3618
3619/* 3829/*
3620 * ext4_get_inode_loc returns with an extra refcount against the inode's 3830 * ext4_get_inode_loc returns with an extra refcount against the inode's
3621 * underlying buffer_head on success. If 'in_mem' is true, we have all 3831 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3625,19 +3835,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3625static int __ext4_get_inode_loc(struct inode *inode, 3835static int __ext4_get_inode_loc(struct inode *inode,
3626 struct ext4_iloc *iloc, int in_mem) 3836 struct ext4_iloc *iloc, int in_mem)
3627{ 3837{
3628 ext4_fsblk_t block; 3838 struct ext4_group_desc *gdp;
3629 struct buffer_head *bh; 3839 struct buffer_head *bh;
3840 struct super_block *sb = inode->i_sb;
3841 ext4_fsblk_t block;
3842 int inodes_per_block, inode_offset;
3843
3844 iloc->bh = 0;
3845 if (!ext4_valid_inum(sb, inode->i_ino))
3846 return -EIO;
3630 3847
3631 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3848 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3632 if (!block) 3849 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3850 if (!gdp)
3633 return -EIO; 3851 return -EIO;
3634 3852
3635 bh = sb_getblk(inode->i_sb, block); 3853 /*
3854 * Figure out the offset within the block group inode table
3855 */
3856 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3857 inode_offset = ((inode->i_ino - 1) %
3858 EXT4_INODES_PER_GROUP(sb));
3859 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3860 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3861
3862 bh = sb_getblk(sb, block);
3636 if (!bh) { 3863 if (!bh) {
3637 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3864 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3638 "unable to read inode block - " 3865 "inode block - inode=%lu, block=%llu",
3639 "inode=%lu, block=%llu", 3866 inode->i_ino, block);
3640 inode->i_ino, block);
3641 return -EIO; 3867 return -EIO;
3642 } 3868 }
3643 if (!buffer_uptodate(bh)) { 3869 if (!buffer_uptodate(bh)) {
@@ -3665,28 +3891,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3665 */ 3891 */
3666 if (in_mem) { 3892 if (in_mem) {
3667 struct buffer_head *bitmap_bh; 3893 struct buffer_head *bitmap_bh;
3668 struct ext4_group_desc *desc; 3894 int i, start;
3669 int inodes_per_buffer;
3670 int inode_offset, i;
3671 ext4_group_t block_group;
3672 int start;
3673
3674 block_group = (inode->i_ino - 1) /
3675 EXT4_INODES_PER_GROUP(inode->i_sb);
3676 inodes_per_buffer = bh->b_size /
3677 EXT4_INODE_SIZE(inode->i_sb);
3678 inode_offset = ((inode->i_ino - 1) %
3679 EXT4_INODES_PER_GROUP(inode->i_sb));
3680 start = inode_offset & ~(inodes_per_buffer - 1);
3681 3895
3682 /* Is the inode bitmap in cache? */ 3896 start = inode_offset & ~(inodes_per_block - 1);
3683 desc = ext4_get_group_desc(inode->i_sb,
3684 block_group, NULL);
3685 if (!desc)
3686 goto make_io;
3687 3897
3688 bitmap_bh = sb_getblk(inode->i_sb, 3898 /* Is the inode bitmap in cache? */
3689 ext4_inode_bitmap(inode->i_sb, desc)); 3899 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3690 if (!bitmap_bh) 3900 if (!bitmap_bh)
3691 goto make_io; 3901 goto make_io;
3692 3902
@@ -3699,14 +3909,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3699 brelse(bitmap_bh); 3909 brelse(bitmap_bh);
3700 goto make_io; 3910 goto make_io;
3701 } 3911 }
3702 for (i = start; i < start + inodes_per_buffer; i++) { 3912 for (i = start; i < start + inodes_per_block; i++) {
3703 if (i == inode_offset) 3913 if (i == inode_offset)
3704 continue; 3914 continue;
3705 if (ext4_test_bit(i, bitmap_bh->b_data)) 3915 if (ext4_test_bit(i, bitmap_bh->b_data))
3706 break; 3916 break;
3707 } 3917 }
3708 brelse(bitmap_bh); 3918 brelse(bitmap_bh);
3709 if (i == start + inodes_per_buffer) { 3919 if (i == start + inodes_per_block) {
3710 /* all other inodes are free, so skip I/O */ 3920 /* all other inodes are free, so skip I/O */
3711 memset(bh->b_data, 0, bh->b_size); 3921 memset(bh->b_data, 0, bh->b_size);
3712 set_buffer_uptodate(bh); 3922 set_buffer_uptodate(bh);
@@ -3717,6 +3927,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3717 3927
3718make_io: 3928make_io:
3719 /* 3929 /*
3930 * If we need to do any I/O, try to pre-readahead extra
3931 * blocks from the inode table.
3932 */
3933 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3934 ext4_fsblk_t b, end, table;
3935 unsigned num;
3936
3937 table = ext4_inode_table(sb, gdp);
3938 /* Make sure s_inode_readahead_blks is a power of 2 */
3939 while (EXT4_SB(sb)->s_inode_readahead_blks &
3940 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3941 EXT4_SB(sb)->s_inode_readahead_blks =
3942 (EXT4_SB(sb)->s_inode_readahead_blks &
3943 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3944 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3945 if (table > b)
3946 b = table;
3947 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3948 num = EXT4_INODES_PER_GROUP(sb);
3949 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3950 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3951 num -= le16_to_cpu(gdp->bg_itable_unused);
3952 table += num / inodes_per_block;
3953 if (end > table)
3954 end = table;
3955 while (b <= end)
3956 sb_breadahead(sb, b++);
3957 }
3958
3959 /*
3720 * There are other valid inodes in the buffer, this inode 3960 * There are other valid inodes in the buffer, this inode
3721 * has in-inode xattrs, or we don't have this inode in memory. 3961 * has in-inode xattrs, or we don't have this inode in memory.
3722 * Read the block from disk. 3962 * Read the block from disk.
@@ -3726,10 +3966,9 @@ make_io:
3726 submit_bh(READ_META, bh); 3966 submit_bh(READ_META, bh);
3727 wait_on_buffer(bh); 3967 wait_on_buffer(bh);
3728 if (!buffer_uptodate(bh)) { 3968 if (!buffer_uptodate(bh)) {
3729 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3969 ext4_error(sb, __func__,
3730 "unable to read inode block - " 3970 "unable to read inode block - inode=%lu, "
3731 "inode=%lu, block=%llu", 3971 "block=%llu", inode->i_ino, block);
3732 inode->i_ino, block);
3733 brelse(bh); 3972 brelse(bh);
3734 return -EIO; 3973 return -EIO;
3735 } 3974 }
@@ -3821,11 +4060,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3821 return inode; 4060 return inode;
3822 4061
3823 ei = EXT4_I(inode); 4062 ei = EXT4_I(inode);
3824#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 4063#ifdef CONFIG_EXT4_FS_POSIX_ACL
3825 ei->i_acl = EXT4_ACL_NOT_CACHED; 4064 ei->i_acl = EXT4_ACL_NOT_CACHED;
3826 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4065 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3827#endif 4066#endif
3828 ei->i_block_alloc_info = NULL;
3829 4067
3830 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4068 ret = __ext4_get_inode_loc(inode, &iloc, 0);
3831 if (ret < 0) 4069 if (ret < 0)
@@ -3835,7 +4073,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3835 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4073 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3836 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4074 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3837 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4075 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3838 if(!(test_opt (inode->i_sb, NO_UID32))) { 4076 if (!(test_opt(inode->i_sb, NO_UID32))) {
3839 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4077 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3840 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4078 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3841 } 4079 }
@@ -3853,7 +4091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3853 if (inode->i_mode == 0 || 4091 if (inode->i_mode == 0 ||
3854 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4092 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3855 /* this inode is deleted */ 4093 /* this inode is deleted */
3856 brelse (bh); 4094 brelse(bh);
3857 ret = -ESTALE; 4095 ret = -ESTALE;
3858 goto bad_inode; 4096 goto bad_inode;
3859 } 4097 }
@@ -3886,7 +4124,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3886 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4124 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3887 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4125 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3888 EXT4_INODE_SIZE(inode->i_sb)) { 4126 EXT4_INODE_SIZE(inode->i_sb)) {
3889 brelse (bh); 4127 brelse(bh);
3890 ret = -EIO; 4128 ret = -EIO;
3891 goto bad_inode; 4129 goto bad_inode;
3892 } 4130 }
@@ -3939,7 +4177,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3939 init_special_inode(inode, inode->i_mode, 4177 init_special_inode(inode, inode->i_mode,
3940 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4178 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3941 } 4179 }
3942 brelse (iloc.bh); 4180 brelse(iloc.bh);
3943 ext4_set_inode_flags(inode); 4181 ext4_set_inode_flags(inode);
3944 unlock_new_inode(inode); 4182 unlock_new_inode(inode);
3945 return inode; 4183 return inode;
@@ -3956,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
3956 struct inode *inode = &(ei->vfs_inode); 4194 struct inode *inode = &(ei->vfs_inode);
3957 u64 i_blocks = inode->i_blocks; 4195 u64 i_blocks = inode->i_blocks;
3958 struct super_block *sb = inode->i_sb; 4196 struct super_block *sb = inode->i_sb;
3959 int err = 0;
3960 4197
3961 if (i_blocks <= ~0U) { 4198 if (i_blocks <= ~0U) {
3962 /* 4199 /*
@@ -3966,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
3966 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4203 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
3967 raw_inode->i_blocks_high = 0; 4204 raw_inode->i_blocks_high = 0;
3968 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4205 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
3969 } else if (i_blocks <= 0xffffffffffffULL) { 4206 return 0;
4207 }
4208 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
4209 return -EFBIG;
4210
4211 if (i_blocks <= 0xffffffffffffULL) {
3970 /* 4212 /*
3971 * i_blocks can be represented in a 48 bit variable 4213 * i_blocks can be represented in a 48 bit variable
3972 * as multiple of 512 bytes 4214 * as multiple of 512 bytes
3973 */ 4215 */
3974 err = ext4_update_rocompat_feature(handle, sb,
3975 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3976 if (err)
3977 goto err_out;
3978 /* i_block is stored in the split 48 bit fields */
3979 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4216 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
3980 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4217 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3981 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4218 ei->i_flags &= ~EXT4_HUGE_FILE_FL;
3982 } else { 4219 } else {
3983 /*
3984 * i_blocks should be represented in a 48 bit variable
3985 * as multiple of file system block size
3986 */
3987 err = ext4_update_rocompat_feature(handle, sb,
3988 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3989 if (err)
3990 goto err_out;
3991 ei->i_flags |= EXT4_HUGE_FILE_FL; 4220 ei->i_flags |= EXT4_HUGE_FILE_FL;
3992 /* i_block is stored in file system block size */ 4221 /* i_block is stored in file system block size */
3993 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4222 i_blocks = i_blocks >> (inode->i_blkbits - 9);
3994 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4223 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
3995 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4224 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3996 } 4225 }
3997err_out: 4226 return 0;
3998 return err;
3999} 4227}
4000 4228
4001/* 4229/*
@@ -4021,14 +4249,14 @@ static int ext4_do_update_inode(handle_t *handle,
4021 4249
4022 ext4_get_inode_flags(ei); 4250 ext4_get_inode_flags(ei);
4023 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4251 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4024 if(!(test_opt(inode->i_sb, NO_UID32))) { 4252 if (!(test_opt(inode->i_sb, NO_UID32))) {
4025 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4253 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4026 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4254 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4027/* 4255/*
4028 * Fix up interoperability with old kernels. Otherwise, old inodes get 4256 * Fix up interoperability with old kernels. Otherwise, old inodes get
4029 * re-used with the upper 16 bits of the uid/gid intact 4257 * re-used with the upper 16 bits of the uid/gid intact
4030 */ 4258 */
4031 if(!ei->i_dtime) { 4259 if (!ei->i_dtime) {
4032 raw_inode->i_uid_high = 4260 raw_inode->i_uid_high =
4033 cpu_to_le16(high_16_bits(inode->i_uid)); 4261 cpu_to_le16(high_16_bits(inode->i_uid));
4034 raw_inode->i_gid_high = 4262 raw_inode->i_gid_high =
@@ -4116,7 +4344,7 @@ static int ext4_do_update_inode(handle_t *handle,
4116 ei->i_state &= ~EXT4_STATE_NEW; 4344 ei->i_state &= ~EXT4_STATE_NEW;
4117 4345
4118out_brelse: 4346out_brelse:
4119 brelse (bh); 4347 brelse(bh);
4120 ext4_std_error(inode->i_sb, err); 4348 ext4_std_error(inode->i_sb, err);
4121 return err; 4349 return err;
4122} 4350}
@@ -4324,57 +4552,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4324 return 0; 4552 return 0;
4325} 4553}
4326 4554
4555static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4556 int chunk)
4557{
4558 int indirects;
4559
4560 /* if nrblocks are contiguous */
4561 if (chunk) {
4562 /*
4563 * With N contiguous data blocks, it need at most
4564 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4565 * 2 dindirect blocks
4566 * 1 tindirect block
4567 */
4568 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4569 return indirects + 3;
4570 }
4571 /*
4572 * if nrblocks are not contiguous, worse case, each block touch
4573 * a indirect block, and each indirect block touch a double indirect
4574 * block, plus a triple indirect block
4575 */
4576 indirects = nrblocks * 2 + 1;
4577 return indirects;
4578}
4579
4580static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4581{
4582 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4583 return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4584 return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4585}
4327/* 4586/*
4328 * How many blocks doth make a writepage()? 4587 * Account for index blocks, block groups bitmaps and block group
4329 * 4588 * descriptor blocks if modify datablocks and index blocks
4330 * With N blocks per page, it may be: 4589 * worse case, the indexs blocks spread over different block groups
4331 * N data blocks
4332 * 2 indirect block
4333 * 2 dindirect
4334 * 1 tindirect
4335 * N+5 bitmap blocks (from the above)
4336 * N+5 group descriptor summary blocks
4337 * 1 inode block
4338 * 1 superblock.
4339 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4340 * 4590 *
4341 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS 4591 * If datablocks are discontiguous, they are possible to spread over
4592 * different block groups too. If they are contiugous, with flexbg,
4593 * they could still across block group boundary.
4342 * 4594 *
4343 * With ordered or writeback data it's the same, less the N data blocks. 4595 * Also account for superblock, inode, quota and xattr blocks
4596 */
4597int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4598{
4599 int groups, gdpblocks;
4600 int idxblocks;
4601 int ret = 0;
4602
4603 /*
4604 * How many index blocks need to touch to modify nrblocks?
4605 * The "Chunk" flag indicating whether the nrblocks is
4606 * physically contiguous on disk
4607 *
4608 * For Direct IO and fallocate, they calls get_block to allocate
4609 * one single extent at a time, so they could set the "Chunk" flag
4610 */
4611 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4612
4613 ret = idxblocks;
4614
4615 /*
4616 * Now let's see how many group bitmaps and group descriptors need
4617 * to account
4618 */
4619 groups = idxblocks;
4620 if (chunk)
4621 groups += 1;
4622 else
4623 groups += nrblocks;
4624
4625 gdpblocks = groups;
4626 if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4627 groups = EXT4_SB(inode->i_sb)->s_groups_count;
4628 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4629 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4630
4631 /* bitmaps and block group descriptor blocks */
4632 ret += groups + gdpblocks;
4633
4634 /* Blocks for super block, inode, quota and xattr blocks */
4635 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4636
4637 return ret;
4638}
4639
4640/*
4641 * Calulate the total number of credits to reserve to fit
4642 * the modification of a single pages into a single transaction,
4643 * which may include multiple chunks of block allocations.
4344 * 4644 *
4345 * If the inode's direct blocks can hold an integral number of pages then a 4645 * This could be called via ext4_write_begin()
4346 * page cannot straddle two indirect blocks, and we can only touch one indirect
4347 * and dindirect block, and the "5" above becomes "3".
4348 * 4646 *
4349 * This still overestimates under most circumstances. If we were to pass the 4647 * We need to consider the worse case, when
4350 * start and end offsets in here as well we could do block_to_path() on each 4648 * one new block per extent.
4351 * block and work out the exact number of indirects which are touched. Pah.
4352 */ 4649 */
4353
4354int ext4_writepage_trans_blocks(struct inode *inode) 4650int ext4_writepage_trans_blocks(struct inode *inode)
4355{ 4651{
4356 int bpp = ext4_journal_blocks_per_page(inode); 4652 int bpp = ext4_journal_blocks_per_page(inode);
4357 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4358 int ret; 4653 int ret;
4359 4654
4360 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 4655 ret = ext4_meta_trans_blocks(inode, bpp, 0);
4361 return ext4_ext_writepage_trans_blocks(inode, bpp);
4362 4656
4657 /* Account for data blocks for journalled mode */
4363 if (ext4_should_journal_data(inode)) 4658 if (ext4_should_journal_data(inode))
4364 ret = 3 * (bpp + indirects) + 2; 4659 ret += bpp;
4365 else
4366 ret = 2 * (bpp + indirects) + 2;
4367
4368#ifdef CONFIG_QUOTA
4369 /* We know that structure was already allocated during DQUOT_INIT so
4370 * we will be updating only the data blocks + inodes */
4371 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4372#endif
4373
4374 return ret; 4660 return ret;
4375} 4661}
4376 4662
4377/* 4663/*
4664 * Calculate the journal credits for a chunk of data modification.
4665 *
4666 * This is called from DIO, fallocate or whoever calling
4667 * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
4668 *
4669 * journal buffers for data blocks are not included here, as DIO
4670 * and fallocate do no need to journal data buffers.
4671 */
4672int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4673{
4674 return ext4_meta_trans_blocks(inode, nrblocks, 1);
4675}
4676
4677/*
4378 * The caller must have previously called ext4_reserve_inode_write(). 4678 * The caller must have previously called ext4_reserve_inode_write().
4379 * Give this, we know that the caller already has write access to iloc->bh. 4679 * Give this, we know that the caller already has write access to iloc->bh.
4380 */ 4680 */
@@ -4647,6 +4947,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4647 loff_t size; 4947 loff_t size;
4648 unsigned long len; 4948 unsigned long len;
4649 int ret = -EINVAL; 4949 int ret = -EINVAL;
4950 void *fsdata;
4650 struct file *file = vma->vm_file; 4951 struct file *file = vma->vm_file;
4651 struct inode *inode = file->f_path.dentry->d_inode; 4952 struct inode *inode = file->f_path.dentry->d_inode;
4652 struct address_space *mapping = inode->i_mapping; 4953 struct address_space *mapping = inode->i_mapping;
@@ -4685,11 +4986,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4685 * on the same page though 4986 * on the same page though
4686 */ 4987 */
4687 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4988 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4688 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 4989 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
4689 if (ret < 0) 4990 if (ret < 0)
4690 goto out_unlock; 4991 goto out_unlock;
4691 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4992 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4692 len, len, page, NULL); 4993 len, len, page, fsdata);
4693 if (ret < 0) 4994 if (ret < 0)
4694 goto out_unlock; 4995 goto out_unlock;
4695 ret = 0; 4996 ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba6..dc99b4776d58 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext4_inode_info *ei = EXT4_I(inode); 24 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 25 unsigned int flags;
26 unsigned short rsv_window_size;
27 26
28 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); 27 ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
29 28
30 switch (cmd) { 29 switch (cmd) {
31 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
35 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
36 handle_t *handle = NULL; 35 handle_t *handle = NULL;
37 int err; 36 int err, migrate = 0;
38 struct ext4_iloc iloc; 37 struct ext4_iloc iloc;
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82 if (!capable(CAP_SYS_RESOURCE)) 81 if (!capable(CAP_SYS_RESOURCE))
83 goto flags_out; 82 goto flags_out;
84 } 83 }
84 if (oldflags & EXT4_EXTENTS_FL) {
85 /* We don't support clearning extent flags */
86 if (!(flags & EXT4_EXTENTS_FL)) {
87 err = -EOPNOTSUPP;
88 goto flags_out;
89 }
90 } else if (flags & EXT4_EXTENTS_FL) {
91 /* migrate the file */
92 migrate = 1;
93 flags &= ~EXT4_EXTENTS_FL;
94 }
85 95
86 handle = ext4_journal_start(inode, 1); 96 handle = ext4_journal_start(inode, 1);
87 if (IS_ERR(handle)) { 97 if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@ flags_err:
109 119
110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 120 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
111 err = ext4_change_inode_journal_flag(inode, jflag); 121 err = ext4_change_inode_journal_flag(inode, jflag);
122 if (err)
123 goto flags_out;
124 if (migrate)
125 err = ext4_ext_migrate(inode);
112flags_out: 126flags_out:
113 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt); 128 mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
175 return ret; 189 return ret;
176 } 190 }
177#endif 191#endif
178 case EXT4_IOC_GETRSVSZ:
179 if (test_opt(inode->i_sb, RESERVATION)
180 && S_ISREG(inode->i_mode)
181 && ei->i_block_alloc_info) {
182 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
183 return put_user(rsv_window_size, (int __user *)arg);
184 }
185 return -ENOTTY;
186 case EXT4_IOC_SETRSVSZ: {
187 int err;
188
189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
190 return -ENOTTY;
191
192 if (!is_owner_or_cap(inode))
193 return -EACCES;
194
195 if (get_user(rsv_window_size, (int __user *)arg))
196 return -EFAULT;
197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
204
205 /*
206 * need to allocate reservation structure for this inode
207 * before set the window size
208 */
209 down_write(&ei->i_data_sem);
210 if (!ei->i_block_alloc_info)
211 ext4_init_block_alloc_info(inode);
212
213 if (ei->i_block_alloc_info){
214 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
215 rsv->rsv_goal_size = rsv_window_size;
216 }
217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
219 return 0;
220 }
221 case EXT4_IOC_GROUP_EXTEND: { 192 case EXT4_IOC_GROUP_EXTEND: {
222 ext4_fsblk_t n_blocks_count; 193 ext4_fsblk_t n_blocks_count;
223 struct super_block *sb = inode->i_sb; 194 struct super_block *sb = inode->i_sb;
224 int err; 195 int err, err2;
225 196
226 if (!capable(CAP_SYS_RESOURCE)) 197 if (!capable(CAP_SYS_RESOURCE))
227 return -EPERM; 198 return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
235 206
236 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 207 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
237 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
238 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
239 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 if (err == 0)
212 err = err2;
240 mnt_drop_write(filp->f_path.mnt); 213 mnt_drop_write(filp->f_path.mnt);
241 214
242 return err; 215 return err;
@@ -244,7 +217,7 @@ setversion_out:
244 case EXT4_IOC_GROUP_ADD: { 217 case EXT4_IOC_GROUP_ADD: {
245 struct ext4_new_group_data input; 218 struct ext4_new_group_data input;
246 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
247 int err; 220 int err, err2;
248 221
249 if (!capable(CAP_SYS_RESOURCE)) 222 if (!capable(CAP_SYS_RESOURCE))
250 return -EPERM; 223 return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
259 232
260 err = ext4_group_add(sb, &input); 233 err = ext4_group_add(sb, &input);
261 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 234 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
262 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 235 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
263 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 236 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
237 if (err == 0)
238 err = err2;
264 mnt_drop_write(filp->f_path.mnt); 239 mnt_drop_write(filp->f_path.mnt);
265 240
266 return err; 241 return err;
267 } 242 }
268 243
269 case EXT4_IOC_MIGRATE: 244 case EXT4_IOC_MIGRATE:
270 return ext4_ext_migrate(inode, filp, cmd, arg); 245 {
246 int err;
247 if (!is_owner_or_cap(inode))
248 return -EACCES;
249
250 err = mnt_want_write(filp->f_path.mnt);
251 if (err)
252 return err;
253 /*
254 * inode_mutex prevent write and truncate on the file.
255 * Read still goes through. We take i_data_sem in
256 * ext4_ext_swap_inode_data before we switch the
257 * inode format to prevent read.
258 */
259 mutex_lock(&(inode->i_mutex));
260 err = ext4_ext_migrate(inode);
261 mutex_unlock(&(inode->i_mutex));
262 mnt_drop_write(filp->f_path.mnt);
263 return err;
264 }
271 265
272 default: 266 default:
273 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 865e9ddb44d4..dfe17a134052 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 477 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 479 if (b1[i] != b2[i]) {
480 printk("corruption in group %lu at byte %u(%u):" 480 printk(KERN_ERR "corruption in group %lu "
481 " %x in copy != %x on disk/prealloc\n", 481 "at byte %u(%u): %x in copy != %x "
482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 482 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]);
483 BUG(); 484 BUG();
484 } 485 }
485 } 486 }
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
533 void *buddy; 534 void *buddy;
534 void *buddy2; 535 void *buddy2;
535 536
536 if (!test_opt(sb, MBALLOC))
537 return 0;
538
539 { 537 {
540 static int mb_check_counter; 538 static int mb_check_counter;
541 if (mb_check_counter++ % 100 != 0) 539 if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
784 if (bh[i] == NULL) 782 if (bh[i] == NULL)
785 goto out; 783 goto out;
786 784
787 if (bh_uptodate_or_lock(bh[i])) 785 if (buffer_uptodate(bh[i]) &&
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
788 continue; 787 continue;
789 788
789 lock_buffer(bh[i]);
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 792 ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
2169{ 2169{
2170 struct ext4_sb_info *sbi = EXT4_SB(sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(sb);
2171 2171
2172 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2172 if (sbi->s_proc != NULL) {
2173 remove_proc_entry("mb_history", sbi->s_mb_proc); 2173 remove_proc_entry("mb_groups", sbi->s_proc);
2174 2174 remove_proc_entry("mb_history", sbi->s_proc);
2175 }
2175 kfree(sbi->s_mb_history); 2176 kfree(sbi->s_mb_history);
2176} 2177}
2177 2178
@@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
2180 struct ext4_sb_info *sbi = EXT4_SB(sb); 2181 struct ext4_sb_info *sbi = EXT4_SB(sb);
2181 int i; 2182 int i;
2182 2183
2183 if (sbi->s_mb_proc != NULL) { 2184 if (sbi->s_proc != NULL) {
2184 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2185 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2185 &ext4_mb_seq_history_fops, sb); 2186 &ext4_mb_seq_history_fops, sb);
2186 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2187 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2187 &ext4_mb_seq_groups_fops, sb); 2188 &ext4_mb_seq_groups_fops, sb);
2188 } 2189 }
2189 2190
@@ -2299,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2299 } 2300 }
2300 2301
2301 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL;;
2302 2304
2303#ifdef DOUBLE_CHECK 2305#ifdef DOUBLE_CHECK
2304 { 2306 {
@@ -2485,19 +2487,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2485 unsigned max; 2487 unsigned max;
2486 int ret; 2488 int ret;
2487 2489
2488 if (!test_opt(sb, MBALLOC))
2489 return 0;
2490
2491 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2490 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2492 2491
2493 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2492 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2494 if (sbi->s_mb_offsets == NULL) { 2493 if (sbi->s_mb_offsets == NULL) {
2495 clear_opt(sbi->s_mount_opt, MBALLOC);
2496 return -ENOMEM; 2494 return -ENOMEM;
2497 } 2495 }
2498 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2499 if (sbi->s_mb_maxs == NULL) { 2497 if (sbi->s_mb_maxs == NULL) {
2500 clear_opt(sbi->s_mount_opt, MBALLOC);
2501 kfree(sbi->s_mb_maxs); 2498 kfree(sbi->s_mb_maxs);
2502 return -ENOMEM; 2499 return -ENOMEM;
2503 } 2500 }
@@ -2520,16 +2517,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2520 /* init file for buddy data */ 2517 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2518 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2519 if (ret != 0) {
2523 clear_opt(sbi->s_mount_opt, MBALLOC);
2524 kfree(sbi->s_mb_offsets); 2520 kfree(sbi->s_mb_offsets);
2525 kfree(sbi->s_mb_maxs); 2521 kfree(sbi->s_mb_maxs);
2526 return ret; 2522 return ret;
2527 } 2523 }
2528 2524
2529 spin_lock_init(&sbi->s_md_lock); 2525 spin_lock_init(&sbi->s_md_lock);
2530 INIT_LIST_HEAD(&sbi->s_active_transaction);
2531 INIT_LIST_HEAD(&sbi->s_closed_transaction);
2532 INIT_LIST_HEAD(&sbi->s_committed_transaction);
2533 spin_lock_init(&sbi->s_bal_lock); 2526 spin_lock_init(&sbi->s_bal_lock);
2534 2527
2535 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2528 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2540,17 +2533,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2540 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2533 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2541 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2534 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2542 2535
2543 i = sizeof(struct ext4_locality_group) * nr_cpu_ids; 2536 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2544 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2545 if (sbi->s_locality_groups == NULL) { 2537 if (sbi->s_locality_groups == NULL) {
2546 clear_opt(sbi->s_mount_opt, MBALLOC);
2547 kfree(sbi->s_mb_offsets); 2538 kfree(sbi->s_mb_offsets);
2548 kfree(sbi->s_mb_maxs); 2539 kfree(sbi->s_mb_maxs);
2549 return -ENOMEM; 2540 return -ENOMEM;
2550 } 2541 }
2551 for (i = 0; i < nr_cpu_ids; i++) { 2542 for_each_possible_cpu(i) {
2552 struct ext4_locality_group *lg; 2543 struct ext4_locality_group *lg;
2553 lg = &sbi->s_locality_groups[i]; 2544 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2554 mutex_init(&lg->lg_mutex); 2545 mutex_init(&lg->lg_mutex);
2555 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2546 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2556 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2547 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
@@ -2560,7 +2551,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2560 ext4_mb_init_per_dev_proc(sb); 2551 ext4_mb_init_per_dev_proc(sb);
2561 ext4_mb_history_init(sb); 2552 ext4_mb_history_init(sb);
2562 2553
2563 printk("EXT4-fs: mballoc enabled\n"); 2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2564 return 0; 2557 return 0;
2565} 2558}
2566 2559
@@ -2575,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2575 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2568 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2576 list_del(&pa->pa_group_list); 2569 list_del(&pa->pa_group_list);
2577 count++; 2570 count++;
2578 kfree(pa); 2571 kmem_cache_free(ext4_pspace_cachep, pa);
2579 } 2572 }
2580 if (count) 2573 if (count)
2581 mb_debug("mballoc: %u PAs left\n", count); 2574 mb_debug("mballoc: %u PAs left\n", count);
@@ -2589,18 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
2589 struct ext4_group_info *grinfo; 2582 struct ext4_group_info *grinfo;
2590 struct ext4_sb_info *sbi = EXT4_SB(sb); 2583 struct ext4_sb_info *sbi = EXT4_SB(sb);
2591 2584
2592 if (!test_opt(sb, MBALLOC))
2593 return 0;
2594
2595 /* release freed, non-committed blocks */
2596 spin_lock(&sbi->s_md_lock);
2597 list_splice_init(&sbi->s_closed_transaction,
2598 &sbi->s_committed_transaction);
2599 list_splice_init(&sbi->s_active_transaction,
2600 &sbi->s_committed_transaction);
2601 spin_unlock(&sbi->s_md_lock);
2602 ext4_mb_free_committed_blocks(sb);
2603
2604 if (sbi->s_group_info) { 2585 if (sbi->s_group_info) {
2605 for (i = 0; i < sbi->s_groups_count; i++) { 2586 for (i = 0; i < sbi->s_groups_count; i++) {
2606 grinfo = ext4_get_group_info(sb, i); 2587 grinfo = ext4_get_group_info(sb, i);
@@ -2647,69 +2628,64 @@ int ext4_mb_release(struct super_block *sb)
2647 atomic_read(&sbi->s_mb_discarded)); 2628 atomic_read(&sbi->s_mb_discarded));
2648 } 2629 }
2649 2630
2650 kfree(sbi->s_locality_groups); 2631 free_percpu(sbi->s_locality_groups);
2651
2652 ext4_mb_history_release(sb); 2632 ext4_mb_history_release(sb);
2653 ext4_mb_destroy_per_dev_proc(sb); 2633 ext4_mb_destroy_per_dev_proc(sb);
2654 2634
2655 return 0; 2635 return 0;
2656} 2636}
2657 2637
2658static noinline_for_stack void 2638/*
2659ext4_mb_free_committed_blocks(struct super_block *sb) 2639 * This function is called by the jbd2 layer once the commit has finished,
2640 * so we know we can free the blocks that were released with that commit.
2641 */
2642static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2660{ 2643{
2661 struct ext4_sb_info *sbi = EXT4_SB(sb); 2644 struct super_block *sb = journal->j_private;
2662 int err;
2663 int i;
2664 int count = 0;
2665 int count2 = 0;
2666 struct ext4_free_metadata *md;
2667 struct ext4_buddy e4b; 2645 struct ext4_buddy e4b;
2646 struct ext4_group_info *db;
2647 int err, count = 0, count2 = 0;
2648 struct ext4_free_data *entry;
2649 ext4_fsblk_t discard_block;
2650 struct list_head *l, *ltmp;
2668 2651
2669 if (list_empty(&sbi->s_committed_transaction)) 2652 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2670 return; 2653 entry = list_entry(l, struct ext4_free_data, list);
2671
2672 /* there is committed blocks to be freed yet */
2673 do {
2674 /* get next array of blocks */
2675 md = NULL;
2676 spin_lock(&sbi->s_md_lock);
2677 if (!list_empty(&sbi->s_committed_transaction)) {
2678 md = list_entry(sbi->s_committed_transaction.next,
2679 struct ext4_free_metadata, list);
2680 list_del(&md->list);
2681 }
2682 spin_unlock(&sbi->s_md_lock);
2683
2684 if (md == NULL)
2685 break;
2686 2654
2687 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2655 mb_debug("gonna free %u blocks in group %lu (0x%p):",
2688 md->num, md->group, md); 2656 entry->count, entry->group, entry);
2689 2657
2690 err = ext4_mb_load_buddy(sb, md->group, &e4b); 2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2691 /* we expect to find existing buddy because it's pinned */ 2659 /* we expect to find existing buddy because it's pinned */
2692 BUG_ON(err != 0); 2660 BUG_ON(err != 0);
2693 2661
2662 db = e4b.bd_info;
2694 /* there are blocks to put in buddy to make them really free */ 2663 /* there are blocks to put in buddy to make them really free */
2695 count += md->num; 2664 count += entry->count;
2696 count2++; 2665 count2++;
2697 ext4_lock_group(sb, md->group); 2666 ext4_lock_group(sb, entry->group);
2698 for (i = 0; i < md->num; i++) { 2667 /* Take it out of per group rb tree */
2699 mb_debug(" %u", md->blocks[i]); 2668 rb_erase(&entry->node, &(db->bb_free_root));
2700 mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2669 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2670
2671 if (!db->bb_free_root.rb_node) {
2672 /* No more items in the per group rb tree
2673 * balance refcounts from ext4_mb_free_metadata()
2674 */
2675 page_cache_release(e4b.bd_buddy_page);
2676 page_cache_release(e4b.bd_bitmap_page);
2701 } 2677 }
2702 mb_debug("\n"); 2678 ext4_unlock_group(sb, entry->group);
2703 ext4_unlock_group(sb, md->group); 2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2704 2680 + entry->start_blk
2705 /* balance refcounts from ext4_mb_free_metadata() */ 2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2706 page_cache_release(e4b.bd_buddy_page); 2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
2707 page_cache_release(e4b.bd_bitmap_page); 2683 (unsigned long long) discard_block, entry->count);
2708 2684 sb_issue_discard(sb, discard_block, entry->count);
2709 kfree(md); 2685
2686 kmem_cache_free(ext4_free_ext_cachep, entry);
2710 ext4_mb_release_desc(&e4b); 2687 ext4_mb_release_desc(&e4b);
2711 2688 }
2712 } while (md);
2713 2689
2714 mb_debug("freed %u blocks in %u structures\n", count, count2); 2690 mb_debug("freed %u blocks in %u structures\n", count, count2);
2715} 2691}
@@ -2721,119 +2697,52 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2721#define EXT4_MB_STREAM_REQ "stream_req" 2697#define EXT4_MB_STREAM_REQ "stream_req"
2722#define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2698#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2723 2699
2724
2725
2726#define MB_PROC_FOPS(name) \
2727static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2728{ \
2729 struct ext4_sb_info *sbi = m->private; \
2730 \
2731 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2732 return 0; \
2733} \
2734 \
2735static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2736{ \
2737 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2738} \
2739 \
2740static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2741 const char __user *buf, size_t cnt, loff_t *ppos) \
2742{ \
2743 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2744 char str[32]; \
2745 long value; \
2746 if (cnt >= sizeof(str)) \
2747 return -EINVAL; \
2748 if (copy_from_user(str, buf, cnt)) \
2749 return -EFAULT; \
2750 value = simple_strtol(str, NULL, 0); \
2751 if (value <= 0) \
2752 return -ERANGE; \
2753 sbi->s_mb_##name = value; \
2754 return cnt; \
2755} \
2756 \
2757static const struct file_operations ext4_mb_##name##_proc_fops = { \
2758 .owner = THIS_MODULE, \
2759 .open = ext4_mb_##name##_proc_open, \
2760 .read = seq_read, \
2761 .llseek = seq_lseek, \
2762 .release = single_release, \
2763 .write = ext4_mb_##name##_proc_write, \
2764};
2765
2766MB_PROC_FOPS(stats);
2767MB_PROC_FOPS(max_to_scan);
2768MB_PROC_FOPS(min_to_scan);
2769MB_PROC_FOPS(order2_reqs);
2770MB_PROC_FOPS(stream_request);
2771MB_PROC_FOPS(group_prealloc);
2772
2773#define MB_PROC_HANDLER(name, var) \
2774do { \
2775 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2776 &ext4_mb_##var##_proc_fops, sbi); \
2777 if (proc == NULL) { \
2778 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2779 goto err_out; \
2780 } \
2781} while (0)
2782
2783static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2700static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2784{ 2701{
2702#ifdef CONFIG_PROC_FS
2785 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2703 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2786 struct ext4_sb_info *sbi = EXT4_SB(sb); 2704 struct ext4_sb_info *sbi = EXT4_SB(sb);
2787 struct proc_dir_entry *proc; 2705 struct proc_dir_entry *proc;
2788 char devname[64];
2789 2706
2790 if (proc_root_ext4 == NULL) { 2707 if (sbi->s_proc == NULL)
2791 sbi->s_mb_proc = NULL;
2792 return -EINVAL; 2708 return -EINVAL;
2793 }
2794 bdevname(sb->s_bdev, devname);
2795 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2796
2797 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2798 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2799 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2800 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2801 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2802 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2803 2709
2710 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2711 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2712 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2713 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2714 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2715 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2804 return 0; 2716 return 0;
2805 2717
2806err_out: 2718err_out:
2807 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2719 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2808 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2720 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2809 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2721 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2810 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2722 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2811 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2723 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2812 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2724 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2813 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2814 remove_proc_entry(devname, proc_root_ext4);
2815 sbi->s_mb_proc = NULL;
2816
2817 return -ENOMEM; 2725 return -ENOMEM;
2726#else
2727 return 0;
2728#endif
2818} 2729}
2819 2730
2820static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2731static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2821{ 2732{
2733#ifdef CONFIG_PROC_FS
2822 struct ext4_sb_info *sbi = EXT4_SB(sb); 2734 struct ext4_sb_info *sbi = EXT4_SB(sb);
2823 char devname[64];
2824 2735
2825 if (sbi->s_mb_proc == NULL) 2736 if (sbi->s_proc == NULL)
2826 return -EINVAL; 2737 return -EINVAL;
2827 2738
2828 bdevname(sb->s_bdev, devname); 2739 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2829 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2740 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2830 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2741 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2831 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2742 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2832 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2743 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2833 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2744 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2834 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); 2745#endif
2835 remove_proc_entry(devname, proc_root_ext4);
2836
2837 return 0; 2746 return 0;
2838} 2747}
2839 2748
@@ -2854,11 +2763,16 @@ int __init init_ext4_mballoc(void)
2854 kmem_cache_destroy(ext4_pspace_cachep); 2763 kmem_cache_destroy(ext4_pspace_cachep);
2855 return -ENOMEM; 2764 return -ENOMEM;
2856 } 2765 }
2857#ifdef CONFIG_PROC_FS 2766
2858 proc_root_ext4 = proc_mkdir("fs/ext4", NULL); 2767 ext4_free_ext_cachep =
2859 if (proc_root_ext4 == NULL) 2768 kmem_cache_create("ext4_free_block_extents",
2860 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); 2769 sizeof(struct ext4_free_data),
2861#endif 2770 0, SLAB_RECLAIM_ACCOUNT, NULL);
2771 if (ext4_free_ext_cachep == NULL) {
2772 kmem_cache_destroy(ext4_pspace_cachep);
2773 kmem_cache_destroy(ext4_ac_cachep);
2774 return -ENOMEM;
2775 }
2862 return 0; 2776 return 0;
2863} 2777}
2864 2778
@@ -2867,9 +2781,7 @@ void exit_ext4_mballoc(void)
2867 /* XXX: synchronize_rcu(); */ 2781 /* XXX: synchronize_rcu(); */
2868 kmem_cache_destroy(ext4_pspace_cachep); 2782 kmem_cache_destroy(ext4_pspace_cachep);
2869 kmem_cache_destroy(ext4_ac_cachep); 2783 kmem_cache_destroy(ext4_ac_cachep);
2870#ifdef CONFIG_PROC_FS 2784 kmem_cache_destroy(ext4_free_ext_cachep);
2871 remove_proc_entry("fs/ext4", NULL);
2872#endif
2873} 2785}
2874 2786
2875 2787
@@ -2879,7 +2791,7 @@ void exit_ext4_mballoc(void)
2879 */ 2791 */
2880static noinline_for_stack int 2792static noinline_for_stack int
2881ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2882 handle_t *handle) 2794 handle_t *handle, unsigned long reserv_blks)
2883{ 2795{
2884 struct buffer_head *bitmap_bh = NULL; 2796 struct buffer_head *bitmap_bh = NULL;
2885 struct ext4_super_block *es; 2797 struct ext4_super_block *es;
@@ -2968,15 +2880,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2968 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2969 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2970 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2971 2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2972 /* 2884 /*
2973 * free blocks account has already be reduced/reserved 2885 * Now reduce the dirty block count also. Should not go negative
2974 * at write_begin() time for delayed allocation
2975 * do not double accounting
2976 */ 2886 */
2977 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2887 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2978 percpu_counter_sub(&sbi->s_freeblocks_counter, 2888 /* release all the reserved blocks if non delalloc */
2979 ac->ac_b_ex.fe_len); 2889 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2890 else
2891 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2892 ac->ac_b_ex.fe_len);
2980 2893
2981 if (sbi->s_log_groups_per_flex) { 2894 if (sbi->s_log_groups_per_flex) {
2982 ext4_group_t flex_group = ext4_flex_group(sbi, 2895 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3282,6 +3195,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3282} 3195}
3283 3196
3284/* 3197/*
3198 * Return the prealloc space that have minimal distance
3199 * from the goal block. @cpa is the prealloc
3200 * space that is having currently known minimal distance
3201 * from the goal block.
3202 */
3203static struct ext4_prealloc_space *
3204ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3205 struct ext4_prealloc_space *pa,
3206 struct ext4_prealloc_space *cpa)
3207{
3208 ext4_fsblk_t cur_distance, new_distance;
3209
3210 if (cpa == NULL) {
3211 atomic_inc(&pa->pa_count);
3212 return pa;
3213 }
3214 cur_distance = abs(goal_block - cpa->pa_pstart);
3215 new_distance = abs(goal_block - pa->pa_pstart);
3216
3217 if (cur_distance < new_distance)
3218 return cpa;
3219
3220 /* drop the previous reference */
3221 atomic_dec(&cpa->pa_count);
3222 atomic_inc(&pa->pa_count);
3223 return pa;
3224}
3225
3226/*
3285 * search goal blocks in preallocated space 3227 * search goal blocks in preallocated space
3286 */ 3228 */
3287static noinline_for_stack int 3229static noinline_for_stack int
@@ -3290,7 +3232,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3290 int order, i; 3232 int order, i;
3291 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3233 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3292 struct ext4_locality_group *lg; 3234 struct ext4_locality_group *lg;
3293 struct ext4_prealloc_space *pa; 3235 struct ext4_prealloc_space *pa, *cpa = NULL;
3236 ext4_fsblk_t goal_block;
3294 3237
3295 /* only data can be preallocated */ 3238 /* only data can be preallocated */
3296 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3239 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3333,6 +3276,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3333 /* The max size of hash table is PREALLOC_TB_SIZE */ 3276 /* The max size of hash table is PREALLOC_TB_SIZE */
3334 order = PREALLOC_TB_SIZE - 1; 3277 order = PREALLOC_TB_SIZE - 1;
3335 3278
3279 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
3280 ac->ac_g_ex.fe_start +
3281 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3282 /*
3283 * search for the prealloc space that is having
3284 * minimal distance from the goal block.
3285 */
3336 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3286 for (i = order; i < PREALLOC_TB_SIZE; i++) {
3337 rcu_read_lock(); 3287 rcu_read_lock();
3338 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3288 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
@@ -3340,17 +3290,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3340 spin_lock(&pa->pa_lock); 3290 spin_lock(&pa->pa_lock);
3341 if (pa->pa_deleted == 0 && 3291 if (pa->pa_deleted == 0 &&
3342 pa->pa_free >= ac->ac_o_ex.fe_len) { 3292 pa->pa_free >= ac->ac_o_ex.fe_len) {
3343 atomic_inc(&pa->pa_count); 3293
3344 ext4_mb_use_group_pa(ac, pa); 3294 cpa = ext4_mb_check_group_pa(goal_block,
3345 spin_unlock(&pa->pa_lock); 3295 pa, cpa);
3346 ac->ac_criteria = 20;
3347 rcu_read_unlock();
3348 return 1;
3349 } 3296 }
3350 spin_unlock(&pa->pa_lock); 3297 spin_unlock(&pa->pa_lock);
3351 } 3298 }
3352 rcu_read_unlock(); 3299 rcu_read_unlock();
3353 } 3300 }
3301 if (cpa) {
3302 ext4_mb_use_group_pa(ac, cpa);
3303 ac->ac_criteria = 20;
3304 return 1;
3305 }
3354 return 0; 3306 return 0;
3355} 3307}
3356 3308
@@ -3845,7 +3797,7 @@ out:
3845 * 3797 *
3846 * FIXME!! Make sure it is valid at all the call sites 3798 * FIXME!! Make sure it is valid at all the call sites
3847 */ 3799 */
3848void ext4_mb_discard_inode_preallocations(struct inode *inode) 3800void ext4_discard_preallocations(struct inode *inode)
3849{ 3801{
3850 struct ext4_inode_info *ei = EXT4_I(inode); 3802 struct ext4_inode_info *ei = EXT4_I(inode);
3851 struct super_block *sb = inode->i_sb; 3803 struct super_block *sb = inode->i_sb;
@@ -3857,7 +3809,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
3857 struct ext4_buddy e4b; 3809 struct ext4_buddy e4b;
3858 int err; 3810 int err;
3859 3811
3860 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3812 if (!S_ISREG(inode->i_mode)) {
3861 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3813 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3862 return; 3814 return;
3863 } 3815 }
@@ -4055,8 +4007,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4055 * per cpu locality group is to reduce the contention between block 4007 * per cpu locality group is to reduce the contention between block
4056 * request from multiple CPUs. 4008 * request from multiple CPUs.
4057 */ 4009 */
4058 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4010 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
4059 put_cpu();
4060 4011
4061 /* we're going to use group allocation */ 4012 /* we're going to use group allocation */
4062 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4013 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4330,33 +4281,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4330ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4281ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4331 struct ext4_allocation_request *ar, int *errp) 4282 struct ext4_allocation_request *ar, int *errp)
4332{ 4283{
4284 int freed;
4333 struct ext4_allocation_context *ac = NULL; 4285 struct ext4_allocation_context *ac = NULL;
4334 struct ext4_sb_info *sbi; 4286 struct ext4_sb_info *sbi;
4335 struct super_block *sb; 4287 struct super_block *sb;
4336 ext4_fsblk_t block = 0; 4288 ext4_fsblk_t block = 0;
4337 int freed; 4289 unsigned long inquota;
4338 int inquota; 4290 unsigned long reserv_blks = 0;
4339 4291
4340 sb = ar->inode->i_sb; 4292 sb = ar->inode->i_sb;
4341 sbi = EXT4_SB(sb); 4293 sbi = EXT4_SB(sb);
4342 4294
4343 if (!test_opt(sb, MBALLOC)) {
4344 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4345 &(ar->len), errp);
4346 return block;
4347 }
4348 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4349 /* 4296 /*
4350 * With delalloc we already reserved the blocks 4297 * With delalloc we already reserved the blocks
4351 */ 4298 */
4352 ar->len = ext4_has_free_blocks(sbi, ar->len); 4299 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4353 } 4300 /* let others to free the space */
4354 4301 yield();
4355 if (ar->len == 0) { 4302 ar->len = ar->len >> 1;
4356 *errp = -ENOSPC; 4303 }
4357 return 0; 4304 if (!ar->len) {
4305 *errp = -ENOSPC;
4306 return 0;
4307 }
4308 reserv_blks = ar->len;
4358 } 4309 }
4359
4360 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4310 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4361 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4311 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4362 ar->len--; 4312 ar->len--;
@@ -4377,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4377 goto out1; 4327 goto out1;
4378 } 4328 }
4379 4329
4380 ext4_mb_poll_new_transaction(sb, handle);
4381
4382 *errp = ext4_mb_initialize_context(ac, ar); 4330 *errp = ext4_mb_initialize_context(ac, ar);
4383 if (*errp) { 4331 if (*errp) {
4384 ar->len = 0; 4332 ar->len = 0;
@@ -4402,7 +4350,7 @@ repeat:
4402 } 4350 }
4403 4351
4404 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4405 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4406 if (*errp == -EAGAIN) { 4354 if (*errp == -EAGAIN) {
4407 ac->ac_b_ex.fe_group = 0; 4355 ac->ac_b_ex.fe_group = 0;
4408 ac->ac_b_ex.fe_start = 0; 4356 ac->ac_b_ex.fe_start = 0;
@@ -4437,35 +4385,20 @@ out1:
4437 4385
4438 return block; 4386 return block;
4439} 4387}
4440static void ext4_mb_poll_new_transaction(struct super_block *sb,
4441 handle_t *handle)
4442{
4443 struct ext4_sb_info *sbi = EXT4_SB(sb);
4444 4388
4445 if (sbi->s_last_transaction == handle->h_transaction->t_tid) 4389/*
4446 return; 4390 * We can merge two free data extents only if the physical blocks
4447 4391 * are contiguous, AND the extents were freed by the same transaction,
4448 /* new transaction! time to close last one and free blocks for 4392 * AND the blocks are associated with the same group.
4449 * committed transaction. we know that only transaction can be 4393 */
4450 * active, so previos transaction can be being logged and we 4394static int can_merge(struct ext4_free_data *entry1,
4451 * know that transaction before previous is known to be already 4395 struct ext4_free_data *entry2)
4452 * logged. this means that now we may free blocks freed in all 4396{
4453 * transactions before previous one. hope I'm clear enough ... */ 4397 if ((entry1->t_tid == entry2->t_tid) &&
4454 4398 (entry1->group == entry2->group) &&
4455 spin_lock(&sbi->s_md_lock); 4399 ((entry1->start_blk + entry1->count) == entry2->start_blk))
4456 if (sbi->s_last_transaction != handle->h_transaction->t_tid) { 4400 return 1;
4457 mb_debug("new transaction %lu, old %lu\n", 4401 return 0;
4458 (unsigned long) handle->h_transaction->t_tid,
4459 (unsigned long) sbi->s_last_transaction);
4460 list_splice_init(&sbi->s_closed_transaction,
4461 &sbi->s_committed_transaction);
4462 list_splice_init(&sbi->s_active_transaction,
4463 &sbi->s_closed_transaction);
4464 sbi->s_last_transaction = handle->h_transaction->t_tid;
4465 }
4466 spin_unlock(&sbi->s_md_lock);
4467
4468 ext4_mb_free_committed_blocks(sb);
4469} 4402}
4470 4403
4471static noinline_for_stack int 4404static noinline_for_stack int
@@ -4475,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4475 struct ext4_group_info *db = e4b->bd_info; 4408 struct ext4_group_info *db = e4b->bd_info;
4476 struct super_block *sb = e4b->bd_sb; 4409 struct super_block *sb = e4b->bd_sb;
4477 struct ext4_sb_info *sbi = EXT4_SB(sb); 4410 struct ext4_sb_info *sbi = EXT4_SB(sb);
4478 struct ext4_free_metadata *md; 4411 struct ext4_free_data *entry, *new_entry;
4479 int i; 4412 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node;
4414
4480 4415
4481 BUG_ON(e4b->bd_bitmap_page == NULL); 4416 BUG_ON(e4b->bd_bitmap_page == NULL);
4482 BUG_ON(e4b->bd_buddy_page == NULL); 4417 BUG_ON(e4b->bd_buddy_page == NULL);
4483 4418
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node;
4425
4484 ext4_lock_group(sb, group); 4426 ext4_lock_group(sb, group);
4485 for (i = 0; i < count; i++) { 4427 if (!*n) {
4486 md = db->bb_md_cur; 4428 /* first free block exent. We need to
4487 if (md && db->bb_tid != handle->h_transaction->t_tid) { 4429 protect buddy cache from being freed,
4488 db->bb_md_cur = NULL; 4430 * otherwise we'll refresh it from
4489 md = NULL; 4431 * on-disk bitmap and lose not-yet-available
4432 * blocks */
4433 page_cache_get(e4b->bd_buddy_page);
4434 page_cache_get(e4b->bd_bitmap_page);
4435 }
4436 while (*n) {
4437 parent = *n;
4438 entry = rb_entry(parent, struct ext4_free_data, node);
4439 if (block < entry->start_blk)
4440 n = &(*n)->rb_left;
4441 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right;
4443 else {
4444 ext4_error(sb, __func__,
4445 "Double free of blocks %d (%d %d)\n",
4446 block, entry->start_blk, entry->count);
4447 return 0;
4490 } 4448 }
4449 }
4491 4450
4492 if (md == NULL) { 4451 rb_link_node(new_node, parent, n);
4493 ext4_unlock_group(sb, group); 4452 rb_insert_color(new_node, &db->bb_free_root);
4494 md = kmalloc(sizeof(*md), GFP_NOFS); 4453
4495 if (md == NULL) 4454 /* Now try to see the extent can be merged to left and right */
4496 return -ENOMEM; 4455 node = rb_prev(new_node);
4497 md->num = 0; 4456 if (node) {
4498 md->group = group; 4457 entry = rb_entry(node, struct ext4_free_data, node);
4499 4458 if (can_merge(entry, new_entry)) {
4500 ext4_lock_group(sb, group); 4459 new_entry->start_blk = entry->start_blk;
4501 if (db->bb_md_cur == NULL) { 4460 new_entry->count += entry->count;
4502 spin_lock(&sbi->s_md_lock); 4461 rb_erase(node, &(db->bb_free_root));
4503 list_add(&md->list, &sbi->s_active_transaction); 4462 spin_lock(&sbi->s_md_lock);
4504 spin_unlock(&sbi->s_md_lock); 4463 list_del(&entry->list);
4505 /* protect buddy cache from being freed, 4464 spin_unlock(&sbi->s_md_lock);
4506 * otherwise we'll refresh it from 4465 kmem_cache_free(ext4_free_ext_cachep, entry);
4507 * on-disk bitmap and lose not-yet-available
4508 * blocks */
4509 page_cache_get(e4b->bd_buddy_page);
4510 page_cache_get(e4b->bd_bitmap_page);
4511 db->bb_md_cur = md;
4512 db->bb_tid = handle->h_transaction->t_tid;
4513 mb_debug("new md 0x%p for group %lu\n",
4514 md, md->group);
4515 } else {
4516 kfree(md);
4517 md = db->bb_md_cur;
4518 }
4519 } 4466 }
4467 }
4520 4468
4521 BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); 4469 node = rb_next(new_node);
4522 md->blocks[md->num] = block + i; 4470 if (node) {
4523 md->num++; 4471 entry = rb_entry(node, struct ext4_free_data, node);
4524 if (md->num == EXT4_BB_MAX_BLOCKS) { 4472 if (can_merge(new_entry, entry)) {
4525 /* no more space, put full container on a sb's list */ 4473 new_entry->count += entry->count;
4526 db->bb_md_cur = NULL; 4474 rb_erase(node, &(db->bb_free_root));
4475 spin_lock(&sbi->s_md_lock);
4476 list_del(&entry->list);
4477 spin_unlock(&sbi->s_md_lock);
4478 kmem_cache_free(ext4_free_ext_cachep, entry);
4527 } 4479 }
4528 } 4480 }
4481 /* Add the extent to transaction's private list */
4482 spin_lock(&sbi->s_md_lock);
4483 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4484 spin_unlock(&sbi->s_md_lock);
4529 ext4_unlock_group(sb, group); 4485 ext4_unlock_group(sb, group);
4530 return 0; 4486 return 0;
4531} 4487}
@@ -4553,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4553 4509
4554 *freed = 0; 4510 *freed = 0;
4555 4511
4556 ext4_mb_poll_new_transaction(sb, handle);
4557
4558 sbi = EXT4_SB(sb); 4512 sbi = EXT4_SB(sb);
4559 es = EXT4_SB(sb)->s_es; 4513 es = EXT4_SB(sb)->s_es;
4560 if (block < le32_to_cpu(es->s_first_data_block) || 4514 if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a75..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h>
22#include <linux/marker.h>
21#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
22#include "ext4.h" 24#include "ext4.h"
23#include "group.h" 25#include "group.h"
@@ -98,23 +100,29 @@
98 100
99static struct kmem_cache *ext4_pspace_cachep; 101static struct kmem_cache *ext4_pspace_cachep;
100static struct kmem_cache *ext4_ac_cachep; 102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
101 104
102#ifdef EXT4_BB_MAX_BLOCKS 105struct ext4_free_data {
103#undef EXT4_BB_MAX_BLOCKS 106 /* this links the free block information from group_info */
104#endif 107 struct rb_node node;
105#define EXT4_BB_MAX_BLOCKS 30
106 108
107struct ext4_free_metadata { 109 /* this links the free block information from ext4_sb_info */
108 ext4_group_t group;
109 unsigned short num;
110 ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
111 struct list_head list; 110 struct list_head list;
111
112 /* group which free block extent belongs */
113 ext4_group_t group;
114
115 /* free block extent */
116 ext4_grpblk_t start_blk;
117 ext4_grpblk_t count;
118
119 /* transaction which freed this extent */
120 tid_t t_tid;
112}; 121};
113 122
114struct ext4_group_info { 123struct ext4_group_info {
115 unsigned long bb_state; 124 unsigned long bb_state;
116 unsigned long bb_tid; 125 struct rb_root bb_free_root;
117 struct ext4_free_metadata *bb_md_cur;
118 unsigned short bb_first_free; 126 unsigned short bb_first_free;
119 unsigned short bb_free; 127 unsigned short bb_free;
120 unsigned short bb_fragments; 128 unsigned short bb_fragments;
@@ -257,13 +265,10 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
257 265
258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
259 267
260static struct proc_dir_entry *proc_root_ext4;
261struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
262 269
263static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
264 ext4_group_t group); 271 ext4_group_t group);
265static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
266static void ext4_mb_free_committed_blocks(struct super_block *);
267static void ext4_mb_return_to_preallocation(struct inode *inode, 272static void ext4_mb_return_to_preallocation(struct inode *inode,
268 struct ext4_buddy *e4b, sector_t block, 273 struct ext4_buddy *e4b, sector_t block,
269 int count); 274 int count);
@@ -271,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
271 struct super_block *, struct ext4_prealloc_space *pa); 276 struct super_block *, struct ext4_prealloc_space *pa);
272static int ext4_mb_init_per_dev_proc(struct super_block *sb); 277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
273static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); 278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
274 280
275 281
276static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e9..f2a9cf498ecd 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
53 * credit. But below we try to not accumalate too much 53 * credit. But below we try to not accumalate too much
54 * of them by restarting the journal. 54 * of them by restarting the journal.
55 */ 55 */
56 needed = ext4_ext_calc_credits_for_insert(inode, path); 56 needed = ext4_ext_calc_credits_for_single_extent(inode,
57 lb->last_block - lb->first_block + 1, path);
57 58
58 /* 59 /*
59 * Make sure the credit we accumalated is not really high 60 * Make sure the credit we accumalated is not really high
@@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
446 447
447} 448}
448 449
449int ext4_ext_migrate(struct inode *inode, struct file *filp, 450int ext4_ext_migrate(struct inode *inode)
450 unsigned int cmd, unsigned long arg)
451{ 451{
452 handle_t *handle; 452 handle_t *handle;
453 int retval = 0, i; 453 int retval = 0, i;
@@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
515 * when we add extents we extent the journal 515 * when we add extents we extent the journal
516 */ 516 */
517 /* 517 /*
518 * inode_mutex prevent write and truncate on the file. Read still goes
519 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
520 * switch the inode format to prevent read.
521 */
522 mutex_lock(&(inode->i_mutex));
523 /*
524 * Even though we take i_mutex we can still cause block allocation 518 * Even though we take i_mutex we can still cause block allocation
525 * via mmap write to holes. If we have allocated new blocks we fail 519 * via mmap write to holes. If we have allocated new blocks we fail
526 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 520 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -622,7 +616,6 @@ err_out:
622 tmp_inode->i_nlink = 0; 616 tmp_inode->i_nlink = 0;
623 617
624 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
625 mutex_unlock(&(inode->i_mutex));
626 619
627 if (tmp_inode) 620 if (tmp_inode)
628 iput(tmp_inode); 621 iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c3..92db9e945147 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
151 151
152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
154static inline unsigned dx_get_hash (struct dx_entry *entry); 154static inline unsigned dx_get_hash(struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value); 155static void dx_set_hash(struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries); 156static unsigned dx_get_count(struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries); 157static unsigned dx_get_limit(struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value); 158static void dx_set_count(struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value); 159static void dx_set_limit(struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize); 160static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir); 161static unsigned dx_node_limit(struct inode *dir);
162static struct dx_frame *dx_probe(struct dentry *dentry, 162static struct dx_frame *dx_probe(const struct qstr *d_name,
163 struct inode *dir, 163 struct inode *dir,
164 struct dx_hash_info *hinfo, 164 struct dx_hash_info *hinfo,
165 struct dx_frame *frame, 165 struct dx_frame *frame,
166 int *err); 166 int *err);
167static void dx_release (struct dx_frame *frames); 167static void dx_release(struct dx_frame *frames);
168static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 168static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
169 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 169 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count); 170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 171static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
172 struct dx_map_entry *offsets, int count); 172 struct dx_map_entry *offsets, int count);
173static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 173static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
174static void dx_insert_block(struct dx_frame *frame, 174static void dx_insert_block(struct dx_frame *frame,
175 u32 hash, ext4_lblk_t block); 175 u32 hash, ext4_lblk_t block);
176static int ext4_htree_next_block(struct inode *dir, __u32 hash, 176static int ext4_htree_next_block(struct inode *dir, __u32 hash,
177 struct dx_frame *frame, 177 struct dx_frame *frame,
178 struct dx_frame *frames, 178 struct dx_frame *frames,
179 __u32 *start_hash); 179 __u32 *start_hash);
180static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 180static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
181 struct ext4_dir_entry_2 **res_dir, int *err); 181 const struct qstr *d_name,
182 struct ext4_dir_entry_2 **res_dir,
183 int *err);
182static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 184static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 185 struct inode *inode);
184 186
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
207 entry->block = cpu_to_le32(value); 209 entry->block = cpu_to_le32(value);
208} 210}
209 211
210static inline unsigned dx_get_hash (struct dx_entry *entry) 212static inline unsigned dx_get_hash(struct dx_entry *entry)
211{ 213{
212 return le32_to_cpu(entry->hash); 214 return le32_to_cpu(entry->hash);
213} 215}
214 216
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value) 217static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
216{ 218{
217 entry->hash = cpu_to_le32(value); 219 entry->hash = cpu_to_le32(value);
218} 220}
219 221
220static inline unsigned dx_get_count (struct dx_entry *entries) 222static inline unsigned dx_get_count(struct dx_entry *entries)
221{ 223{
222 return le16_to_cpu(((struct dx_countlimit *) entries)->count); 224 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223} 225}
224 226
225static inline unsigned dx_get_limit (struct dx_entry *entries) 227static inline unsigned dx_get_limit(struct dx_entry *entries)
226{ 228{
227 return le16_to_cpu(((struct dx_countlimit *) entries)->limit); 229 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228} 230}
229 231
230static inline void dx_set_count (struct dx_entry *entries, unsigned value) 232static inline void dx_set_count(struct dx_entry *entries, unsigned value)
231{ 233{
232 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); 234 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233} 235}
234 236
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value) 237static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
236{ 238{
237 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); 239 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238} 240}
239 241
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 242static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
241{ 243{
242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 244 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
243 EXT4_DIR_REC_LEN(2) - infosize; 245 EXT4_DIR_REC_LEN(2) - infosize;
244 return entry_space / sizeof(struct dx_entry); 246 return entry_space / sizeof(struct dx_entry);
245} 247}
246 248
247static inline unsigned dx_node_limit (struct inode *dir) 249static inline unsigned dx_node_limit(struct inode *dir)
248{ 250{
249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 251 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
250 return entry_space / sizeof(struct dx_entry); 252 return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
254 * Debug 256 * Debug
255 */ 257 */
256#ifdef DX_DEBUG 258#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries) 259static void dx_show_index(char * label, struct dx_entry *entries)
258{ 260{
259 int i, n = dx_get_count (entries); 261 int i, n = dx_get_count (entries);
260 printk("%s index ", label); 262 printk(KERN_DEBUG "%s index ", label);
261 for (i = 0; i < n; i++) { 263 for (i = 0; i < n; i++) {
262 printk("%x->%lu ", i? dx_get_hash(entries + i) : 264 printk("%x->%lu ", i ? dx_get_hash(entries + i) :
263 0, (unsigned long)dx_get_block(entries + i)); 265 0, (unsigned long)dx_get_block(entries + i));
264 } 266 }
265 printk("\n"); 267 printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306 struct dx_entry *entries, int levels) 308 struct dx_entry *entries, int levels)
307{ 309{
308 unsigned blocksize = dir->i_sb->s_blocksize; 310 unsigned blocksize = dir->i_sb->s_blocksize;
309 unsigned count = dx_get_count (entries), names = 0, space = 0, i; 311 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
310 unsigned bcount = 0; 312 unsigned bcount = 0;
311 struct buffer_head *bh; 313 struct buffer_head *bh;
312 int err; 314 int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
325 names += stats.names; 327 names += stats.names;
326 space += stats.space; 328 space += stats.space;
327 bcount += stats.bcount; 329 bcount += stats.bcount;
328 brelse (bh); 330 brelse(bh);
329 } 331 }
330 if (bcount) 332 if (bcount)
331 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", 333 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
332 names, space/bcount,(space/bcount)*100/blocksize); 334 levels ? "" : " ", names, space/bcount,
335 (space/bcount)*100/blocksize);
333 return (struct stats) { names, space, bcount}; 336 return (struct stats) { names, space, bcount};
334} 337}
335#endif /* DX_DEBUG */ 338#endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
344 * back to userspace. 347 * back to userspace.
345 */ 348 */
346static struct dx_frame * 349static struct dx_frame *
347dx_probe(struct dentry *dentry, struct inode *dir, 350dx_probe(const struct qstr *d_name, struct inode *dir,
348 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 351 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
349{ 352{
350 unsigned count, indirect; 353 unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
355 u32 hash; 358 u32 hash;
356 359
357 frame->bh = NULL; 360 frame->bh = NULL;
358 if (dentry)
359 dir = dentry->d_parent->d_inode;
360 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 361 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
361 goto fail; 362 goto fail;
362 root = (struct dx_root *) bh->b_data; 363 root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
372 } 373 }
373 hinfo->hash_version = root->info.hash_version; 374 hinfo->hash_version = root->info.hash_version;
374 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
375 if (dentry) 376 if (d_name)
376 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); 377 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
377 hash = hinfo->hash; 378 hash = hinfo->hash;
378 379
379 if (root->info.unused_flags & 1) { 380 if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
406 goto fail; 407 goto fail;
407 } 408 }
408 409
409 dxtrace (printk("Look up %x", hash)); 410 dxtrace(printk("Look up %x", hash));
410 while (1) 411 while (1)
411 { 412 {
412 count = dx_get_count(entries); 413 count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
555 0, &err))) 556 0, &err)))
556 return err; /* Failure */ 557 return err; /* Failure */
557 p++; 558 p++;
558 brelse (p->bh); 559 brelse(p->bh);
559 p->bh = bh; 560 p->bh = bh;
560 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; 561 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
561 } 562 }
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
593 /* On error, skip the f_pos to the next block. */ 594 /* On error, skip the f_pos to the next block. */
594 dir_file->f_pos = (dir_file->f_pos | 595 dir_file->f_pos = (dir_file->f_pos |
595 (dir->i_sb->s_blocksize - 1)) + 1; 596 (dir->i_sb->s_blocksize - 1)) + 1;
596 brelse (bh); 597 brelse(bh);
597 return count; 598 return count;
598 } 599 }
599 ext4fs_dirhash(de->name, de->name_len, hinfo); 600 ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
635 int ret, err; 636 int ret, err;
636 __u32 hashval; 637 __u32 hashval;
637 638
638 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 639 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
639 start_minor_hash)); 640 start_hash, start_minor_hash));
640 dir = dir_file->f_path.dentry->d_inode; 641 dir = dir_file->f_path.dentry->d_inode;
641 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
642 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
648 } 649 }
649 hinfo.hash = start_hash; 650 hinfo.hash = start_hash;
650 hinfo.minor_hash = 0; 651 hinfo.minor_hash = 0;
651 frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); 652 frame = dx_probe(NULL, dir, &hinfo, frames, &err);
652 if (!frame) 653 if (!frame)
653 return err; 654 return err;
654 655
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
694 break; 695 break;
695 } 696 }
696 dx_release(frames); 697 dx_release(frames);
697 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 698 dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
698 count, *next_hash)); 699 "next hash: %x\n", count, *next_hash));
699 return count; 700 return count;
700errout: 701errout:
701 dx_release(frames); 702 dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
802/* 803/*
803 * Returns 0 if not found, -1 on failure, and 1 on success 804 * Returns 0 if not found, -1 on failure, and 1 on success
804 */ 805 */
805static inline int search_dirblock(struct buffer_head * bh, 806static inline int search_dirblock(struct buffer_head *bh,
806 struct inode *dir, 807 struct inode *dir,
807 struct dentry *dentry, 808 const struct qstr *d_name,
808 unsigned long offset, 809 unsigned long offset,
809 struct ext4_dir_entry_2 ** res_dir) 810 struct ext4_dir_entry_2 ** res_dir)
810{ 811{
811 struct ext4_dir_entry_2 * de; 812 struct ext4_dir_entry_2 * de;
812 char * dlimit; 813 char * dlimit;
813 int de_len; 814 int de_len;
814 const char *name = dentry->d_name.name; 815 const char *name = d_name->name;
815 int namelen = dentry->d_name.len; 816 int namelen = d_name->len;
816 817
817 de = (struct ext4_dir_entry_2 *) bh->b_data; 818 de = (struct ext4_dir_entry_2 *) bh->b_data;
818 dlimit = bh->b_data + dir->i_sb->s_blocksize; 819 dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
851 * The returned buffer_head has ->b_count elevated. The caller is expected 852 * The returned buffer_head has ->b_count elevated. The caller is expected
852 * to brelse() it when appropriate. 853 * to brelse() it when appropriate.
853 */ 854 */
854static struct buffer_head * ext4_find_entry (struct dentry *dentry, 855static struct buffer_head * ext4_find_entry (struct inode *dir,
856 const struct qstr *d_name,
855 struct ext4_dir_entry_2 ** res_dir) 857 struct ext4_dir_entry_2 ** res_dir)
856{ 858{
857 struct super_block * sb; 859 struct super_block *sb;
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 860 struct buffer_head *bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 861 struct buffer_head *bh, *ret = NULL;
860 ext4_lblk_t start, block, b; 862 ext4_lblk_t start, block, b;
861 int ra_max = 0; /* Number of bh's in the readahead 863 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 864 buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
865 int num = 0; 867 int num = 0;
866 ext4_lblk_t nblocks; 868 ext4_lblk_t nblocks;
867 int i, err; 869 int i, err;
868 struct inode *dir = dentry->d_parent->d_inode;
869 int namelen; 870 int namelen;
870 871
871 *res_dir = NULL; 872 *res_dir = NULL;
872 sb = dir->i_sb; 873 sb = dir->i_sb;
873 namelen = dentry->d_name.len; 874 namelen = d_name->len;
874 if (namelen > EXT4_NAME_LEN) 875 if (namelen > EXT4_NAME_LEN)
875 return NULL; 876 return NULL;
876 if (is_dx(dir)) { 877 if (is_dx(dir)) {
877 bh = ext4_dx_find_entry(dentry, res_dir, &err); 878 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
878 /* 879 /*
879 * On success, or if the error was file not found, 880 * On success, or if the error was file not found,
880 * return. Otherwise, fall back to doing a search the 881 * return. Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
882 */ 883 */
883 if (bh || (err != ERR_BAD_DX_DIR)) 884 if (bh || (err != ERR_BAD_DX_DIR))
884 return bh; 885 return bh;
885 dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); 886 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
887 "falling back\n"));
886 } 888 }
887 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 889 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
888 start = EXT4_I(dir)->i_dir_start_lookup; 890 start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
926 brelse(bh); 928 brelse(bh);
927 goto next; 929 goto next;
928 } 930 }
929 i = search_dirblock(bh, dir, dentry, 931 i = search_dirblock(bh, dir, d_name,
930 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 932 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
931 if (i == 1) { 933 if (i == 1) {
932 EXT4_I(dir)->i_dir_start_lookup = block; 934 EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
956cleanup_and_exit: 958cleanup_and_exit:
957 /* Clean up the read-ahead blocks */ 959 /* Clean up the read-ahead blocks */
958 for (; ra_ptr < ra_max; ra_ptr++) 960 for (; ra_ptr < ra_max; ra_ptr++)
959 brelse (bh_use[ra_ptr]); 961 brelse(bh_use[ra_ptr]);
960 return ret; 962 return ret;
961} 963}
962 964
963static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 965static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
964 struct ext4_dir_entry_2 **res_dir, int *err) 966 struct ext4_dir_entry_2 **res_dir, int *err)
965{ 967{
966 struct super_block * sb; 968 struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
971 struct buffer_head *bh; 973 struct buffer_head *bh;
972 ext4_lblk_t block; 974 ext4_lblk_t block;
973 int retval; 975 int retval;
974 int namelen = dentry->d_name.len; 976 int namelen = d_name->len;
975 const u8 *name = dentry->d_name.name; 977 const u8 *name = d_name->name;
976 struct inode *dir = dentry->d_parent->d_inode;
977 978
978 sb = dir->i_sb; 979 sb = dir->i_sb;
979 /* NFS may look up ".." - look at dx_root directory block */ 980 /* NFS may look up ".." - look at dx_root directory block */
980 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ 981 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
981 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
982 return NULL; 983 return NULL;
983 } else { 984 } else {
984 frame = frames; 985 frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1010 return bh; 1011 return bh;
1011 } 1012 }
1012 } 1013 }
1013 brelse (bh); 1014 brelse(bh);
1014 /* Check to see if we should continue to search */ 1015 /* Check to see if we should continue to search */
1015 retval = ext4_htree_next_block(dir, hash, frame, 1016 retval = ext4_htree_next_block(dir, hash, frame,
1016 frames, NULL); 1017 frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1025 1026
1026 *err = -ENOENT; 1027 *err = -ENOENT;
1027errout: 1028errout:
1028 dxtrace(printk("%s not found\n", name)); 1029 dxtrace(printk(KERN_DEBUG "%s not found\n", name));
1029 dx_release (frames); 1030 dx_release (frames);
1030 return NULL; 1031 return NULL;
1031} 1032}
1032 1033
1033static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) 1034static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034{ 1035{
1035 struct inode * inode; 1036 struct inode *inode;
1036 struct ext4_dir_entry_2 * de; 1037 struct ext4_dir_entry_2 *de;
1037 struct buffer_head * bh; 1038 struct buffer_head *bh;
1038 1039
1039 if (dentry->d_name.len > EXT4_NAME_LEN) 1040 if (dentry->d_name.len > EXT4_NAME_LEN)
1040 return ERR_PTR(-ENAMETOOLONG); 1041 return ERR_PTR(-ENAMETOOLONG);
1041 1042
1042 bh = ext4_find_entry(dentry, &de); 1043 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1043 inode = NULL; 1044 inode = NULL;
1044 if (bh) { 1045 if (bh) {
1045 unsigned long ino = le32_to_cpu(de->inode); 1046 unsigned long ino = le32_to_cpu(de->inode);
1046 brelse (bh); 1047 brelse(bh);
1047 if (!ext4_valid_inum(dir->i_sb, ino)) { 1048 if (!ext4_valid_inum(dir->i_sb, ino)) {
1048 ext4_error(dir->i_sb, "ext4_lookup", 1049 ext4_error(dir->i_sb, "ext4_lookup",
1049 "bad inode number: %lu", ino); 1050 "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1062 unsigned long ino; 1063 unsigned long ino;
1063 struct dentry *parent; 1064 struct dentry *parent;
1064 struct inode *inode; 1065 struct inode *inode;
1065 struct dentry dotdot; 1066 static const struct qstr dotdot = {
1067 .name = "..",
1068 .len = 2,
1069 };
1066 struct ext4_dir_entry_2 * de; 1070 struct ext4_dir_entry_2 * de;
1067 struct buffer_head *bh; 1071 struct buffer_head *bh;
1068 1072
1069 dotdot.d_name.name = ".."; 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1070 dotdot.d_name.len = 2;
1071 dotdot.d_parent = child; /* confusing, isn't it! */
1072
1073 bh = ext4_find_entry(&dotdot, &de);
1074 inode = NULL; 1074 inode = NULL;
1075 if (!bh) 1075 if (!bh)
1076 return ERR_PTR(-ENOENT); 1076 return ERR_PTR(-ENOENT);
@@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1201 1201
1202 /* create map in the end of data2 block */ 1202 /* create map in the end of data2 block */
1203 map = (struct dx_map_entry *) (data2 + blocksize); 1203 map = (struct dx_map_entry *) (data2 + blocksize);
1204 count = dx_make_map ((struct ext4_dir_entry_2 *) data1, 1204 count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1205 blocksize, hinfo, map); 1205 blocksize, hinfo, map);
1206 map -= count; 1206 map -= count;
1207 dx_sort_map (map, count); 1207 dx_sort_map(map, count);
1208 /* Split the existing block in the middle, size-wise */ 1208 /* Split the existing block in the middle, size-wise */
1209 size = 0; 1209 size = 0;
1210 move = 0; 1210 move = 0;
@@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1225 1225
1226 /* Fancy dance to stay within two buffers */ 1226 /* Fancy dance to stay within two buffers */
1227 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1227 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1228 de = dx_pack_dirents(data1,blocksize); 1228 de = dx_pack_dirents(data1, blocksize);
1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1237 swap(*bh, bh2); 1237 swap(*bh, bh2);
1238 de = de2; 1238 de = de2;
1239 } 1239 }
1240 dx_insert_block (frame, hash2 + continued, newblock); 1240 dx_insert_block(frame, hash2 + continued, newblock);
1241 err = ext4_journal_dirty_metadata (handle, bh2); 1241 err = ext4_journal_dirty_metadata(handle, bh2);
1242 if (err) 1242 if (err)
1243 goto journal_error; 1243 goto journal_error;
1244 err = ext4_journal_dirty_metadata (handle, frame->bh); 1244 err = ext4_journal_dirty_metadata(handle, frame->bh);
1245 if (err) 1245 if (err)
1246 goto journal_error; 1246 goto journal_error;
1247 brelse (bh2); 1247 brelse(bh2);
1248 dxtrace(dx_show_index ("frame", frame->entries)); 1248 dxtrace(dx_show_index("frame", frame->entries));
1249 return de; 1249 return de;
1250 1250
1251journal_error: 1251journal_error:
@@ -1271,7 +1271,7 @@ errout:
1271 */ 1271 */
1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1273 struct inode *inode, struct ext4_dir_entry_2 *de, 1273 struct inode *inode, struct ext4_dir_entry_2 *de,
1274 struct buffer_head * bh) 1274 struct buffer_head *bh)
1275{ 1275{
1276 struct inode *dir = dentry->d_parent->d_inode; 1276 struct inode *dir = dentry->d_parent->d_inode;
1277 const char *name = dentry->d_name.name; 1277 const char *name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1288 while ((char *) de <= top) { 1288 while ((char *) de <= top) {
1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1290 bh, offset)) { 1290 bh, offset)) {
1291 brelse (bh); 1291 brelse(bh);
1292 return -EIO; 1292 return -EIO;
1293 } 1293 }
1294 if (ext4_match (namelen, name, de)) { 1294 if (ext4_match(namelen, name, de)) {
1295 brelse (bh); 1295 brelse(bh);
1296 return -EEXIST; 1296 return -EEXIST;
1297 } 1297 }
1298 nlen = EXT4_DIR_REC_LEN(de->name_len); 1298 nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1329 } else 1329 } else
1330 de->inode = 0; 1330 de->inode = 0;
1331 de->name_len = namelen; 1331 de->name_len = namelen;
1332 memcpy (de->name, name, namelen); 1332 memcpy(de->name, name, namelen);
1333 /* 1333 /*
1334 * XXX shouldn't update any times until successful 1334 * XXX shouldn't update any times until successful
1335 * completion of syscall, but too many callers depend 1335 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 struct fake_dirent *fde; 1377 struct fake_dirent *fde;
1378 1378
1379 blocksize = dir->i_sb->s_blocksize; 1379 blocksize = dir->i_sb->s_blocksize;
1380 dxtrace(printk("Creating index\n")); 1380 dxtrace(printk(KERN_DEBUG "Creating index\n"));
1381 retval = ext4_journal_get_write_access(handle, bh); 1381 retval = ext4_journal_get_write_access(handle, bh);
1382 if (retval) { 1382 if (retval) {
1383 ext4_std_error(dir->i_sb, retval); 1383 ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1386 } 1386 }
1387 root = (struct dx_root *) bh->b_data; 1387 root = (struct dx_root *) bh->b_data;
1388 1388
1389 bh2 = ext4_append (handle, dir, &block, &retval); 1389 bh2 = ext4_append(handle, dir, &block, &retval);
1390 if (!(bh2)) { 1390 if (!(bh2)) {
1391 brelse(bh); 1391 brelse(bh);
1392 return retval; 1392 return retval;
@@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1412 root->info.info_length = sizeof(root->info); 1412 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1414 entries = root->entries; 1414 entries = root->entries;
1415 dx_set_block (entries, 1); 1415 dx_set_block(entries, 1);
1416 dx_set_count (entries, 1); 1416 dx_set_count(entries, 1);
1417 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); 1417 dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1418 1418
1419 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1420 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1443 * may not sleep between calling this and putting something into 1443 * may not sleep between calling this and putting something into
1444 * the entry, as someone else might have used it while you slept. 1444 * the entry, as someone else might have used it while you slept.
1445 */ 1445 */
1446static int ext4_add_entry (handle_t *handle, struct dentry *dentry, 1446static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset; 1450 unsigned long offset;
1451 struct buffer_head * bh; 1451 struct buffer_head *bh;
1452 struct ext4_dir_entry_2 *de; 1452 struct ext4_dir_entry_2 *de;
1453 struct super_block * sb; 1453 struct super_block *sb;
1454 int retval; 1454 int retval;
1455 int dx_fallback=0; 1455 int dx_fallback=0;
1456 unsigned blocksize; 1456 unsigned blocksize;
@@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1500 struct dx_frame frames[2], *frame; 1500 struct dx_frame frames[2], *frame;
1501 struct dx_entry *entries, *at; 1501 struct dx_entry *entries, *at;
1502 struct dx_hash_info hinfo; 1502 struct dx_hash_info hinfo;
1503 struct buffer_head * bh; 1503 struct buffer_head *bh;
1504 struct inode *dir = dentry->d_parent->d_inode; 1504 struct inode *dir = dentry->d_parent->d_inode;
1505 struct super_block * sb = dir->i_sb; 1505 struct super_block *sb = dir->i_sb;
1506 struct ext4_dir_entry_2 *de; 1506 struct ext4_dir_entry_2 *de;
1507 int err; 1507 int err;
1508 1508
1509 frame = dx_probe(dentry, NULL, &hinfo, frames, &err); 1509 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1510 if (!frame) 1510 if (!frame)
1511 return err; 1511 return err;
1512 entries = frame->entries; 1512 entries = frame->entries;
@@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1527 } 1527 }
1528 1528
1529 /* Block full, should compress but for now just split */ 1529 /* Block full, should compress but for now just split */
1530 dxtrace(printk("using %u of %u node entries\n", 1530 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1531 dx_get_count(entries), dx_get_limit(entries))); 1531 dx_get_count(entries), dx_get_limit(entries)));
1532 /* Need to split index? */ 1532 /* Need to split index? */
1533 if (dx_get_count(entries) == dx_get_limit(entries)) { 1533 if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1559 if (levels) { 1559 if (levels) {
1560 unsigned icount1 = icount/2, icount2 = icount - icount1; 1560 unsigned icount1 = icount/2, icount2 = icount - icount1;
1561 unsigned hash2 = dx_get_hash(entries + icount1); 1561 unsigned hash2 = dx_get_hash(entries + icount1);
1562 dxtrace(printk("Split index %i/%i\n", icount1, icount2)); 1562 dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1563 icount1, icount2));
1563 1564
1564 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ 1565 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1565 err = ext4_journal_get_write_access(handle, 1566 err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1567 if (err) 1568 if (err)
1568 goto journal_error; 1569 goto journal_error;
1569 1570
1570 memcpy ((char *) entries2, (char *) (entries + icount1), 1571 memcpy((char *) entries2, (char *) (entries + icount1),
1571 icount2 * sizeof(struct dx_entry)); 1572 icount2 * sizeof(struct dx_entry));
1572 dx_set_count (entries, icount1); 1573 dx_set_count(entries, icount1);
1573 dx_set_count (entries2, icount2); 1574 dx_set_count(entries2, icount2);
1574 dx_set_limit (entries2, dx_node_limit(dir)); 1575 dx_set_limit(entries2, dx_node_limit(dir));
1575 1576
1576 /* Which index block gets the new entry? */ 1577 /* Which index block gets the new entry? */
1577 if (at - entries >= icount1) { 1578 if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1579 frame->entries = entries = entries2; 1580 frame->entries = entries = entries2;
1580 swap(frame->bh, bh2); 1581 swap(frame->bh, bh2);
1581 } 1582 }
1582 dx_insert_block (frames + 0, hash2, newblock); 1583 dx_insert_block(frames + 0, hash2, newblock);
1583 dxtrace(dx_show_index ("node", frames[1].entries)); 1584 dxtrace(dx_show_index("node", frames[1].entries));
1584 dxtrace(dx_show_index ("node", 1585 dxtrace(dx_show_index("node",
1585 ((struct dx_node *) bh2->b_data)->entries)); 1586 ((struct dx_node *) bh2->b_data)->entries));
1586 err = ext4_journal_dirty_metadata(handle, bh2); 1587 err = ext4_journal_dirty_metadata(handle, bh2);
1587 if (err) 1588 if (err)
1588 goto journal_error; 1589 goto journal_error;
1589 brelse (bh2); 1590 brelse (bh2);
1590 } else { 1591 } else {
1591 dxtrace(printk("Creating second level index...\n")); 1592 dxtrace(printk(KERN_DEBUG
1593 "Creating second level index...\n"));
1592 memcpy((char *) entries2, (char *) entries, 1594 memcpy((char *) entries2, (char *) entries,
1593 icount * sizeof(struct dx_entry)); 1595 icount * sizeof(struct dx_entry));
1594 dx_set_limit(entries2, dx_node_limit(dir)); 1596 dx_set_limit(entries2, dx_node_limit(dir));
@@ -1630,12 +1632,12 @@ cleanup:
1630 * ext4_delete_entry deletes a directory entry by merging it with the 1632 * ext4_delete_entry deletes a directory entry by merging it with the
1631 * previous entry 1633 * previous entry
1632 */ 1634 */
1633static int ext4_delete_entry (handle_t *handle, 1635static int ext4_delete_entry(handle_t *handle,
1634 struct inode * dir, 1636 struct inode *dir,
1635 struct ext4_dir_entry_2 * de_del, 1637 struct ext4_dir_entry_2 *de_del,
1636 struct buffer_head * bh) 1638 struct buffer_head *bh)
1637{ 1639{
1638 struct ext4_dir_entry_2 * de, * pde; 1640 struct ext4_dir_entry_2 *de, *pde;
1639 int i; 1641 int i;
1640 1642
1641 i = 0; 1643 i = 0;
@@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
1716 * If the create succeeds, we fill in the inode information 1718 * If the create succeeds, we fill in the inode information
1717 * with d_instantiate(). 1719 * with d_instantiate().
1718 */ 1720 */
1719static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, 1721static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1720 struct nameidata *nd) 1722 struct nameidata *nd)
1721{ 1723{
1722 handle_t *handle; 1724 handle_t *handle;
1723 struct inode * inode; 1725 struct inode *inode;
1724 int err, retries = 0; 1726 int err, retries = 0;
1725 1727
1726retry: 1728retry:
@@ -1747,8 +1749,8 @@ retry:
1747 return err; 1749 return err;
1748} 1750}
1749 1751
1750static int ext4_mknod (struct inode * dir, struct dentry *dentry, 1752static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1751 int mode, dev_t rdev) 1753 int mode, dev_t rdev)
1752{ 1754{
1753 handle_t *handle; 1755 handle_t *handle;
1754 struct inode *inode; 1756 struct inode *inode;
@@ -1767,11 +1769,11 @@ retry:
1767 if (IS_DIRSYNC(dir)) 1769 if (IS_DIRSYNC(dir))
1768 handle->h_sync = 1; 1770 handle->h_sync = 1;
1769 1771
1770 inode = ext4_new_inode (handle, dir, mode); 1772 inode = ext4_new_inode(handle, dir, mode);
1771 err = PTR_ERR(inode); 1773 err = PTR_ERR(inode);
1772 if (!IS_ERR(inode)) { 1774 if (!IS_ERR(inode)) {
1773 init_special_inode(inode, inode->i_mode, rdev); 1775 init_special_inode(inode, inode->i_mode, rdev);
1774#ifdef CONFIG_EXT4DEV_FS_XATTR 1776#ifdef CONFIG_EXT4_FS_XATTR
1775 inode->i_op = &ext4_special_inode_operations; 1777 inode->i_op = &ext4_special_inode_operations;
1776#endif 1778#endif
1777 err = ext4_add_nondir(handle, dentry, inode); 1779 err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@ retry:
1782 return err; 1784 return err;
1783} 1785}
1784 1786
1785static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) 1787static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1786{ 1788{
1787 handle_t *handle; 1789 handle_t *handle;
1788 struct inode * inode; 1790 struct inode *inode;
1789 struct buffer_head * dir_block; 1791 struct buffer_head *dir_block;
1790 struct ext4_dir_entry_2 * de; 1792 struct ext4_dir_entry_2 *de;
1791 int err, retries = 0; 1793 int err, retries = 0;
1792 1794
1793 if (EXT4_DIR_LINK_MAX(dir)) 1795 if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@ retry:
1803 if (IS_DIRSYNC(dir)) 1805 if (IS_DIRSYNC(dir))
1804 handle->h_sync = 1; 1806 handle->h_sync = 1;
1805 1807
1806 inode = ext4_new_inode (handle, dir, S_IFDIR | mode); 1808 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1807 err = PTR_ERR(inode); 1809 err = PTR_ERR(inode);
1808 if (IS_ERR(inode)) 1810 if (IS_ERR(inode))
1809 goto out_stop; 1811 goto out_stop;
@@ -1811,7 +1813,7 @@ retry:
1811 inode->i_op = &ext4_dir_inode_operations; 1813 inode->i_op = &ext4_dir_inode_operations;
1812 inode->i_fop = &ext4_dir_operations; 1814 inode->i_fop = &ext4_dir_operations;
1813 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1815 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1814 dir_block = ext4_bread (handle, inode, 0, 1, &err); 1816 dir_block = ext4_bread(handle, inode, 0, 1, &err);
1815 if (!dir_block) 1817 if (!dir_block)
1816 goto out_clear_inode; 1818 goto out_clear_inode;
1817 BUFFER_TRACE(dir_block, "get_write_access"); 1819 BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@ retry:
1820 de->inode = cpu_to_le32(inode->i_ino); 1822 de->inode = cpu_to_le32(inode->i_ino);
1821 de->name_len = 1; 1823 de->name_len = 1;
1822 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); 1824 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
1823 strcpy (de->name, "."); 1825 strcpy(de->name, ".");
1824 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1826 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1825 de = ext4_next_entry(de); 1827 de = ext4_next_entry(de);
1826 de->inode = cpu_to_le32(dir->i_ino); 1828 de->inode = cpu_to_le32(dir->i_ino);
1827 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - 1829 de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
1828 EXT4_DIR_REC_LEN(1)); 1830 EXT4_DIR_REC_LEN(1));
1829 de->name_len = 2; 1831 de->name_len = 2;
1830 strcpy (de->name, ".."); 1832 strcpy(de->name, "..");
1831 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1833 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1832 inode->i_nlink = 2; 1834 inode->i_nlink = 2;
1833 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1835 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1834 ext4_journal_dirty_metadata(handle, dir_block); 1836 ext4_journal_dirty_metadata(handle, dir_block);
1835 brelse (dir_block); 1837 brelse(dir_block);
1836 ext4_mark_inode_dirty(handle, inode); 1838 ext4_mark_inode_dirty(handle, inode);
1837 err = ext4_add_entry (handle, dentry, inode); 1839 err = ext4_add_entry(handle, dentry, inode);
1838 if (err) { 1840 if (err) {
1839out_clear_inode: 1841out_clear_inode:
1840 clear_nlink(inode); 1842 clear_nlink(inode);
1841 ext4_mark_inode_dirty(handle, inode); 1843 ext4_mark_inode_dirty(handle, inode);
1842 iput (inode); 1844 iput(inode);
1843 goto out_stop; 1845 goto out_stop;
1844 } 1846 }
1845 ext4_inc_count(handle, dir); 1847 ext4_inc_count(handle, dir);
@@ -1856,17 +1858,17 @@ out_stop:
1856/* 1858/*
1857 * routine to check that the specified directory is empty (for rmdir) 1859 * routine to check that the specified directory is empty (for rmdir)
1858 */ 1860 */
1859static int empty_dir (struct inode * inode) 1861static int empty_dir(struct inode *inode)
1860{ 1862{
1861 unsigned long offset; 1863 unsigned long offset;
1862 struct buffer_head * bh; 1864 struct buffer_head *bh;
1863 struct ext4_dir_entry_2 * de, * de1; 1865 struct ext4_dir_entry_2 *de, *de1;
1864 struct super_block * sb; 1866 struct super_block *sb;
1865 int err = 0; 1867 int err = 0;
1866 1868
1867 sb = inode->i_sb; 1869 sb = inode->i_sb;
1868 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1870 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1869 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { 1871 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1870 if (err) 1872 if (err)
1871 ext4_error(inode->i_sb, __func__, 1873 ext4_error(inode->i_sb, __func__,
1872 "error %d reading directory #%lu offset 0", 1874 "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode)
1881 de1 = ext4_next_entry(de); 1883 de1 = ext4_next_entry(de);
1882 if (le32_to_cpu(de->inode) != inode->i_ino || 1884 if (le32_to_cpu(de->inode) != inode->i_ino ||
1883 !le32_to_cpu(de1->inode) || 1885 !le32_to_cpu(de1->inode) ||
1884 strcmp (".", de->name) || 1886 strcmp(".", de->name) ||
1885 strcmp ("..", de1->name)) { 1887 strcmp("..", de1->name)) {
1886 ext4_warning (inode->i_sb, "empty_dir", 1888 ext4_warning(inode->i_sb, "empty_dir",
1887 "bad directory (dir #%lu) - no `.' or `..'", 1889 "bad directory (dir #%lu) - no `.' or `..'",
1888 inode->i_ino); 1890 inode->i_ino);
1889 brelse (bh); 1891 brelse(bh);
1890 return 1; 1892 return 1;
1891 } 1893 }
1892 offset = ext4_rec_len_from_disk(de->rec_len) + 1894 offset = ext4_rec_len_from_disk(de->rec_len) +
1893 ext4_rec_len_from_disk(de1->rec_len); 1895 ext4_rec_len_from_disk(de1->rec_len);
1894 de = ext4_next_entry(de1); 1896 de = ext4_next_entry(de1);
1895 while (offset < inode->i_size ) { 1897 while (offset < inode->i_size) {
1896 if (!bh || 1898 if (!bh ||
1897 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1899 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1898 err = 0; 1900 err = 0;
1899 brelse (bh); 1901 brelse(bh);
1900 bh = ext4_bread (NULL, inode, 1902 bh = ext4_bread(NULL, inode,
1901 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1903 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1902 if (!bh) { 1904 if (!bh) {
1903 if (err) 1905 if (err)
@@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode)
1917 continue; 1919 continue;
1918 } 1920 }
1919 if (le32_to_cpu(de->inode)) { 1921 if (le32_to_cpu(de->inode)) {
1920 brelse (bh); 1922 brelse(bh);
1921 return 0; 1923 return 0;
1922 } 1924 }
1923 offset += ext4_rec_len_from_disk(de->rec_len); 1925 offset += ext4_rec_len_from_disk(de->rec_len);
1924 de = ext4_next_entry(de); 1926 de = ext4_next_entry(de);
1925 } 1927 }
1926 brelse (bh); 1928 brelse(bh);
1927 return 1; 1929 return 1;
1928} 1930}
1929 1931
@@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1954 * ->i_nlink. For, say it, character device. Not a regular file, 1956 * ->i_nlink. For, say it, character device. Not a regular file,
1955 * not a directory, not a symlink and ->i_nlink > 0. 1957 * not a directory, not a symlink and ->i_nlink > 0.
1956 */ 1958 */
1957 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1959 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1958 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1960 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1959 1961
1960 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); 1962 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1961 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 1963 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@ out_brelse:
2069 goto out_err; 2071 goto out_err;
2070} 2072}
2071 2073
2072static int ext4_rmdir (struct inode * dir, struct dentry *dentry) 2074static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2073{ 2075{
2074 int retval; 2076 int retval;
2075 struct inode * inode; 2077 struct inode *inode;
2076 struct buffer_head * bh; 2078 struct buffer_head *bh;
2077 struct ext4_dir_entry_2 * de; 2079 struct ext4_dir_entry_2 *de;
2078 handle_t *handle; 2080 handle_t *handle;
2079 2081
2080 /* Initialize quotas before so that eventual writes go in 2082 /* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2085 return PTR_ERR(handle); 2087 return PTR_ERR(handle);
2086 2088
2087 retval = -ENOENT; 2089 retval = -ENOENT;
2088 bh = ext4_find_entry (dentry, &de); 2090 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2089 if (!bh) 2091 if (!bh)
2090 goto end_rmdir; 2092 goto end_rmdir;
2091 2093
@@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2099 goto end_rmdir; 2101 goto end_rmdir;
2100 2102
2101 retval = -ENOTEMPTY; 2103 retval = -ENOTEMPTY;
2102 if (!empty_dir (inode)) 2104 if (!empty_dir(inode))
2103 goto end_rmdir; 2105 goto end_rmdir;
2104 2106
2105 retval = ext4_delete_entry(handle, dir, de, bh); 2107 retval = ext4_delete_entry(handle, dir, de, bh);
2106 if (retval) 2108 if (retval)
2107 goto end_rmdir; 2109 goto end_rmdir;
2108 if (!EXT4_DIR_LINK_EMPTY(inode)) 2110 if (!EXT4_DIR_LINK_EMPTY(inode))
2109 ext4_warning (inode->i_sb, "ext4_rmdir", 2111 ext4_warning(inode->i_sb, "ext4_rmdir",
2110 "empty directory has too many links (%d)", 2112 "empty directory has too many links (%d)",
2111 inode->i_nlink); 2113 inode->i_nlink);
2112 inode->i_version++; 2114 inode->i_version++;
2113 clear_nlink(inode); 2115 clear_nlink(inode);
2114 /* There's no need to set i_disksize: the fact that i_nlink is 2116 /* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2124 2126
2125end_rmdir: 2127end_rmdir:
2126 ext4_journal_stop(handle); 2128 ext4_journal_stop(handle);
2127 brelse (bh); 2129 brelse(bh);
2128 return retval; 2130 return retval;
2129} 2131}
2130 2132
2131static int ext4_unlink(struct inode * dir, struct dentry *dentry) 2133static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2132{ 2134{
2133 int retval; 2135 int retval;
2134 struct inode * inode; 2136 struct inode *inode;
2135 struct buffer_head * bh; 2137 struct buffer_head *bh;
2136 struct ext4_dir_entry_2 * de; 2138 struct ext4_dir_entry_2 *de;
2137 handle_t *handle; 2139 handle_t *handle;
2138 2140
2139 /* Initialize quotas before so that eventual writes go 2141 /* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2147 handle->h_sync = 1; 2149 handle->h_sync = 1;
2148 2150
2149 retval = -ENOENT; 2151 retval = -ENOENT;
2150 bh = ext4_find_entry (dentry, &de); 2152 bh = ext4_find_entry(dir, &dentry->d_name, &de);
2151 if (!bh) 2153 if (!bh)
2152 goto end_unlink; 2154 goto end_unlink;
2153 2155
@@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2158 goto end_unlink; 2160 goto end_unlink;
2159 2161
2160 if (!inode->i_nlink) { 2162 if (!inode->i_nlink) {
2161 ext4_warning (inode->i_sb, "ext4_unlink", 2163 ext4_warning(inode->i_sb, "ext4_unlink",
2162 "Deleting nonexistent file (%lu), %d", 2164 "Deleting nonexistent file (%lu), %d",
2163 inode->i_ino, inode->i_nlink); 2165 inode->i_ino, inode->i_nlink);
2164 inode->i_nlink = 1; 2166 inode->i_nlink = 1;
2165 } 2167 }
2166 retval = ext4_delete_entry(handle, dir, de, bh); 2168 retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2178 2180
2179end_unlink: 2181end_unlink:
2180 ext4_journal_stop(handle); 2182 ext4_journal_stop(handle);
2181 brelse (bh); 2183 brelse(bh);
2182 return retval; 2184 return retval;
2183} 2185}
2184 2186
2185static int ext4_symlink (struct inode * dir, 2187static int ext4_symlink(struct inode *dir,
2186 struct dentry *dentry, const char * symname) 2188 struct dentry *dentry, const char *symname)
2187{ 2189{
2188 handle_t *handle; 2190 handle_t *handle;
2189 struct inode * inode; 2191 struct inode *inode;
2190 int l, err, retries = 0; 2192 int l, err, retries = 0;
2191 2193
2192 l = strlen(symname)+1; 2194 l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@ retry:
2203 if (IS_DIRSYNC(dir)) 2205 if (IS_DIRSYNC(dir))
2204 handle->h_sync = 1; 2206 handle->h_sync = 1;
2205 2207
2206 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2208 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2207 err = PTR_ERR(inode); 2209 err = PTR_ERR(inode);
2208 if (IS_ERR(inode)) 2210 if (IS_ERR(inode))
2209 goto out_stop; 2211 goto out_stop;
2210 2212
2211 if (l > sizeof (EXT4_I(inode)->i_data)) { 2213 if (l > sizeof(EXT4_I(inode)->i_data)) {
2212 inode->i_op = &ext4_symlink_inode_operations; 2214 inode->i_op = &ext4_symlink_inode_operations;
2213 ext4_set_aops(inode); 2215 ext4_set_aops(inode);
2214 /* 2216 /*
@@ -2221,14 +2223,14 @@ retry:
2221 if (err) { 2223 if (err) {
2222 clear_nlink(inode); 2224 clear_nlink(inode);
2223 ext4_mark_inode_dirty(handle, inode); 2225 ext4_mark_inode_dirty(handle, inode);
2224 iput (inode); 2226 iput(inode);
2225 goto out_stop; 2227 goto out_stop;
2226 } 2228 }
2227 } else { 2229 } else {
2228 /* clear the extent format for fast symlink */ 2230 /* clear the extent format for fast symlink */
2229 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2231 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
2230 inode->i_op = &ext4_fast_symlink_inode_operations; 2232 inode->i_op = &ext4_fast_symlink_inode_operations;
2231 memcpy((char*)&EXT4_I(inode)->i_data,symname,l); 2233 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2232 inode->i_size = l-1; 2234 inode->i_size = l-1;
2233 } 2235 }
2234 EXT4_I(inode)->i_disksize = inode->i_size; 2236 EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2240,8 +2242,8 @@ out_stop:
2240 return err; 2242 return err;
2241} 2243}
2242 2244
2243static int ext4_link (struct dentry * old_dentry, 2245static int ext4_link(struct dentry *old_dentry,
2244 struct inode * dir, struct dentry *dentry) 2246 struct inode *dir, struct dentry *dentry)
2245{ 2247{
2246 handle_t *handle; 2248 handle_t *handle;
2247 struct inode *inode = old_dentry->d_inode; 2249 struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@ retry:
2284 * Anybody can rename anything with this: the permission checks are left to the 2286 * Anybody can rename anything with this: the permission checks are left to the
2285 * higher-level routines. 2287 * higher-level routines.
2286 */ 2288 */
2287static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, 2289static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2288 struct inode * new_dir,struct dentry *new_dentry) 2290 struct inode *new_dir, struct dentry *new_dentry)
2289{ 2291{
2290 handle_t *handle; 2292 handle_t *handle;
2291 struct inode * old_inode, * new_inode; 2293 struct inode *old_inode, *new_inode;
2292 struct buffer_head * old_bh, * new_bh, * dir_bh; 2294 struct buffer_head *old_bh, *new_bh, *dir_bh;
2293 struct ext4_dir_entry_2 * old_de, * new_de; 2295 struct ext4_dir_entry_2 *old_de, *new_de;
2294 int retval; 2296 int retval;
2295 2297
2296 old_bh = new_bh = dir_bh = NULL; 2298 old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2308 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2310 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2309 handle->h_sync = 1; 2311 handle->h_sync = 1;
2310 2312
2311 old_bh = ext4_find_entry (old_dentry, &old_de); 2313 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2312 /* 2314 /*
2313 * Check for inode number is _not_ due to possible IO errors. 2315 * Check for inode number is _not_ due to possible IO errors.
2314 * We might rmdir the source, keep it as pwd of some process 2316 * We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2321 goto end_rename; 2323 goto end_rename;
2322 2324
2323 new_inode = new_dentry->d_inode; 2325 new_inode = new_dentry->d_inode;
2324 new_bh = ext4_find_entry (new_dentry, &new_de); 2326 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2325 if (new_bh) { 2327 if (new_bh) {
2326 if (!new_inode) { 2328 if (!new_inode) {
2327 brelse (new_bh); 2329 brelse(new_bh);
2328 new_bh = NULL; 2330 new_bh = NULL;
2329 } 2331 }
2330 } 2332 }
2331 if (S_ISDIR(old_inode->i_mode)) { 2333 if (S_ISDIR(old_inode->i_mode)) {
2332 if (new_inode) { 2334 if (new_inode) {
2333 retval = -ENOTEMPTY; 2335 retval = -ENOTEMPTY;
2334 if (!empty_dir (new_inode)) 2336 if (!empty_dir(new_inode))
2335 goto end_rename; 2337 goto end_rename;
2336 } 2338 }
2337 retval = -EIO; 2339 retval = -EIO;
2338 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); 2340 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
2339 if (!dir_bh) 2341 if (!dir_bh)
2340 goto end_rename; 2342 goto end_rename;
2341 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) 2343 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2342 goto end_rename; 2344 goto end_rename;
2343 retval = -EMLINK; 2345 retval = -EMLINK;
2344 if (!new_inode && new_dir!=old_dir && 2346 if (!new_inode && new_dir != old_dir &&
2345 new_dir->i_nlink >= EXT4_LINK_MAX) 2347 new_dir->i_nlink >= EXT4_LINK_MAX)
2346 goto end_rename; 2348 goto end_rename;
2347 } 2349 }
2348 if (!new_bh) { 2350 if (!new_bh) {
2349 retval = ext4_add_entry (handle, new_dentry, old_inode); 2351 retval = ext4_add_entry(handle, new_dentry, old_inode);
2350 if (retval) 2352 if (retval)
2351 goto end_rename; 2353 goto end_rename;
2352 } else { 2354 } else {
@@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2388 struct buffer_head *old_bh2; 2390 struct buffer_head *old_bh2;
2389 struct ext4_dir_entry_2 *old_de2; 2391 struct ext4_dir_entry_2 *old_de2;
2390 2392
2391 old_bh2 = ext4_find_entry(old_dentry, &old_de2); 2393 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2392 if (old_bh2) { 2394 if (old_bh2) {
2393 retval = ext4_delete_entry(handle, old_dir, 2395 retval = ext4_delete_entry(handle, old_dir,
2394 old_de2, old_bh2); 2396 old_de2, old_bh2);
@@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2433 retval = 0; 2435 retval = 0;
2434 2436
2435end_rename: 2437end_rename:
2436 brelse (dir_bh); 2438 brelse(dir_bh);
2437 brelse (old_bh); 2439 brelse(old_bh);
2438 brelse (new_bh); 2440 brelse(new_bh);
2439 ext4_journal_stop(handle); 2441 ext4_journal_stop(handle);
2440 return retval; 2442 return retval;
2441} 2443}
@@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2454 .mknod = ext4_mknod, 2456 .mknod = ext4_mknod,
2455 .rename = ext4_rename, 2457 .rename = ext4_rename,
2456 .setattr = ext4_setattr, 2458 .setattr = ext4_setattr,
2457#ifdef CONFIG_EXT4DEV_FS_XATTR 2459#ifdef CONFIG_EXT4_FS_XATTR
2458 .setxattr = generic_setxattr, 2460 .setxattr = generic_setxattr,
2459 .getxattr = generic_getxattr, 2461 .getxattr = generic_getxattr,
2460 .listxattr = ext4_listxattr, 2462 .listxattr = ext4_listxattr,
@@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2465 2467
2466const struct inode_operations ext4_special_inode_operations = { 2468const struct inode_operations ext4_special_inode_operations = {
2467 .setattr = ext4_setattr, 2469 .setattr = ext4_setattr,
2468#ifdef CONFIG_EXT4DEV_FS_XATTR 2470#ifdef CONFIG_EXT4_FS_XATTR
2469 .setxattr = generic_setxattr, 2471 .setxattr = generic_setxattr,
2470 .getxattr = generic_getxattr, 2472 .getxattr = generic_getxattr,
2471 .listxattr = ext4_listxattr, 2473 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0a9265164265..b6ec1843a015 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", 416 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
417 gdb_num); 417 gdb_num);
418 418
419 /* 419 /*
420 * If we are not using the primary superblock/GDT copy don't resize, 420 * If we are not using the primary superblock/GDT copy don't resize,
421 * because the user tools have no way of handling this. Probably a 421 * because the user tools have no way of handling this. Probably a
422 * bad time to do it anyways. 422 * bad time to do it anyways.
423 */ 423 */
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
773 773
774 if (reserved_gdb || gdb_off == 0) { 774 if (reserved_gdb || gdb_off == 0) {
775 if (!EXT4_HAS_COMPAT_FEATURE(sb, 775 if (!EXT4_HAS_COMPAT_FEATURE(sb,
776 EXT4_FEATURE_COMPAT_RESIZE_INODE)){ 776 EXT4_FEATURE_COMPAT_RESIZE_INODE)
777 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
777 ext4_warning(sb, __func__, 778 ext4_warning(sb, __func__,
778 "No reserved GDT blocks, can't resize"); 779 "No reserved GDT blocks, can't resize");
779 return -EPERM; 780 return -EPERM;
@@ -869,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
869 * We can allocate memory for mb_alloc based on the new group 870 * We can allocate memory for mb_alloc based on the new group
870 * descriptor 871 * descriptor
871 */ 872 */
872 if (test_opt(sb, MBALLOC)) { 873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 874 if (err)
874 if (err) 875 goto exit_journal;
875 goto exit_journal; 876
876 }
877 /* 877 /*
878 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
879 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
@@ -928,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
928 percpu_counter_add(&sbi->s_freeinodes_counter, 928 percpu_counter_add(&sbi->s_freeinodes_counter,
929 EXT4_INODES_PER_GROUP(sb)); 929 EXT4_INODES_PER_GROUP(sb));
930 930
931 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
932 ext4_group_t flex_group;
933 flex_group = ext4_flex_group(sbi, input->group);
934 sbi->s_flex_groups[flex_group].free_blocks +=
935 input->free_blocks_count;
936 sbi->s_flex_groups[flex_group].free_inodes +=
937 EXT4_INODES_PER_GROUP(sb);
938 }
939
931 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 940 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
932 sb->s_dirt = 1; 941 sb->s_dirt = 1;
933 942
@@ -963,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
963 ext4_group_t o_groups_count; 972 ext4_group_t o_groups_count;
964 ext4_grpblk_t last; 973 ext4_grpblk_t last;
965 ext4_grpblk_t add; 974 ext4_grpblk_t add;
966 struct buffer_head * bh; 975 struct buffer_head *bh;
967 handle_t *handle; 976 handle_t *handle;
968 int err; 977 int err;
969 unsigned long freed_blocks; 978 unsigned long freed_blocks;
@@ -1076,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1076 /* 1085 /*
1077 * Mark mballoc pages as not up to date so that they will be updated 1086 * Mark mballoc pages as not up to date so that they will be updated
1078 * next time they are loaded by ext4_mb_load_buddy. 1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on the bitmap bh, as a way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * overloaded. A user could take a LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1079 */ 1095 */
1080 if (test_opt(sb, MBALLOC)) { 1096 {
1081 struct ext4_sb_info *sbi = EXT4_SB(sb); 1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1082 struct inode *inode = sbi->s_buddy_cache; 1098 struct inode *inode = sbi->s_buddy_cache;
1083 int blocks_per_page; 1099 int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d5d77958b861..9b2b2bc4ec17 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/quotaops.h> 35#include <linux/quotaops.h>
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/proc_fs.h>
38#include <linux/marker.h>
37#include <linux/log2.h> 39#include <linux/log2.h>
38#include <linux/crc16.h> 40#include <linux/crc16.h>
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
@@ -45,6 +47,8 @@
45#include "namei.h" 47#include "namei.h"
46#include "group.h" 48#include "group.h"
47 49
50struct proc_dir_entry *ext4_proc_root;
51
48static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
49 unsigned long journal_devnum); 53 unsigned long journal_devnum);
50static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -370,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
370 */ 374 */
371} 375}
372 376
373int ext4_update_compat_feature(handle_t *handle,
374 struct super_block *sb, __u32 compat)
375{
376 int err = 0;
377 if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
378 err = ext4_journal_get_write_access(handle,
379 EXT4_SB(sb)->s_sbh);
380 if (err)
381 return err;
382 EXT4_SET_COMPAT_FEATURE(sb, compat);
383 sb->s_dirt = 1;
384 handle->h_sync = 1;
385 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
386 "call ext4_journal_dirty_met adata");
387 err = ext4_journal_dirty_metadata(handle,
388 EXT4_SB(sb)->s_sbh);
389 }
390 return err;
391}
392
393int ext4_update_rocompat_feature(handle_t *handle,
394 struct super_block *sb, __u32 rocompat)
395{
396 int err = 0;
397 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
398 err = ext4_journal_get_write_access(handle,
399 EXT4_SB(sb)->s_sbh);
400 if (err)
401 return err;
402 EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
403 sb->s_dirt = 1;
404 handle->h_sync = 1;
405 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
406 "call ext4_journal_dirty_met adata");
407 err = ext4_journal_dirty_metadata(handle,
408 EXT4_SB(sb)->s_sbh);
409 }
410 return err;
411}
412
413int ext4_update_incompat_feature(handle_t *handle,
414 struct super_block *sb, __u32 incompat)
415{
416 int err = 0;
417 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
418 err = ext4_journal_get_write_access(handle,
419 EXT4_SB(sb)->s_sbh);
420 if (err)
421 return err;
422 EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
423 sb->s_dirt = 1;
424 handle->h_sync = 1;
425 BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
426 "call ext4_journal_dirty_met adata");
427 err = ext4_journal_dirty_metadata(handle,
428 EXT4_SB(sb)->s_sbh);
429 }
430 return err;
431}
432
433/* 377/*
434 * Open the external journal device 378 * Open the external journal device
435 */ 379 */
@@ -503,15 +447,18 @@ static void ext4_put_super(struct super_block *sb)
503 ext4_mb_release(sb); 447 ext4_mb_release(sb);
504 ext4_ext_release(sb); 448 ext4_ext_release(sb);
505 ext4_xattr_put_super(sb); 449 ext4_xattr_put_super(sb);
506 jbd2_journal_destroy(sbi->s_journal); 450 if (jbd2_journal_destroy(sbi->s_journal) < 0)
451 ext4_abort(sb, __func__, "Couldn't clean up the journal");
507 sbi->s_journal = NULL; 452 sbi->s_journal = NULL;
508 if (!(sb->s_flags & MS_RDONLY)) { 453 if (!(sb->s_flags & MS_RDONLY)) {
509 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 454 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
510 es->s_state = cpu_to_le16(sbi->s_mount_state); 455 es->s_state = cpu_to_le16(sbi->s_mount_state);
511 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
512 mark_buffer_dirty(sbi->s_sbh);
513 ext4_commit_super(sb, es, 1); 456 ext4_commit_super(sb, es, 1);
514 } 457 }
458 if (sbi->s_proc) {
459 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
460 remove_proc_entry(sb->s_id, ext4_proc_root);
461 }
515 462
516 for (i = 0; i < sbi->s_gdb_count; i++) 463 for (i = 0; i < sbi->s_gdb_count; i++)
517 brelse(sbi->s_group_desc[i]); 464 brelse(sbi->s_group_desc[i]);
@@ -520,6 +467,7 @@ static void ext4_put_super(struct super_block *sb)
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 467 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 468 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 469 percpu_counter_destroy(&sbi->s_dirs_counter);
470 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
523 brelse(sbi->s_sbh); 471 brelse(sbi->s_sbh);
524#ifdef CONFIG_QUOTA 472#ifdef CONFIG_QUOTA
525 for (i = 0; i < MAXQUOTAS; i++) 473 for (i = 0; i < MAXQUOTAS; i++)
@@ -562,12 +510,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
562 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); 510 ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
563 if (!ei) 511 if (!ei)
564 return NULL; 512 return NULL;
565#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 513#ifdef CONFIG_EXT4_FS_POSIX_ACL
566 ei->i_acl = EXT4_ACL_NOT_CACHED; 514 ei->i_acl = EXT4_ACL_NOT_CACHED;
567 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 515 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
568#endif 516#endif
569 ei->i_block_alloc_info = NULL;
570 ei->vfs_inode.i_version = 1; 517 ei->vfs_inode.i_version = 1;
518 ei->vfs_inode.i_data.writeback_index = 0;
571 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 519 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
572 INIT_LIST_HEAD(&ei->i_prealloc_list); 520 INIT_LIST_HEAD(&ei->i_prealloc_list);
573 spin_lock_init(&ei->i_prealloc_lock); 521 spin_lock_init(&ei->i_prealloc_lock);
@@ -598,7 +546,7 @@ static void init_once(void *foo)
598 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 546 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
599 547
600 INIT_LIST_HEAD(&ei->i_orphan); 548 INIT_LIST_HEAD(&ei->i_orphan);
601#ifdef CONFIG_EXT4DEV_FS_XATTR 549#ifdef CONFIG_EXT4_FS_XATTR
602 init_rwsem(&ei->xattr_sem); 550 init_rwsem(&ei->xattr_sem);
603#endif 551#endif
604 init_rwsem(&ei->i_data_sem); 552 init_rwsem(&ei->i_data_sem);
@@ -624,8 +572,7 @@ static void destroy_inodecache(void)
624 572
625static void ext4_clear_inode(struct inode *inode) 573static void ext4_clear_inode(struct inode *inode)
626{ 574{
627 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; 575#ifdef CONFIG_EXT4_FS_POSIX_ACL
628#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
629 if (EXT4_I(inode)->i_acl && 576 if (EXT4_I(inode)->i_acl &&
630 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { 577 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
631 posix_acl_release(EXT4_I(inode)->i_acl); 578 posix_acl_release(EXT4_I(inode)->i_acl);
@@ -637,10 +584,7 @@ static void ext4_clear_inode(struct inode *inode)
637 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; 584 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
638 } 585 }
639#endif 586#endif
640 ext4_discard_reservation(inode); 587 ext4_discard_preallocations(inode);
641 EXT4_I(inode)->i_block_alloc_info = NULL;
642 if (unlikely(rsv))
643 kfree(rsv);
644 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 588 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
645 &EXT4_I(inode)->jinode); 589 &EXT4_I(inode)->jinode);
646} 590}
@@ -653,7 +597,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
653 597
654 if (sbi->s_jquota_fmt) 598 if (sbi->s_jquota_fmt)
655 seq_printf(seq, ",jqfmt=%s", 599 seq_printf(seq, ",jqfmt=%s",
656 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); 600 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
657 601
658 if (sbi->s_qf_names[USRQUOTA]) 602 if (sbi->s_qf_names[USRQUOTA])
659 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); 603 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -717,7 +661,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
717 seq_puts(seq, ",debug"); 661 seq_puts(seq, ",debug");
718 if (test_opt(sb, OLDALLOC)) 662 if (test_opt(sb, OLDALLOC))
719 seq_puts(seq, ",oldalloc"); 663 seq_puts(seq, ",oldalloc");
720#ifdef CONFIG_EXT4DEV_FS_XATTR 664#ifdef CONFIG_EXT4_FS_XATTR
721 if (test_opt(sb, XATTR_USER) && 665 if (test_opt(sb, XATTR_USER) &&
722 !(def_mount_opts & EXT4_DEFM_XATTR_USER)) 666 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
723 seq_puts(seq, ",user_xattr"); 667 seq_puts(seq, ",user_xattr");
@@ -726,7 +670,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
726 seq_puts(seq, ",nouser_xattr"); 670 seq_puts(seq, ",nouser_xattr");
727 } 671 }
728#endif 672#endif
729#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 673#ifdef CONFIG_EXT4_FS_POSIX_ACL
730 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 674 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
731 seq_puts(seq, ",acl"); 675 seq_puts(seq, ",acl");
732 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) 676 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -751,8 +695,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
751 seq_puts(seq, ",nobh"); 695 seq_puts(seq, ",nobh");
752 if (!test_opt(sb, EXTENTS)) 696 if (!test_opt(sb, EXTENTS))
753 seq_puts(seq, ",noextents"); 697 seq_puts(seq, ",noextents");
754 if (!test_opt(sb, MBALLOC))
755 seq_puts(seq, ",nomballoc");
756 if (test_opt(sb, I_VERSION)) 698 if (test_opt(sb, I_VERSION))
757 seq_puts(seq, ",i_version"); 699 seq_puts(seq, ",i_version");
758 if (!test_opt(sb, DELALLOC)) 700 if (!test_opt(sb, DELALLOC))
@@ -772,6 +714,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
772 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 714 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
773 seq_puts(seq, ",data=writeback"); 715 seq_puts(seq, ",data=writeback");
774 716
717 if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
718 seq_printf(seq, ",inode_readahead_blks=%u",
719 sbi->s_inode_readahead_blks);
720
721 if (test_opt(sb, DATA_ERR_ABORT))
722 seq_puts(seq, ",data_err=abort");
723
775 ext4_show_quota_options(seq, sb); 724 ext4_show_quota_options(seq, sb);
776 return 0; 725 return 0;
777} 726}
@@ -821,7 +770,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
821} 770}
822 771
823#ifdef CONFIG_QUOTA 772#ifdef CONFIG_QUOTA
824#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") 773#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
825#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 774#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
826 775
827static int ext4_dquot_initialize(struct inode *inode, int type); 776static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -895,20 +844,22 @@ static const struct export_operations ext4_export_ops = {
895enum { 844enum {
896 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, 845 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
897 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, 846 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
898 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, 847 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
899 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 848 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
900 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 849 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
901 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 850 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
902 Opt_journal_checksum, Opt_journal_async_commit, 851 Opt_journal_checksum, Opt_journal_async_commit,
903 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 852 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
853 Opt_data_err_abort, Opt_data_err_ignore,
904 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 854 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
905 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 855 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
906 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 856 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
907 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 857 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
908 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, 858 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
859 Opt_inode_readahead_blks
909}; 860};
910 861
911static match_table_t tokens = { 862static const match_table_t tokens = {
912 {Opt_bsd_df, "bsddf"}, 863 {Opt_bsd_df, "bsddf"},
913 {Opt_minix_df, "minixdf"}, 864 {Opt_minix_df, "minixdf"},
914 {Opt_grpid, "grpid"}, 865 {Opt_grpid, "grpid"},
@@ -922,8 +873,6 @@ static match_table_t tokens = {
922 {Opt_err_panic, "errors=panic"}, 873 {Opt_err_panic, "errors=panic"},
923 {Opt_err_ro, "errors=remount-ro"}, 874 {Opt_err_ro, "errors=remount-ro"},
924 {Opt_nouid32, "nouid32"}, 875 {Opt_nouid32, "nouid32"},
925 {Opt_nocheck, "nocheck"},
926 {Opt_nocheck, "check=none"},
927 {Opt_debug, "debug"}, 876 {Opt_debug, "debug"},
928 {Opt_oldalloc, "oldalloc"}, 877 {Opt_oldalloc, "oldalloc"},
929 {Opt_orlov, "orlov"}, 878 {Opt_orlov, "orlov"},
@@ -946,6 +895,8 @@ static match_table_t tokens = {
946 {Opt_data_journal, "data=journal"}, 895 {Opt_data_journal, "data=journal"},
947 {Opt_data_ordered, "data=ordered"}, 896 {Opt_data_ordered, "data=ordered"},
948 {Opt_data_writeback, "data=writeback"}, 897 {Opt_data_writeback, "data=writeback"},
898 {Opt_data_err_abort, "data_err=abort"},
899 {Opt_data_err_ignore, "data_err=ignore"},
949 {Opt_offusrjquota, "usrjquota="}, 900 {Opt_offusrjquota, "usrjquota="},
950 {Opt_usrjquota, "usrjquota=%s"}, 901 {Opt_usrjquota, "usrjquota=%s"},
951 {Opt_offgrpjquota, "grpjquota="}, 902 {Opt_offgrpjquota, "grpjquota="},
@@ -960,12 +911,11 @@ static match_table_t tokens = {
960 {Opt_extents, "extents"}, 911 {Opt_extents, "extents"},
961 {Opt_noextents, "noextents"}, 912 {Opt_noextents, "noextents"},
962 {Opt_i_version, "i_version"}, 913 {Opt_i_version, "i_version"},
963 {Opt_mballoc, "mballoc"},
964 {Opt_nomballoc, "nomballoc"},
965 {Opt_stripe, "stripe=%u"}, 914 {Opt_stripe, "stripe=%u"},
966 {Opt_resize, "resize"}, 915 {Opt_resize, "resize"},
967 {Opt_delalloc, "delalloc"}, 916 {Opt_delalloc, "delalloc"},
968 {Opt_nodelalloc, "nodelalloc"}, 917 {Opt_nodelalloc, "nodelalloc"},
918 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
969 {Opt_err, NULL}, 919 {Opt_err, NULL},
970}; 920};
971 921
@@ -980,7 +930,7 @@ static ext4_fsblk_t get_sb_block(void **data)
980 /*todo: use simple_strtoll with >32bit ext4 */ 930 /*todo: use simple_strtoll with >32bit ext4 */
981 sb_block = simple_strtoul(options, &options, 0); 931 sb_block = simple_strtoul(options, &options, 0);
982 if (*options && *options != ',') { 932 if (*options && *options != ',') {
983 printk("EXT4-fs: Invalid sb specification: %s\n", 933 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
984 (char *) *data); 934 (char *) *data);
985 return 1; 935 return 1;
986 } 936 }
@@ -1059,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
1059 case Opt_nouid32: 1009 case Opt_nouid32:
1060 set_opt(sbi->s_mount_opt, NO_UID32); 1010 set_opt(sbi->s_mount_opt, NO_UID32);
1061 break; 1011 break;
1062 case Opt_nocheck:
1063 clear_opt(sbi->s_mount_opt, CHECK);
1064 break;
1065 case Opt_debug: 1012 case Opt_debug:
1066 set_opt(sbi->s_mount_opt, DEBUG); 1013 set_opt(sbi->s_mount_opt, DEBUG);
1067 break; 1014 break;
@@ -1071,7 +1018,7 @@ static int parse_options(char *options, struct super_block *sb,
1071 case Opt_orlov: 1018 case Opt_orlov:
1072 clear_opt(sbi->s_mount_opt, OLDALLOC); 1019 clear_opt(sbi->s_mount_opt, OLDALLOC);
1073 break; 1020 break;
1074#ifdef CONFIG_EXT4DEV_FS_XATTR 1021#ifdef CONFIG_EXT4_FS_XATTR
1075 case Opt_user_xattr: 1022 case Opt_user_xattr:
1076 set_opt(sbi->s_mount_opt, XATTR_USER); 1023 set_opt(sbi->s_mount_opt, XATTR_USER);
1077 break; 1024 break;
@@ -1081,10 +1028,11 @@ static int parse_options(char *options, struct super_block *sb,
1081#else 1028#else
1082 case Opt_user_xattr: 1029 case Opt_user_xattr:
1083 case Opt_nouser_xattr: 1030 case Opt_nouser_xattr:
1084 printk("EXT4 (no)user_xattr options not supported\n"); 1031 printk(KERN_ERR "EXT4 (no)user_xattr options "
1032 "not supported\n");
1085 break; 1033 break;
1086#endif 1034#endif
1087#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 1035#ifdef CONFIG_EXT4_FS_POSIX_ACL
1088 case Opt_acl: 1036 case Opt_acl:
1089 set_opt(sbi->s_mount_opt, POSIX_ACL); 1037 set_opt(sbi->s_mount_opt, POSIX_ACL);
1090 break; 1038 break;
@@ -1094,7 +1042,8 @@ static int parse_options(char *options, struct super_block *sb,
1094#else 1042#else
1095 case Opt_acl: 1043 case Opt_acl:
1096 case Opt_noacl: 1044 case Opt_noacl:
1097 printk("EXT4 (no)acl options not supported\n"); 1045 printk(KERN_ERR "EXT4 (no)acl options "
1046 "not supported\n");
1098 break; 1047 break;
1099#endif 1048#endif
1100 case Opt_reservation: 1049 case Opt_reservation:
@@ -1177,6 +1126,12 @@ static int parse_options(char *options, struct super_block *sb,
1177 sbi->s_mount_opt |= data_opt; 1126 sbi->s_mount_opt |= data_opt;
1178 } 1127 }
1179 break; 1128 break;
1129 case Opt_data_err_abort:
1130 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1131 break;
1132 case Opt_data_err_ignore:
1133 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1134 break;
1180#ifdef CONFIG_QUOTA 1135#ifdef CONFIG_QUOTA
1181 case Opt_usrjquota: 1136 case Opt_usrjquota:
1182 qtype = USRQUOTA; 1137 qtype = USRQUOTA;
@@ -1188,8 +1143,8 @@ set_qf_name:
1188 sb_any_quota_suspended(sb)) && 1143 sb_any_quota_suspended(sb)) &&
1189 !sbi->s_qf_names[qtype]) { 1144 !sbi->s_qf_names[qtype]) {
1190 printk(KERN_ERR 1145 printk(KERN_ERR
1191 "EXT4-fs: Cannot change journaled " 1146 "EXT4-fs: Cannot change journaled "
1192 "quota options when quota turned on.\n"); 1147 "quota options when quota turned on.\n");
1193 return 0; 1148 return 0;
1194 } 1149 }
1195 qname = match_strdup(&args[0]); 1150 qname = match_strdup(&args[0]);
@@ -1356,12 +1311,6 @@ set_qf_format:
1356 case Opt_nodelalloc: 1311 case Opt_nodelalloc:
1357 clear_opt(sbi->s_mount_opt, DELALLOC); 1312 clear_opt(sbi->s_mount_opt, DELALLOC);
1358 break; 1313 break;
1359 case Opt_mballoc:
1360 set_opt(sbi->s_mount_opt, MBALLOC);
1361 break;
1362 case Opt_nomballoc:
1363 clear_opt(sbi->s_mount_opt, MBALLOC);
1364 break;
1365 case Opt_stripe: 1314 case Opt_stripe:
1366 if (match_int(&args[0], &option)) 1315 if (match_int(&args[0], &option))
1367 return 0; 1316 return 0;
@@ -1372,6 +1321,13 @@ set_qf_format:
1372 case Opt_delalloc: 1321 case Opt_delalloc:
1373 set_opt(sbi->s_mount_opt, DELALLOC); 1322 set_opt(sbi->s_mount_opt, DELALLOC);
1374 break; 1323 break;
1324 case Opt_inode_readahead_blks:
1325 if (match_int(&args[0], &option))
1326 return 0;
1327 if (option < 0 || option > (1 << 30))
1328 return 0;
1329 sbi->s_inode_readahead_blks = option;
1330 break;
1375 default: 1331 default:
1376 printk(KERN_ERR 1332 printk(KERN_ERR
1377 "EXT4-fs: Unrecognized mount option \"%s\" " 1333 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1472,15 +1428,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1472 EXT4_INODES_PER_GROUP(sb), 1428 EXT4_INODES_PER_GROUP(sb),
1473 sbi->s_mount_opt); 1429 sbi->s_mount_opt);
1474 1430
1475 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); 1431 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1476 if (EXT4_SB(sb)->s_journal->j_inode == NULL) { 1432 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1477 char b[BDEVNAME_SIZE]; 1433 "external", EXT4_SB(sb)->s_journal->j_devname);
1478
1479 printk("external journal on %s\n",
1480 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1481 } else {
1482 printk("internal journal\n");
1483 }
1484 return res; 1434 return res;
1485} 1435}
1486 1436
@@ -1503,8 +1453,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1503 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1453 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1504 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1454 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1505 1455
1506 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / 1456 /* We allocate both existing and potentially added groups */
1507 groups_per_flex; 1457 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1458 ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
1459 EXT4_DESC_PER_BLOCK_BITS(sb))) /
1460 groups_per_flex;
1508 sbi->s_flex_groups = kzalloc(flex_group_count * 1461 sbi->s_flex_groups = kzalloc(flex_group_count *
1509 sizeof(struct flex_groups), GFP_KERNEL); 1462 sizeof(struct flex_groups), GFP_KERNEL);
1510 if (sbi->s_flex_groups == NULL) { 1463 if (sbi->s_flex_groups == NULL) {
@@ -1583,7 +1536,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1583 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 1536 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
1584 flexbg_flag = 1; 1537 flexbg_flag = 1;
1585 1538
1586 ext4_debug ("Checking group descriptors"); 1539 ext4_debug("Checking group descriptors");
1587 1540
1588 for (i = 0; i < sbi->s_groups_count; i++) { 1541 for (i = 0; i < sbi->s_groups_count; i++) {
1589 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1542 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1598,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1598 if (block_bitmap < first_block || block_bitmap > last_block) { 1551 if (block_bitmap < first_block || block_bitmap > last_block) {
1599 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1552 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1600 "Block bitmap for group %lu not in group " 1553 "Block bitmap for group %lu not in group "
1601 "(block %llu)!", i, block_bitmap); 1554 "(block %llu)!\n", i, block_bitmap);
1602 return 0; 1555 return 0;
1603 } 1556 }
1604 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1557 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1605 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1558 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1606 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1559 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1607 "Inode bitmap for group %lu not in group " 1560 "Inode bitmap for group %lu not in group "
1608 "(block %llu)!", i, inode_bitmap); 1561 "(block %llu)!\n", i, inode_bitmap);
1609 return 0; 1562 return 0;
1610 } 1563 }
1611 inode_table = ext4_inode_table(sb, gdp); 1564 inode_table = ext4_inode_table(sb, gdp);
@@ -1613,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
1613 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1566 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1614 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1567 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1615 "Inode table for group %lu not in group " 1568 "Inode table for group %lu not in group "
1616 "(block %llu)!", i, inode_table); 1569 "(block %llu)!\n", i, inode_table);
1617 return 0; 1570 return 0;
1618 } 1571 }
1619 spin_lock(sb_bgl_lock(sbi, i)); 1572 spin_lock(sb_bgl_lock(sbi, i));
@@ -1622,8 +1575,10 @@ static int ext4_check_descriptors(struct super_block *sb)
1622 "Checksum for group %lu failed (%u!=%u)\n", 1575 "Checksum for group %lu failed (%u!=%u)\n",
1623 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1576 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1624 gdp)), le16_to_cpu(gdp->bg_checksum)); 1577 gdp)), le16_to_cpu(gdp->bg_checksum));
1625 if (!(sb->s_flags & MS_RDONLY)) 1578 if (!(sb->s_flags & MS_RDONLY)) {
1579 spin_unlock(sb_bgl_lock(sbi, i));
1626 return 0; 1580 return 0;
1581 }
1627 } 1582 }
1628 spin_unlock(sb_bgl_lock(sbi, i)); 1583 spin_unlock(sb_bgl_lock(sbi, i));
1629 if (!flexbg_flag) 1584 if (!flexbg_flag)
@@ -1713,9 +1668,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1713 DQUOT_INIT(inode); 1668 DQUOT_INIT(inode);
1714 if (inode->i_nlink) { 1669 if (inode->i_nlink) {
1715 printk(KERN_DEBUG 1670 printk(KERN_DEBUG
1716 "%s: truncating inode %lu to %Ld bytes\n", 1671 "%s: truncating inode %lu to %lld bytes\n",
1717 __func__, inode->i_ino, inode->i_size); 1672 __func__, inode->i_ino, inode->i_size);
1718 jbd_debug(2, "truncating inode %lu to %Ld bytes\n", 1673 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
1719 inode->i_ino, inode->i_size); 1674 inode->i_ino, inode->i_size);
1720 ext4_truncate(inode); 1675 ext4_truncate(inode);
1721 nr_truncates++; 1676 nr_truncates++;
@@ -1756,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
1756 * 1711 *
1757 * Note, this does *not* consider any metadata overhead for vfs i_blocks. 1712 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
1758 */ 1713 */
1759static loff_t ext4_max_size(int blkbits) 1714static loff_t ext4_max_size(int blkbits, int has_huge_files)
1760{ 1715{
1761 loff_t res; 1716 loff_t res;
1762 loff_t upper_limit = MAX_LFS_FILESIZE; 1717 loff_t upper_limit = MAX_LFS_FILESIZE;
1763 1718
1764 /* small i_blocks in vfs inode? */ 1719 /* small i_blocks in vfs inode? */
1765 if (sizeof(blkcnt_t) < sizeof(u64)) { 1720 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1766 /* 1721 /*
1767 * CONFIG_LSF is not enabled implies the inode 1722 * CONFIG_LSF is not enabled implies the inode
1768 * i_block represent total blocks in 512 bytes 1723 * i_block represent total blocks in 512 bytes
@@ -1792,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
1792 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. 1747 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
1793 * We need to be 1 filesystem block less than the 2^48 sector limit. 1748 * We need to be 1 filesystem block less than the 2^48 sector limit.
1794 */ 1749 */
1795static loff_t ext4_max_bitmap_size(int bits) 1750static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1796{ 1751{
1797 loff_t res = EXT4_NDIR_BLOCKS; 1752 loff_t res = EXT4_NDIR_BLOCKS;
1798 int meta_blocks; 1753 int meta_blocks;
@@ -1805,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
1805 * total number of 512 bytes blocks of the file 1760 * total number of 512 bytes blocks of the file
1806 */ 1761 */
1807 1762
1808 if (sizeof(blkcnt_t) < sizeof(u64)) { 1763 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1809 /* 1764 /*
1810 * CONFIG_LSF is not enabled implies the inode 1765 * !has_huge_files or CONFIG_LSF is not enabled
1811 * i_block represent total blocks in 512 bytes 1766 * implies the inode i_block represent total blocks in
1812 * 32 == size of vfs inode i_blocks * 8 1767 * 512 bytes 32 == size of vfs inode i_blocks * 8
1813 */ 1768 */
1814 upper_limit = (1LL << 32) - 1; 1769 upper_limit = (1LL << 32) - 1;
1815 1770
@@ -1913,11 +1868,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1913 unsigned long journal_devnum = 0; 1868 unsigned long journal_devnum = 0;
1914 unsigned long def_mount_opts; 1869 unsigned long def_mount_opts;
1915 struct inode *root; 1870 struct inode *root;
1871 char *cp;
1916 int ret = -EINVAL; 1872 int ret = -EINVAL;
1917 int blocksize; 1873 int blocksize;
1918 int db_count; 1874 int db_count;
1919 int i; 1875 int i;
1920 int needs_recovery; 1876 int needs_recovery, has_huge_files;
1921 __le32 features; 1877 __le32 features;
1922 __u64 blocks_count; 1878 __u64 blocks_count;
1923 int err; 1879 int err;
@@ -1929,10 +1885,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1929 sbi->s_mount_opt = 0; 1885 sbi->s_mount_opt = 0;
1930 sbi->s_resuid = EXT4_DEF_RESUID; 1886 sbi->s_resuid = EXT4_DEF_RESUID;
1931 sbi->s_resgid = EXT4_DEF_RESGID; 1887 sbi->s_resgid = EXT4_DEF_RESGID;
1888 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
1932 sbi->s_sb_block = sb_block; 1889 sbi->s_sb_block = sb_block;
1933 1890
1934 unlock_kernel(); 1891 unlock_kernel();
1935 1892
1893 /* Cleanup superblock name */
1894 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
1895 *cp = '!';
1896
1936 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 1897 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1937 if (!blocksize) { 1898 if (!blocksize) {
1938 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); 1899 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -1972,11 +1933,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1972 set_opt(sbi->s_mount_opt, GRPID); 1933 set_opt(sbi->s_mount_opt, GRPID);
1973 if (def_mount_opts & EXT4_DEFM_UID16) 1934 if (def_mount_opts & EXT4_DEFM_UID16)
1974 set_opt(sbi->s_mount_opt, NO_UID32); 1935 set_opt(sbi->s_mount_opt, NO_UID32);
1975#ifdef CONFIG_EXT4DEV_FS_XATTR 1936#ifdef CONFIG_EXT4_FS_XATTR
1976 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 1937 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1977 set_opt(sbi->s_mount_opt, XATTR_USER); 1938 set_opt(sbi->s_mount_opt, XATTR_USER);
1978#endif 1939#endif
1979#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 1940#ifdef CONFIG_EXT4_FS_POSIX_ACL
1980 if (def_mount_opts & EXT4_DEFM_ACL) 1941 if (def_mount_opts & EXT4_DEFM_ACL)
1981 set_opt(sbi->s_mount_opt, POSIX_ACL); 1942 set_opt(sbi->s_mount_opt, POSIX_ACL);
1982#endif 1943#endif
@@ -2011,11 +1972,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2011 ext4_warning(sb, __func__, 1972 ext4_warning(sb, __func__,
2012 "extents feature not enabled on this filesystem, " 1973 "extents feature not enabled on this filesystem, "
2013 "use tune2fs.\n"); 1974 "use tune2fs.\n");
2014 /*
2015 * turn on mballoc code by default in ext4 filesystem
2016 * Use -o nomballoc to turn it off
2017 */
2018 set_opt(sbi->s_mount_opt, MBALLOC);
2019 1975
2020 /* 1976 /*
2021 * enable delayed allocation by default 1977 * enable delayed allocation by default
@@ -2040,16 +1996,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2040 "running e2fsck is recommended\n"); 1996 "running e2fsck is recommended\n");
2041 1997
2042 /* 1998 /*
2043 * Since ext4 is still considered development code, we require
2044 * that the TEST_FILESYS flag in s->flags be set.
2045 */
2046 if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
2047 printk(KERN_WARNING "EXT4-fs: %s: not marked "
2048 "OK to use with test code.\n", sb->s_id);
2049 goto failed_mount;
2050 }
2051
2052 /*
2053 * Check feature flags regardless of the revision level, since we 1999 * Check feature flags regardless of the revision level, since we
2054 * previously didn't change the revision level when setting the flags, 2000 * previously didn't change the revision level when setting the flags,
2055 * so there is a chance incompat flags are set on a rev 0 filesystem. 2001 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2068,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2068 sb->s_id, le32_to_cpu(features)); 2014 sb->s_id, le32_to_cpu(features));
2069 goto failed_mount; 2015 goto failed_mount;
2070 } 2016 }
2071 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 2017 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2018 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2019 if (has_huge_files) {
2072 /* 2020 /*
2073 * Large file size enabled file system can only be 2021 * Large file size enabled file system can only be
2074 * mount if kernel is build with CONFIG_LSF 2022 * mount if kernel is build with CONFIG_LSF
@@ -2118,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2118 } 2066 }
2119 } 2067 }
2120 2068
2121 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); 2069 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
2122 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); 2070 has_huge_files);
2071 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
2123 2072
2124 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 2073 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
2125 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; 2074 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2218,6 +2167,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2218 goto failed_mount; 2167 goto failed_mount;
2219 } 2168 }
2220 2169
2170#ifdef CONFIG_PROC_FS
2171 if (ext4_proc_root)
2172 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
2173
2174 if (sbi->s_proc)
2175 proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
2176 &ext4_ui_proc_fops,
2177 &sbi->s_inode_readahead_blks);
2178#endif
2179
2221 bgl_lock_init(&sbi->s_blockgroup_lock); 2180 bgl_lock_init(&sbi->s_blockgroup_lock);
2222 2181
2223 for (i = 0; i < db_count; i++) { 2182 for (i = 0; i < db_count; i++) {
@@ -2256,24 +2215,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2256 err = percpu_counter_init(&sbi->s_dirs_counter, 2215 err = percpu_counter_init(&sbi->s_dirs_counter,
2257 ext4_count_dirs(sb)); 2216 ext4_count_dirs(sb));
2258 } 2217 }
2218 if (!err) {
2219 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2220 }
2259 if (err) { 2221 if (err) {
2260 printk(KERN_ERR "EXT4-fs: insufficient memory\n"); 2222 printk(KERN_ERR "EXT4-fs: insufficient memory\n");
2261 goto failed_mount3; 2223 goto failed_mount3;
2262 } 2224 }
2263 2225
2264 /* per fileystem reservation list head & lock */
2265 spin_lock_init(&sbi->s_rsv_window_lock);
2266 sbi->s_rsv_window_root = RB_ROOT;
2267 /* Add a single, static dummy reservation to the start of the
2268 * reservation window list --- it gives us a placeholder for
2269 * append-at-start-of-list which makes the allocation logic
2270 * _much_ simpler. */
2271 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2272 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
2273 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
2274 sbi->s_rsv_window_head.rsv_goal_size = 0;
2275 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
2276
2277 sbi->s_stripe = ext4_get_stripe_size(sbi); 2226 sbi->s_stripe = ext4_get_stripe_size(sbi);
2278 2227
2279 /* 2228 /*
@@ -2443,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2443 "available.\n"); 2392 "available.\n");
2444 } 2393 }
2445 2394
2395 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2396 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2397 "requested data journaling mode\n");
2398 clear_opt(sbi->s_mount_opt, DELALLOC);
2399 } else if (test_opt(sb, DELALLOC))
2400 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2401
2402 ext4_ext_init(sb);
2403 err = ext4_mb_init(sb, needs_recovery);
2404 if (err) {
2405 printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
2406 err);
2407 goto failed_mount4;
2408 }
2409
2446 /* 2410 /*
2447 * akpm: core read_super() calls in here with the superblock locked. 2411 * akpm: core read_super() calls in here with the superblock locked.
2448 * That deadlocks, because orphan cleanup needs to lock the superblock 2412 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2462,16 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2462 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2426 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
2463 "writeback"); 2427 "writeback");
2464 2428
2465 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2466 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2467 "requested data journaling mode\n");
2468 clear_opt(sbi->s_mount_opt, DELALLOC);
2469 } else if (test_opt(sb, DELALLOC))
2470 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2471
2472 ext4_ext_init(sb);
2473 ext4_mb_init(sb, needs_recovery);
2474
2475 lock_kernel(); 2429 lock_kernel();
2476 return 0; 2430 return 0;
2477 2431
@@ -2488,11 +2442,16 @@ failed_mount3:
2488 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2442 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2489 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2443 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2490 percpu_counter_destroy(&sbi->s_dirs_counter); 2444 percpu_counter_destroy(&sbi->s_dirs_counter);
2445 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2491failed_mount2: 2446failed_mount2:
2492 for (i = 0; i < db_count; i++) 2447 for (i = 0; i < db_count; i++)
2493 brelse(sbi->s_group_desc[i]); 2448 brelse(sbi->s_group_desc[i]);
2494 kfree(sbi->s_group_desc); 2449 kfree(sbi->s_group_desc);
2495failed_mount: 2450failed_mount:
2451 if (sbi->s_proc) {
2452 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2453 remove_proc_entry(sb->s_id, ext4_proc_root);
2454 }
2496#ifdef CONFIG_QUOTA 2455#ifdef CONFIG_QUOTA
2497 for (i = 0; i < MAXQUOTAS; i++) 2456 for (i = 0; i < MAXQUOTAS; i++)
2498 kfree(sbi->s_qf_names[i]); 2457 kfree(sbi->s_qf_names[i]);
@@ -2526,6 +2485,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2526 journal->j_flags |= JBD2_BARRIER; 2485 journal->j_flags |= JBD2_BARRIER;
2527 else 2486 else
2528 journal->j_flags &= ~JBD2_BARRIER; 2487 journal->j_flags &= ~JBD2_BARRIER;
2488 if (test_opt(sb, DATA_ERR_ABORT))
2489 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
2490 else
2491 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
2529 spin_unlock(&journal->j_state_lock); 2492 spin_unlock(&journal->j_state_lock);
2530} 2493}
2531 2494
@@ -2551,7 +2514,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2551 return NULL; 2514 return NULL;
2552 } 2515 }
2553 2516
2554 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2517 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2555 journal_inode, journal_inode->i_size); 2518 journal_inode, journal_inode->i_size);
2556 if (!S_ISREG(journal_inode->i_mode)) { 2519 if (!S_ISREG(journal_inode->i_mode)) {
2557 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2520 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2714,6 +2677,11 @@ static int ext4_load_journal(struct super_block *sb,
2714 return -EINVAL; 2677 return -EINVAL;
2715 } 2678 }
2716 2679
2680 if (journal->j_flags & JBD2_BARRIER)
2681 printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2682 else
2683 printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2684
2717 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2685 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2718 err = jbd2_journal_update_format(journal); 2686 err = jbd2_journal_update_format(journal);
2719 if (err) { 2687 if (err) {
@@ -2798,13 +2766,34 @@ static void ext4_commit_super(struct super_block *sb,
2798 2766
2799 if (!sbh) 2767 if (!sbh)
2800 return; 2768 return;
2769 if (buffer_write_io_error(sbh)) {
2770 /*
2771 * Oh, dear. A previous attempt to write the
2772 * superblock failed. This could happen because the
2773 * USB device was yanked out. Or it could happen to
2774 * be a transient write error and maybe the block will
2775 * be remapped. Nothing we can do but to retry the
2776 * write and hope for the best.
2777 */
2778 printk(KERN_ERR "ext4: previous I/O error to "
2779 "superblock detected for %s.\n", sb->s_id);
2780 clear_buffer_write_io_error(sbh);
2781 set_buffer_uptodate(sbh);
2782 }
2801 es->s_wtime = cpu_to_le32(get_seconds()); 2783 es->s_wtime = cpu_to_le32(get_seconds());
2802 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2784 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2803 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2785 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2804 BUFFER_TRACE(sbh, "marking dirty"); 2786 BUFFER_TRACE(sbh, "marking dirty");
2805 mark_buffer_dirty(sbh); 2787 mark_buffer_dirty(sbh);
2806 if (sync) 2788 if (sync) {
2807 sync_dirty_buffer(sbh); 2789 sync_dirty_buffer(sbh);
2790 if (buffer_write_io_error(sbh)) {
2791 printk(KERN_ERR "ext4: I/O error while writing "
2792 "superblock for %s.\n", sb->s_id);
2793 clear_buffer_write_io_error(sbh);
2794 set_buffer_uptodate(sbh);
2795 }
2796 }
2808} 2797}
2809 2798
2810 2799
@@ -2819,7 +2808,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2819 journal_t *journal = EXT4_SB(sb)->s_journal; 2808 journal_t *journal = EXT4_SB(sb)->s_journal;
2820 2809
2821 jbd2_journal_lock_updates(journal); 2810 jbd2_journal_lock_updates(journal);
2822 jbd2_journal_flush(journal); 2811 if (jbd2_journal_flush(journal) < 0)
2812 goto out;
2813
2823 lock_super(sb); 2814 lock_super(sb);
2824 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2815 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2825 sb->s_flags & MS_RDONLY) { 2816 sb->s_flags & MS_RDONLY) {
@@ -2828,6 +2819,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2828 ext4_commit_super(sb, es, 1); 2819 ext4_commit_super(sb, es, 1);
2829 } 2820 }
2830 unlock_super(sb); 2821 unlock_super(sb);
2822
2823out:
2831 jbd2_journal_unlock_updates(journal); 2824 jbd2_journal_unlock_updates(journal);
2832} 2825}
2833 2826
@@ -2906,6 +2899,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2906{ 2899{
2907 tid_t target; 2900 tid_t target;
2908 2901
2902 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2909 sb->s_dirt = 0; 2903 sb->s_dirt = 0;
2910 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 2904 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2911 if (wait) 2905 if (wait)
@@ -2927,7 +2921,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2927 2921
2928 /* Now we set up the journal barrier. */ 2922 /* Now we set up the journal barrier. */
2929 jbd2_journal_lock_updates(journal); 2923 jbd2_journal_lock_updates(journal);
2930 jbd2_journal_flush(journal); 2924
2925 /*
2926 * We don't want to clear needs_recovery flag when we failed
2927 * to flush the journal.
2928 */
2929 if (jbd2_journal_flush(journal) < 0)
2930 return;
2931 2931
2932 /* Journal blocked and flushed, clear needs_recovery flag. */ 2932 /* Journal blocked and flushed, clear needs_recovery flag. */
2933 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2933 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -3161,7 +3161,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3161 buf->f_type = EXT4_SUPER_MAGIC; 3161 buf->f_type = EXT4_SUPER_MAGIC;
3162 buf->f_bsize = sb->s_blocksize; 3162 buf->f_bsize = sb->s_blocksize;
3163 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3163 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3164 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); 3164 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3165 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3165 ext4_free_blocks_count_set(es, buf->f_bfree); 3166 ext4_free_blocks_count_set(es, buf->f_bfree);
3166 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3167 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3167 if (buf->f_bfree < ext4_r_blocks_count(es)) 3168 if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3366,8 +3367,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3366 * otherwise be livelocked... 3367 * otherwise be livelocked...
3367 */ 3368 */
3368 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 3369 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
3369 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 3370 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
3370 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3371 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3372 if (err) {
3373 path_put(&nd.path);
3374 return err;
3375 }
3371 } 3376 }
3372 3377
3373 err = vfs_quota_on_path(sb, type, format_id, &nd.path); 3378 err = vfs_quota_on_path(sb, type, format_id, &nd.path);
@@ -3431,7 +3436,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3431 handle_t *handle = journal_current_handle(); 3436 handle_t *handle = journal_current_handle();
3432 3437
3433 if (!handle) { 3438 if (!handle) {
3434 printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" 3439 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3435 " cancelled because transaction is not started.\n", 3440 " cancelled because transaction is not started.\n",
3436 (unsigned long long)off, (unsigned long long)len); 3441 (unsigned long long)off, (unsigned long long)len);
3437 return -EIO; 3442 return -EIO;
@@ -3492,18 +3497,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3492 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3497 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3493} 3498}
3494 3499
3500#ifdef CONFIG_PROC_FS
3501static int ext4_ui_proc_show(struct seq_file *m, void *v)
3502{
3503 unsigned int *p = m->private;
3504
3505 seq_printf(m, "%u\n", *p);
3506 return 0;
3507}
3508
3509static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3510{
3511 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3512}
3513
3514static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3515 size_t cnt, loff_t *ppos)
3516{
3517 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3518 char str[32];
3519 unsigned long value;
3520
3521 if (cnt >= sizeof(str))
3522 return -EINVAL;
3523 if (copy_from_user(str, buf, cnt))
3524 return -EFAULT;
3525 value = simple_strtol(str, NULL, 0);
3526 if (value < 0)
3527 return -ERANGE;
3528 *p = value;
3529 return cnt;
3530}
3531
3532const struct file_operations ext4_ui_proc_fops = {
3533 .owner = THIS_MODULE,
3534 .open = ext4_ui_proc_open,
3535 .read = seq_read,
3536 .llseek = seq_lseek,
3537 .release = single_release,
3538 .write = ext4_ui_proc_write,
3539};
3540#endif
3541
3542static struct file_system_type ext4_fs_type = {
3543 .owner = THIS_MODULE,
3544 .name = "ext4",
3545 .get_sb = ext4_get_sb,
3546 .kill_sb = kill_block_super,
3547 .fs_flags = FS_REQUIRES_DEV,
3548};
3549
3550#ifdef CONFIG_EXT4DEV_COMPAT
3551static int ext4dev_get_sb(struct file_system_type *fs_type,
3552 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3553{
3554 printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3555 "to mount using ext4\n");
3556 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3557 "will go away by 2.6.31\n");
3558 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3559}
3560
3495static struct file_system_type ext4dev_fs_type = { 3561static struct file_system_type ext4dev_fs_type = {
3496 .owner = THIS_MODULE, 3562 .owner = THIS_MODULE,
3497 .name = "ext4dev", 3563 .name = "ext4dev",
3498 .get_sb = ext4_get_sb, 3564 .get_sb = ext4dev_get_sb,
3499 .kill_sb = kill_block_super, 3565 .kill_sb = kill_block_super,
3500 .fs_flags = FS_REQUIRES_DEV, 3566 .fs_flags = FS_REQUIRES_DEV,
3501}; 3567};
3568MODULE_ALIAS("ext4dev");
3569#endif
3502 3570
3503static int __init init_ext4_fs(void) 3571static int __init init_ext4_fs(void)
3504{ 3572{
3505 int err; 3573 int err;
3506 3574
3575 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3507 err = init_ext4_mballoc(); 3576 err = init_ext4_mballoc();
3508 if (err) 3577 if (err)
3509 return err; 3578 return err;
@@ -3514,9 +3583,16 @@ static int __init init_ext4_fs(void)
3514 err = init_inodecache(); 3583 err = init_inodecache();
3515 if (err) 3584 if (err)
3516 goto out1; 3585 goto out1;
3517 err = register_filesystem(&ext4dev_fs_type); 3586 err = register_filesystem(&ext4_fs_type);
3518 if (err) 3587 if (err)
3519 goto out; 3588 goto out;
3589#ifdef CONFIG_EXT4DEV_COMPAT
3590 err = register_filesystem(&ext4dev_fs_type);
3591 if (err) {
3592 unregister_filesystem(&ext4_fs_type);
3593 goto out;
3594 }
3595#endif
3520 return 0; 3596 return 0;
3521out: 3597out:
3522 destroy_inodecache(); 3598 destroy_inodecache();
@@ -3529,10 +3605,14 @@ out2:
3529 3605
3530static void __exit exit_ext4_fs(void) 3606static void __exit exit_ext4_fs(void)
3531{ 3607{
3608 unregister_filesystem(&ext4_fs_type);
3609#ifdef CONFIG_EXT4DEV_COMPAT
3532 unregister_filesystem(&ext4dev_fs_type); 3610 unregister_filesystem(&ext4dev_fs_type);
3611#endif
3533 destroy_inodecache(); 3612 destroy_inodecache();
3534 exit_ext4_xattr(); 3613 exit_ext4_xattr();
3535 exit_ext4_mballoc(); 3614 exit_ext4_mballoc();
3615 remove_proc_entry("fs/ext4", NULL);
3536} 3616}
3537 3617
3538MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3618MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc01..00740cb32be3 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
23#include "ext4.h" 23#include "ext4.h"
24#include "xattr.h" 24#include "xattr.h"
25 25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data); 29 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 30 return NULL;
31} 31}
32 32
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR 37#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 45const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR 48#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr, 51 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b4893..80626d516fee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, 104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
105#endif 105#endif
106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, 106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
107#ifdef CONFIG_EXT4DEV_FS_SECURITY 107#ifdef CONFIG_EXT4_FS_SECURITY
108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, 108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
109#endif 109#endif
110}; 110};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
112struct xattr_handler *ext4_xattr_handlers[] = { 112struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
116 &ext4_xattr_acl_access_handler, 116 &ext4_xattr_acl_access_handler,
117 &ext4_xattr_acl_default_handler, 117 &ext4_xattr_acl_default_handler,
118#endif 118#endif
119#ifdef CONFIG_EXT4DEV_FS_SECURITY 119#ifdef CONFIG_EXT4_FS_SECURITY
120 &ext4_xattr_security_handler, 120 &ext4_xattr_security_handler,
121#endif 121#endif
122 NULL 122 NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
959 struct ext4_xattr_block_find bs = { 959 struct ext4_xattr_block_find bs = {
960 .s = { .not_found = -ENODATA, }, 960 .s = { .not_found = -ENODATA, },
961 }; 961 };
962 unsigned long no_expand;
962 int error; 963 int error;
963 964
964 if (!name) 965 if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
966 if (strlen(name) > 255) 967 if (strlen(name) > 255)
967 return -ERANGE; 968 return -ERANGE;
968 down_write(&EXT4_I(inode)->xattr_sem); 969 down_write(&EXT4_I(inode)->xattr_sem);
970 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
971 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
972
969 error = ext4_get_inode_loc(inode, &is.iloc); 973 error = ext4_get_inode_loc(inode, &is.iloc);
970 if (error) 974 if (error)
971 goto cleanup; 975 goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1042cleanup: 1046cleanup:
1043 brelse(is.iloc.bh); 1047 brelse(is.iloc.bh);
1044 brelse(bs.bh); 1048 brelse(bs.bh);
1049 if (no_expand == 0)
1050 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1045 up_write(&EXT4_I(inode)->xattr_sem); 1051 up_write(&EXT4_I(inode)->xattr_sem);
1046 return error; 1052 return error;
1047} 1053}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb9..8ede88b18c29 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
51 (((name_len) + EXT4_XATTR_ROUND + \ 51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) 52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \ 53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \ 54 ((struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) 55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
63 EXT4_I(inode)->i_extra_isize)) 63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65 65
66# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4DEV_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
93static inline int 93static inline int
94ext4_xattr_get(struct inode *inode, int name_index, const char *name, 94ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
141 141
142#define ext4_xattr_handlers NULL 142#define ext4_xattr_handlers NULL
143 143
144# endif /* CONFIG_EXT4DEV_FS_XATTR */ 144# endif /* CONFIG_EXT4_FS_XATTR */
145 145
146#ifdef CONFIG_EXT4DEV_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir);
149#else 149#else
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/msdos_fs.h> 8#include <linux/msdos_fs.h>
9#include <linux/blkdev.h>
9 10
10struct fatent_operations { 11struct fatent_operations {
11 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); 12 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
535 struct fat_entry fatent; 536 struct fat_entry fatent;
536 struct buffer_head *bhs[MAX_BUF_PER_PAGE]; 537 struct buffer_head *bhs[MAX_BUF_PER_PAGE];
537 int i, err, nr_bhs; 538 int i, err, nr_bhs;
539 int first_cl = cluster;
538 540
539 nr_bhs = 0; 541 nr_bhs = 0;
540 fatent_init(&fatent); 542 fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
551 goto error; 553 goto error;
552 } 554 }
553 555
556 /*
557 * Issue discard for the sectors we no longer care about,
558 * batching contiguous clusters into one request
559 */
560 if (cluster != fatent.entry + 1) {
561 int nr_clus = fatent.entry - first_cl + 1;
562
563 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
564 nr_clus * sbi->sec_per_clus);
565 first_cl = cluster;
566 }
567
554 ops->ent_put(&fatent, FAT_ENT_FREE); 568 ops->ent_put(&fatent, FAT_ENT_FREE);
555 if (sbi->free_clusters != -1) { 569 if (sbi->free_clusters != -1) {
556 sbi->free_clusters++; 570 sbi->free_clusters++;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 80ff3381fa21..d12cdf2a0406 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -855,7 +855,7 @@ enum {
855 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, 855 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
856}; 856};
857 857
858static match_table_t fat_tokens = { 858static const match_table_t fat_tokens = {
859 {Opt_check_r, "check=relaxed"}, 859 {Opt_check_r, "check=relaxed"},
860 {Opt_check_s, "check=strict"}, 860 {Opt_check_s, "check=strict"},
861 {Opt_check_n, "check=normal"}, 861 {Opt_check_n, "check=normal"},
@@ -890,14 +890,14 @@ static match_table_t fat_tokens = {
890 {Opt_tz_utc, "tz=UTC"}, 890 {Opt_tz_utc, "tz=UTC"},
891 {Opt_err, NULL}, 891 {Opt_err, NULL},
892}; 892};
893static match_table_t msdos_tokens = { 893static const match_table_t msdos_tokens = {
894 {Opt_nodots, "nodots"}, 894 {Opt_nodots, "nodots"},
895 {Opt_nodots, "dotsOK=no"}, 895 {Opt_nodots, "dotsOK=no"},
896 {Opt_dots, "dots"}, 896 {Opt_dots, "dots"},
897 {Opt_dots, "dotsOK=yes"}, 897 {Opt_dots, "dotsOK=yes"},
898 {Opt_err, NULL} 898 {Opt_err, NULL}
899}; 899};
900static match_table_t vfat_tokens = { 900static const match_table_t vfat_tokens = {
901 {Opt_charset, "iocharset=%s"}, 901 {Opt_charset, "iocharset=%s"},
902 {Opt_shortname_lower, "shortname=lower"}, 902 {Opt_shortname_lower, "shortname=lower"},
903 {Opt_shortname_win95, "shortname=win95"}, 903 {Opt_shortname_win95, "shortname=win95"},
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3c693a..d0ff0b8cf309 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -8,7 +8,7 @@
8 * pages against inodes. ie: data writeback. Writeout of the 8 * pages against inodes. ie: data writeback. Writeout of the
9 * inode itself is not handled here. 9 * inode itself is not handled here.
10 * 10 *
11 * 10Apr2002 akpm@zip.com.au 11 * 10Apr2002 Andrew Morton
12 * Split out of fs/inode.c 12 * Split out of fs/inode.c
13 * Additions for address_space-based writeback 13 * Additions for address_space-based writeback
14 */ 14 */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d2249f174e20..6a84388cacff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -354,7 +354,7 @@ enum {
354 OPT_ERR 354 OPT_ERR
355}; 355};
356 356
357static match_table_t tokens = { 357static const match_table_t tokens = {
358 {OPT_FD, "fd=%u"}, 358 {OPT_FD, "fd=%u"},
359 {OPT_ROOTMODE, "rootmode=%o"}, 359 {OPT_ROOTMODE, "rootmode=%o"},
360 {OPT_USER_ID, "user_id=%u"}, 360 {OPT_USER_ID, "user_id=%u"},
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1266 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1267 delay = holdtime - now; 1267 delay = holdtime - now;
1268 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1269 delay = gl->gl_ops->go_min_hold_time;
1268 1270
1269 spin_lock(&gl->gl_spin); 1271 spin_lock(&gl->gl_spin);
1270 handle_callback(gl, state, 1, delay); 1272 handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1578 *p++ = 'a'; 1580 *p++ = 'a';
1579 if (flags & GL_EXACT) 1581 if (flags & GL_EXACT)
1580 *p++ = 'E'; 1582 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE) 1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c'; 1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags)) 1585 if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
1816 if (gl) { 1816 if (gl) {
1817 gi->gl = hlist_entry(gl->gl_list.next, 1817 gi->gl = hlist_entry(gl->gl_list.next,
1818 struct gfs2_glock, gl_list); 1818 struct gfs2_glock, gl_list);
1819 if (gi->gl) 1819 } else {
1820 gfs2_glock_hold(gi->gl); 1820 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1821 struct gfs2_glock, gl_list);
1821 } 1822 }
1823 if (gi->gl)
1824 gfs2_glock_hold(gi->gl);
1822 read_unlock(gl_lock_addr(gi->hash)); 1825 read_unlock(gl_lock_addr(gi->hash));
1823 if (gl) 1826 if (gl)
1824 gfs2_glock_put(gl); 1827 gfs2_glock_put(gl);
1825 if (gl && gi->gl == NULL)
1826 gi->hash++;
1827 while (gi->gl == NULL) { 1828 while (gi->gl == NULL) {
1829 gi->hash++;
1828 if (gi->hash >= GFS2_GL_HASH_SIZE) 1830 if (gi->hash >= GFS2_GL_HASH_SIZE)
1829 return 1; 1831 return 1;
1830 read_lock(gl_lock_addr(gi->hash)); 1832 read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
1833 if (gi->gl) 1835 if (gi->gl)
1834 gfs2_glock_hold(gi->gl); 1836 gfs2_glock_hold(gi->gl);
1835 read_unlock(gl_lock_addr(gi->hash)); 1837 read_unlock(gl_lock_addr(gi->hash));
1836 gi->hash++;
1837 } 1838 }
1838 1839
1839 if (gi->sdp != gi->gl->gl_sbd) 1840 if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
24#define GL_ASYNC 0x00000040 24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 27#define GL_NOCACHE 0x00000400
29 28
30#define GLR_TRYFAILED 13 29#define GLR_TRYFAILED 13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
386#define GFS2_DATA_ORDERED 2 386#define GFS2_DATA_ORDERED 2
387 387
388struct gfs2_args { 388struct gfs2_args {
389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
392 int ar_spectator; /* Don't get a journal because we're always RO */ 392 unsigned int ar_spectator:1; /* Don't get a journal */
393 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ 393 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
394 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ 394 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
395 int ar_localcaching; /* Local-style caching (dangerous on multihost) */ 395 unsigned int ar_localcaching:1; /* Local caching */
396 int ar_debug; /* Oops on errors instead of trying to be graceful */ 396 unsigned int ar_debug:1; /* Oops on errors */
397 int ar_upgrade; /* Upgrade ondisk/multihost format */ 397 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
398 unsigned int ar_num_glockd; /* Number of glockd threads */ 398 unsigned int ar_posix_acl:1; /* Enable posix acls */
399 int ar_posix_acl; /* Enable posix acls */ 399 unsigned int ar_quota:2; /* off/account/on */
400 int ar_quota; /* off/account/on */ 400 unsigned int ar_suiddir:1; /* suiddir support */
401 int ar_suiddir; /* suiddir support */ 401 unsigned int ar_data:2; /* ordered/writeback */
402 int ar_data; /* ordered/writeback */ 402 unsigned int ar_meta:1; /* mount metafs */
403 unsigned int ar_num_glockd; /* Number of glockd threads */
403}; 404};
404 405
405struct gfs2_tune { 406struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
419 unsigned int gt_quota_scale_den; /* Denominator */ 420 unsigned int gt_quota_scale_den; /* Denominator */
420 unsigned int gt_quota_cache_secs; 421 unsigned int gt_quota_cache_secs;
421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 422 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
423 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
425 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
432 SDF_JOURNAL_CHECKED = 0, 432 SDF_JOURNAL_CHECKED = 0,
433 SDF_JOURNAL_LIVE = 1, 433 SDF_JOURNAL_LIVE = 1,
434 SDF_SHUTDOWN = 2, 434 SDF_SHUTDOWN = 2,
435 SDF_NOATIME = 3, 435 SDF_NOBARRIERS = 3,
436}; 436};
437 437
438#define GFS2_FSNAME_LEN 256 438#define GFS2_FSNAME_LEN 256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
461 461
462struct gfs2_sbd { 462struct gfs2_sbd {
463 struct super_block *sd_vfs; 463 struct super_block *sd_vfs;
464 struct super_block *sd_vfs_meta;
465 struct kobject sd_kobj; 464 struct kobject sd_kobj;
466 unsigned long sd_flags; /* SDF_... */ 465 unsigned long sd_flags; /* SDF_... */
467 struct gfs2_sb_host sd_sb; 466 struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
499 498
500 /* Inode Stuff */ 499 /* Inode Stuff */
501 500
502 struct inode *sd_master_dir; 501 struct dentry *sd_master_dir;
502 struct dentry *sd_root_dir;
503
503 struct inode *sd_jindex; 504 struct inode *sd_jindex;
504 struct inode *sd_inum_inode; 505 struct inode *sd_inum_inode;
505 struct inode *sd_statfs_inode; 506 struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
634 /* Debugging crud */ 635 /* Debugging crud */
635 636
636 unsigned long sd_last_warning; 637 unsigned long sd_last_warning;
637 struct vfsmount *sd_gfs2mnt;
638 struct dentry *debugfs_dir; /* debugfs directory */ 638 struct dentry *debugfs_dir; /* debugfs directory */
639 struct dentry *debugfs_dentry_glocks; /* for debugfs */ 639 struct dentry *debugfs_dentry_glocks; /* for debugfs */
640}; 640};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a32948..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h> 19#include <linux/lm_interface.h>
20#include <linux/security.h> 20#include <linux/security.h>
21#include <linux/time.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
249{ 250{
250 struct gfs2_dinode_host *di = &ip->i_di; 251 struct gfs2_dinode_host *di = &ip->i_di;
251 const struct gfs2_dinode *str = buf; 252 const struct gfs2_dinode *str = buf;
253 struct timespec atime;
252 u16 height, depth; 254 u16 height, depth;
253 255
254 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) 256 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
275 di->di_size = be64_to_cpu(str->di_size); 277 di->di_size = be64_to_cpu(str->di_size);
276 i_size_write(&ip->i_inode, di->di_size); 278 i_size_write(&ip->i_inode, di->di_size);
277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
278 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 280 atime.tv_sec = be64_to_cpu(str->di_atime);
279 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
283 ip->i_inode.i_atime = atime;
280 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 284 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
281 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); 285 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
282 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 286 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1033 1037
1034 if (bh) 1038 if (bh)
1035 brelse(bh); 1039 brelse(bh);
1036 if (!inode)
1037 return ERR_PTR(-ENOMEM);
1038 return inode; 1040 return inode;
1039 1041
1040fail_gunlock2: 1042fail_gunlock2:
1041 gfs2_glock_dq_uninit(ghs + 1); 1043 gfs2_glock_dq_uninit(ghs + 1);
1042 if (inode) 1044 if (inode && !IS_ERR(inode))
1043 iput(inode); 1045 iput(inode);
1044fail_gunlock: 1046fail_gunlock:
1045 gfs2_glock_dq(ghs); 1047 gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1140 return 0; 1142 return 0;
1141} 1143}
1142 1144
1143/*
1144 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1145 * @this: move this
1146 * @to: to here
1147 *
1148 * Follow @to back to the root and make sure we don't encounter @this
1149 * Assumes we already hold the rename lock.
1150 *
1151 * Returns: errno
1152 */
1153
1154int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1155{
1156 struct inode *dir = &to->i_inode;
1157 struct super_block *sb = dir->i_sb;
1158 struct inode *tmp;
1159 struct qstr dotdot;
1160 int error = 0;
1161
1162 gfs2_str2qstr(&dotdot, "..");
1163
1164 igrab(dir);
1165
1166 for (;;) {
1167 if (dir == &this->i_inode) {
1168 error = -EINVAL;
1169 break;
1170 }
1171 if (dir == sb->s_root->d_inode) {
1172 error = 0;
1173 break;
1174 }
1175
1176 tmp = gfs2_lookupi(dir, &dotdot, 1);
1177 if (IS_ERR(tmp)) {
1178 error = PTR_ERR(tmp);
1179 break;
1180 }
1181
1182 iput(dir);
1183 dir = tmp;
1184 }
1185
1186 iput(dir);
1187
1188 return error;
1189}
1190
1191/** 1145/**
1192 * gfs2_readlinki - return the contents of a symlink 1146 * gfs2_readlinki - return the contents of a symlink
1193 * @ip: the symlink's inode 1147 * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1207 unsigned int x; 1161 unsigned int x;
1208 int error; 1162 int error;
1209 1163
1210 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 1164 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1211 error = gfs2_glock_nq_atime(&i_gh); 1165 error = gfs2_glock_nq(&i_gh);
1212 if (error) { 1166 if (error) {
1213 gfs2_holder_uninit(&i_gh); 1167 gfs2_holder_uninit(&i_gh);
1214 return error; 1168 return error;
@@ -1243,101 +1197,6 @@ out:
1243 return error; 1197 return error;
1244} 1198}
1245 1199
1246/**
1247 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1248 * conditionally update the inode's atime
1249 * @gh: the holder to acquire
1250 *
1251 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1252 * Update if the difference between the current time and the inode's current
1253 * atime is greater than an interval specified at mount.
1254 *
1255 * Returns: errno
1256 */
1257
1258int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1259{
1260 struct gfs2_glock *gl = gh->gh_gl;
1261 struct gfs2_sbd *sdp = gl->gl_sbd;
1262 struct gfs2_inode *ip = gl->gl_object;
1263 s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1264 unsigned int state;
1265 int flags;
1266 int error;
1267 struct timespec tv = CURRENT_TIME;
1268
1269 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1270 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1271 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1272 return -EINVAL;
1273
1274 state = gh->gh_state;
1275 flags = gh->gh_flags;
1276
1277 error = gfs2_glock_nq(gh);
1278 if (error)
1279 return error;
1280
1281 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1282 (sdp->sd_vfs->s_flags & MS_RDONLY))
1283 return 0;
1284
1285 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1286 gfs2_glock_dq(gh);
1287 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1288 gh);
1289 error = gfs2_glock_nq(gh);
1290 if (error)
1291 return error;
1292
1293 /* Verify that atime hasn't been updated while we were
1294 trying to get exclusive lock. */
1295
1296 tv = CURRENT_TIME;
1297 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1298 struct buffer_head *dibh;
1299 struct gfs2_dinode *di;
1300
1301 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1302 if (error == -EROFS)
1303 return 0;
1304 if (error)
1305 goto fail;
1306
1307 error = gfs2_meta_inode_buffer(ip, &dibh);
1308 if (error)
1309 goto fail_end_trans;
1310
1311 ip->i_inode.i_atime = tv;
1312
1313 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1314 di = (struct gfs2_dinode *)dibh->b_data;
1315 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1316 di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1317 brelse(dibh);
1318
1319 gfs2_trans_end(sdp);
1320 }
1321
1322 /* If someone else has asked for the glock,
1323 unlock and let them have it. Then reacquire
1324 in the original state. */
1325 if (gfs2_glock_is_blocking(gl)) {
1326 gfs2_glock_dq(gh);
1327 gfs2_holder_reinit(state, flags, gh);
1328 return gfs2_glock_nq(gh);
1329 }
1330 }
1331
1332 return 0;
1333
1334fail_end_trans:
1335 gfs2_trans_end(sdp);
1336fail:
1337 gfs2_glock_dq(gh);
1338 return error;
1339}
1340
1341static int 1200static int
1342__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1201__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1343{ 1202{
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a86..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
92 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask); 93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 94int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
97int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 95int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
98struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 96struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
99void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f48..0c4cbe6c8285 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
144 144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), 145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace, 146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), 147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
148 GDLM_LVB_SIZE); 149 GDLM_LVB_SIZE);
149 if (error) { 150 if (error) {
150 log_error("dlm_new_lockspace error %d", error); 151 log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/bio.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
584 memset(bh->b_data, 0, bh->b_size); 585 memset(bh->b_data, 0, bh->b_size);
585 set_buffer_uptodate(bh); 586 set_buffer_uptodate(bh);
586 clear_buffer_dirty(bh); 587 clear_buffer_dirty(bh);
587 unlock_buffer(bh);
588 588
589 gfs2_ail1_empty(sdp, 0); 589 gfs2_ail1_empty(sdp, 0);
590 tail = current_tail(sdp); 590 tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); 601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
602 lh->lh_hash = cpu_to_be32(hash); 602 lh->lh_hash = cpu_to_be32(hash);
603 603
604 set_buffer_dirty(bh); 604 bh->b_end_io = end_buffer_write_sync;
605 if (sync_dirty_buffer(bh)) 605 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
606 goto skip_barrier;
607 get_bh(bh);
608 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
609 wait_on_buffer(bh);
610 if (buffer_eopnotsupp(bh)) {
611 clear_buffer_eopnotsupp(bh);
612 set_buffer_uptodate(bh);
613 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
614 lock_buffer(bh);
615skip_barrier:
616 get_bh(bh);
617 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
618 wait_on_buffer(bh);
619 }
620 if (!buffer_uptodate(bh))
606 gfs2_io_error_bh(sdp, bh); 621 gfs2_io_error_bh(sdp, bh);
607 brelse(bh); 622 brelse(bh);
608 623
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..f96eb90a2cfa 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,10 +42,11 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_meta,
45 Opt_err, 46 Opt_err,
46}; 47};
47 48
48static match_table_t tokens = { 49static const match_table_t tokens = {
49 {Opt_lockproto, "lockproto=%s"}, 50 {Opt_lockproto, "lockproto=%s"},
50 {Opt_locktable, "locktable=%s"}, 51 {Opt_locktable, "locktable=%s"},
51 {Opt_hostdata, "hostdata=%s"}, 52 {Opt_hostdata, "hostdata=%s"},
@@ -66,6 +67,7 @@ static match_table_t tokens = {
66 {Opt_nosuiddir, "nosuiddir"}, 67 {Opt_nosuiddir, "nosuiddir"},
67 {Opt_data_writeback, "data=writeback"}, 68 {Opt_data_writeback, "data=writeback"},
68 {Opt_data_ordered, "data=ordered"}, 69 {Opt_data_ordered, "data=ordered"},
70 {Opt_meta, "meta"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
70}; 72};
71 73
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
239 case Opt_data_ordered: 241 case Opt_data_ordered:
240 args->ar_data = GFS2_DATA_ORDERED; 242 args->ar_data = GFS2_DATA_ORDERED;
241 break; 243 break;
244 case Opt_meta:
245 if (remount && args->ar_meta != 1)
246 goto cant_remount;
247 args->ar_meta = 1;
248 break;
242 case Opt_err: 249 case Opt_err:
243 default: 250 default:
244 fs_info(sdp, "unknown option: %s\n", o); 251 fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
512 int error; 512 int error;
513 513
514 unlock_page(page); 514 unlock_page(page);
515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
516 error = gfs2_glock_nq_atime(&gh); 516 error = gfs2_glock_nq(&gh);
517 if (unlikely(error)) 517 if (unlikely(error))
518 goto out; 518 goto out;
519 error = AOP_TRUNCATED_PAGE; 519 error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
594 struct gfs2_holder gh; 594 struct gfs2_holder gh;
595 int ret; 595 int ret;
596 596
597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 ret = gfs2_glock_nq_atime(&gh); 598 ret = gfs2_glock_nq(&gh);
599 if (unlikely(ret)) 599 if (unlikely(ret))
600 goto out_uninit; 600 goto out_uninit;
601 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
636 unsigned to = from + len; 636 unsigned to = from + len;
637 struct page *page; 637 struct page *page;
638 638
639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); 639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
640 error = gfs2_glock_nq_atime(&ip->i_gh); 640 error = gfs2_glock_nq(&ip->i_gh);
641 if (unlikely(error)) 641 if (unlikely(error))
642 goto out_uninit; 642 goto out_uninit;
643 643
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
975 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
976 return 0; 976 return 0;
977 977
978 if (offset > i_size_read(&ip->i_inode)) 978 if (offset >= i_size_read(&ip->i_inode))
979 return 0; 979 return 0;
980 return 1; 980 return 1;
981} 981}
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1000 * unfortunately have the option of only flushing a range like 1000 * unfortunately have the option of only flushing a range like
1001 * the VFS does. 1001 * the VFS does.
1002 */ 1002 */
1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); 1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
1004 rv = gfs2_glock_nq_atime(&gh); 1004 rv = gfs2_glock_nq(&gh);
1005 if (rv) 1005 if (rv)
1006 return rv; 1006 return rv;
1007 rv = gfs2_ok_for_dio(ip, rw, offset); 1007 rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
89 u64 offset = file->f_pos; 89 u64 offset = file->f_pos;
90 int error; 90 int error;
91 91
92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); 92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
93 error = gfs2_glock_nq_atime(&d_gh); 93 error = gfs2_glock_nq(&d_gh);
94 if (error) { 94 if (error) {
95 gfs2_holder_uninit(&d_gh); 95 gfs2_holder_uninit(&d_gh);
96 return error; 96 return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
153 int error; 153 int error;
154 u32 fsflags; 154 u32 fsflags;
155 155
156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
157 error = gfs2_glock_nq_atime(&gh); 157 error = gfs2_glock_nq(&gh);
158 if (error) 158 if (error)
159 return error; 159 return error;
160 160
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
351 struct gfs2_alloc *al; 351 struct gfs2_alloc *al;
352 int ret; 352 int ret;
353 353
354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); 354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
355 ret = gfs2_glock_nq_atime(&gh); 355 ret = gfs2_glock_nq(&gh);
356 if (ret) 356 if (ret)
357 goto out; 357 goto out;
358 358
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
434 struct gfs2_holder i_gh; 434 struct gfs2_holder i_gh;
435 int error; 435 int error;
436 436
437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
438 error = gfs2_glock_nq_atime(&i_gh); 438 error = gfs2_glock_nq(&i_gh);
439 if (error) { 439 if (error) {
440 gfs2_holder_uninit(&i_gh); 440 gfs2_holder_uninit(&i_gh);
441 return error; 441 return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
40#define DO 0 40#define DO 0
41#define UNDO 1 41#define UNDO 1
42 42
43static const u32 gfs2_old_fs_formats[] = {
44 0
45};
46
47static const u32 gfs2_old_multihost_formats[] = {
48 0
49};
50
51/**
52 * gfs2_tune_init - Fill a gfs2_tune structure with default values
53 * @gt: tune
54 *
55 */
56
57static void gfs2_tune_init(struct gfs2_tune *gt)
58{
59 spin_lock_init(&gt->gt_spin);
60
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1;
70 gt->gt_quota_scale_den = 1;
71 gt->gt_quota_cache_secs = 300;
72 gt->gt_quota_quantum = 60;
73 gt->gt_new_files_jdata = 0;
74 gt->gt_max_readahead = 1 << 18;
75 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10;
77 gt->gt_statfs_quantum = 30;
78 gt->gt_statfs_slow = 0;
79}
80
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 81static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 82{
45 struct gfs2_sbd *sdp; 83 struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
96 return sdp; 134 return sdp;
97} 135}
98 136
99static void init_vfs(struct super_block *sb, unsigned noatime) 137
138/**
139 * gfs2_check_sb - Check superblock
140 * @sdp: the filesystem
141 * @sb: The superblock
142 * @silent: Don't print a message if the check fails
143 *
144 * Checks the version code of the FS is one that we understand how to
145 * read and that the sizes of the various on-disk structures have not
146 * changed.
147 */
148
149static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
100{ 150{
101 struct gfs2_sbd *sdp = sb->s_fs_info; 151 unsigned int x;
102 152
103 sb->s_magic = GFS2_MAGIC; 153 if (sb->sb_magic != GFS2_MAGIC ||
104 sb->s_op = &gfs2_super_ops; 154 sb->sb_type != GFS2_METATYPE_SB) {
105 sb->s_export_op = &gfs2_export_ops; 155 if (!silent)
106 sb->s_time_gran = 1; 156 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
107 sb->s_maxbytes = MAX_LFS_FILESIZE; 157 return -EINVAL;
158 }
159
160 /* If format numbers match exactly, we're done. */
161
162 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
163 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
164 return 0;
165
166 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
167 for (x = 0; gfs2_old_fs_formats[x]; x++)
168 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
169 break;
170
171 if (!gfs2_old_fs_formats[x]) {
172 printk(KERN_WARNING
173 "GFS2: code version (%u, %u) is incompatible "
174 "with ondisk format (%u, %u)\n",
175 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
176 sb->sb_fs_format, sb->sb_multihost_format);
177 printk(KERN_WARNING
178 "GFS2: I don't know how to upgrade this FS\n");
179 return -EINVAL;
180 }
181 }
182
183 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
184 for (x = 0; gfs2_old_multihost_formats[x]; x++)
185 if (gfs2_old_multihost_formats[x] ==
186 sb->sb_multihost_format)
187 break;
188
189 if (!gfs2_old_multihost_formats[x]) {
190 printk(KERN_WARNING
191 "GFS2: code version (%u, %u) is incompatible "
192 "with ondisk format (%u, %u)\n",
193 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
194 sb->sb_fs_format, sb->sb_multihost_format);
195 printk(KERN_WARNING
196 "GFS2: I don't know how to upgrade this FS\n");
197 return -EINVAL;
198 }
199 }
200
201 if (!sdp->sd_args.ar_upgrade) {
202 printk(KERN_WARNING
203 "GFS2: code version (%u, %u) is incompatible "
204 "with ondisk format (%u, %u)\n",
205 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
206 sb->sb_fs_format, sb->sb_multihost_format);
207 printk(KERN_INFO
208 "GFS2: Use the \"upgrade\" mount option to upgrade "
209 "the FS\n");
210 printk(KERN_INFO "GFS2: See the manual for more details\n");
211 return -EINVAL;
212 }
213
214 return 0;
215}
216
217static void end_bio_io_page(struct bio *bio, int error)
218{
219 struct page *page = bio->bi_private;
108 220
109 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) 221 if (!error)
110 set_bit(noatime, &sdp->sd_flags); 222 SetPageUptodate(page);
223 else
224 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
225 unlock_page(page);
226}
227
228static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
229{
230 const struct gfs2_sb *str = buf;
231
232 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
233 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
234 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
235 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
236 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
237 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
238 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
239 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
240 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
241 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
242 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
243
244 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
245 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
246}
247
248/**
249 * gfs2_read_super - Read the gfs2 super block from disk
250 * @sdp: The GFS2 super block
251 * @sector: The location of the super block
252 * @error: The error code to return
253 *
254 * This uses the bio functions to read the super block from disk
255 * because we want to be 100% sure that we never read cached data.
256 * A super block is read twice only during each GFS2 mount and is
257 * never written to by the filesystem. The first time its read no
258 * locks are held, and the only details which are looked at are those
259 * relating to the locking protocol. Once locking is up and working,
260 * the sb is read again under the lock to establish the location of
261 * the master directory (contains pointers to journals etc) and the
262 * root directory.
263 *
264 * Returns: 0 on success or error
265 */
266
267static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
268{
269 struct super_block *sb = sdp->sd_vfs;
270 struct gfs2_sb *p;
271 struct page *page;
272 struct bio *bio;
273
274 page = alloc_page(GFP_NOFS);
275 if (unlikely(!page))
276 return -ENOBUFS;
277
278 ClearPageUptodate(page);
279 ClearPageDirty(page);
280 lock_page(page);
281
282 bio = bio_alloc(GFP_NOFS, 1);
283 if (unlikely(!bio)) {
284 __free_page(page);
285 return -ENOBUFS;
286 }
111 287
112 /* Don't let the VFS update atimes. GFS2 handles this itself. */ 288 bio->bi_sector = sector * (sb->s_blocksize >> 9);
113 sb->s_flags |= MS_NOATIME | MS_NODIRATIME; 289 bio->bi_bdev = sb->s_bdev;
290 bio_add_page(bio, page, PAGE_SIZE, 0);
291
292 bio->bi_end_io = end_bio_io_page;
293 bio->bi_private = page;
294 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
295 wait_on_page_locked(page);
296 bio_put(bio);
297 if (!PageUptodate(page)) {
298 __free_page(page);
299 return -EIO;
300 }
301 p = kmap(page);
302 gfs2_sb_in(&sdp->sd_sb, p);
303 kunmap(page);
304 __free_page(page);
305 return 0;
306}
307/**
308 * gfs2_read_sb - Read super block
309 * @sdp: The GFS2 superblock
310 * @gl: the glock for the superblock (assumed to be held)
311 * @silent: Don't print message if mount fails
312 *
313 */
314
315static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
316{
317 u32 hash_blocks, ind_blocks, leaf_blocks;
318 u32 tmp_blocks;
319 unsigned int x;
320 int error;
321
322 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
323 if (error) {
324 if (!silent)
325 fs_err(sdp, "can't read superblock\n");
326 return error;
327 }
328
329 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
330 if (error)
331 return error;
332
333 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
334 GFS2_BASIC_BLOCK_SHIFT;
335 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
336 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
337 sizeof(struct gfs2_dinode)) / sizeof(u64);
338 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
339 sizeof(struct gfs2_meta_header)) / sizeof(u64);
340 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
341 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
342 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
343 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
344 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
345 sizeof(struct gfs2_meta_header)) /
346 sizeof(struct gfs2_quota_change);
347
348 /* Compute maximum reservation required to add a entry to a directory */
349
350 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
351 sdp->sd_jbsize);
352
353 ind_blocks = 0;
354 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
355 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
356 ind_blocks += tmp_blocks;
357 }
358
359 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
360
361 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
362
363 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
364 sizeof(struct gfs2_dinode);
365 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
366 for (x = 2;; x++) {
367 u64 space, d;
368 u32 m;
369
370 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
371 d = space;
372 m = do_div(d, sdp->sd_inptrs);
373
374 if (d != sdp->sd_heightsize[x - 1] || m)
375 break;
376 sdp->sd_heightsize[x] = space;
377 }
378 sdp->sd_max_height = x;
379 sdp->sd_heightsize[x] = ~0;
380 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
381
382 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
383 sizeof(struct gfs2_dinode);
384 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
385 for (x = 2;; x++) {
386 u64 space, d;
387 u32 m;
388
389 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
390 d = space;
391 m = do_div(d, sdp->sd_inptrs);
392
393 if (d != sdp->sd_jheightsize[x - 1] || m)
394 break;
395 sdp->sd_jheightsize[x] = space;
396 }
397 sdp->sd_max_jheight = x;
398 sdp->sd_jheightsize[x] = ~0;
399 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
400
401 return 0;
114} 402}
115 403
116static int init_names(struct gfs2_sbd *sdp, int silent) 404static int init_names(struct gfs2_sbd *sdp, int silent)
@@ -224,51 +512,59 @@ fail:
224 return error; 512 return error;
225} 513}
226 514
227static inline struct inode *gfs2_lookup_root(struct super_block *sb, 515static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
228 u64 no_addr) 516 u64 no_addr, const char *name)
229{ 517{
230 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 518 struct gfs2_sbd *sdp = sb->s_fs_info;
519 struct dentry *dentry;
520 struct inode *inode;
521
522 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
523 if (IS_ERR(inode)) {
524 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
525 return PTR_ERR(inode);
526 }
527 dentry = d_alloc_root(inode);
528 if (!dentry) {
529 fs_err(sdp, "can't alloc %s dentry\n", name);
530 iput(inode);
531 return -ENOMEM;
532 }
533 dentry->d_op = &gfs2_dops;
534 *dptr = dentry;
535 return 0;
231} 536}
232 537
233static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 538static int init_sb(struct gfs2_sbd *sdp, int silent)
234{ 539{
235 struct super_block *sb = sdp->sd_vfs; 540 struct super_block *sb = sdp->sd_vfs;
236 struct gfs2_holder sb_gh; 541 struct gfs2_holder sb_gh;
237 u64 no_addr; 542 u64 no_addr;
238 struct inode *inode; 543 int ret;
239 int error = 0;
240 544
241 if (undo) { 545 ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
242 if (sb->s_root) { 546 LM_ST_SHARED, 0, &sb_gh);
243 dput(sb->s_root); 547 if (ret) {
244 sb->s_root = NULL; 548 fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
245 } 549 return ret;
246 return 0;
247 } 550 }
248 551
249 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, 552 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
250 LM_ST_SHARED, 0, &sb_gh); 553 if (ret) {
251 if (error) { 554 fs_err(sdp, "can't read superblock: %d\n", ret);
252 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
253 return error;
254 }
255
256 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
257 if (error) {
258 fs_err(sdp, "can't read superblock: %d\n", error);
259 goto out; 555 goto out;
260 } 556 }
261 557
262 /* Set up the buffer cache and SB for real */ 558 /* Set up the buffer cache and SB for real */
263 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 559 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
264 error = -EINVAL; 560 ret = -EINVAL;
265 fs_err(sdp, "FS block size (%u) is too small for device " 561 fs_err(sdp, "FS block size (%u) is too small for device "
266 "block size (%u)\n", 562 "block size (%u)\n",
267 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 563 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
268 goto out; 564 goto out;
269 } 565 }
270 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 566 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
271 error = -EINVAL; 567 ret = -EINVAL;
272 fs_err(sdp, "FS block size (%u) is too big for machine " 568 fs_err(sdp, "FS block size (%u) is too big for machine "
273 "page size (%u)\n", 569 "page size (%u)\n",
274 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); 570 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
278 574
279 /* Get the root inode */ 575 /* Get the root inode */
280 no_addr = sdp->sd_sb.sb_root_dir.no_addr; 576 no_addr = sdp->sd_sb.sb_root_dir.no_addr;
281 if (sb->s_type == &gfs2meta_fs_type) 577 ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
282 no_addr = sdp->sd_sb.sb_master_dir.no_addr; 578 if (ret)
283 inode = gfs2_lookup_root(sb, no_addr);
284 if (IS_ERR(inode)) {
285 error = PTR_ERR(inode);
286 fs_err(sdp, "can't read in root inode: %d\n", error);
287 goto out; 579 goto out;
288 }
289 580
290 sb->s_root = d_alloc_root(inode); 581 /* Get the master inode */
291 if (!sb->s_root) { 582 no_addr = sdp->sd_sb.sb_master_dir.no_addr;
292 fs_err(sdp, "can't get root dentry\n"); 583 ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
293 error = -ENOMEM; 584 if (ret) {
294 iput(inode); 585 dput(sdp->sd_root_dir);
295 } else 586 goto out;
296 sb->s_root->d_op = &gfs2_dops; 587 }
297 588 sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
298out: 589out:
299 gfs2_glock_dq_uninit(&sb_gh); 590 gfs2_glock_dq_uninit(&sb_gh);
300 return error; 591 return ret;
301} 592}
302 593
303/** 594/**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
372 663
373static int init_journal(struct gfs2_sbd *sdp, int undo) 664static int init_journal(struct gfs2_sbd *sdp, int undo)
374{ 665{
666 struct inode *master = sdp->sd_master_dir->d_inode;
375 struct gfs2_holder ji_gh; 667 struct gfs2_holder ji_gh;
376 struct task_struct *p; 668 struct task_struct *p;
377 struct gfs2_inode *ip; 669 struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
383 goto fail_recoverd; 675 goto fail_recoverd;
384 } 676 }
385 677
386 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); 678 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
387 if (IS_ERR(sdp->sd_jindex)) { 679 if (IS_ERR(sdp->sd_jindex)) {
388 fs_err(sdp, "can't lookup journal index: %d\n", error); 680 fs_err(sdp, "can't lookup journal index: %d\n", error);
389 return PTR_ERR(sdp->sd_jindex); 681 return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
506{ 798{
507 int error = 0; 799 int error = 0;
508 struct gfs2_inode *ip; 800 struct gfs2_inode *ip;
509 struct inode *inode; 801 struct inode *master = sdp->sd_master_dir->d_inode;
510 802
511 if (undo) 803 if (undo)
512 goto fail_qinode; 804 goto fail_qinode;
513 805
514 inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
515 if (IS_ERR(inode)) {
516 error = PTR_ERR(inode);
517 fs_err(sdp, "can't read in master directory: %d\n", error);
518 goto fail;
519 }
520 sdp->sd_master_dir = inode;
521
522 error = init_journal(sdp, undo); 806 error = init_journal(sdp, undo);
523 if (error) 807 if (error)
524 goto fail_master; 808 goto fail;
525 809
526 /* Read in the master inode number inode */ 810 /* Read in the master inode number inode */
527 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); 811 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
528 if (IS_ERR(sdp->sd_inum_inode)) { 812 if (IS_ERR(sdp->sd_inum_inode)) {
529 error = PTR_ERR(sdp->sd_inum_inode); 813 error = PTR_ERR(sdp->sd_inum_inode);
530 fs_err(sdp, "can't read in inum inode: %d\n", error); 814 fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
533 817
534 818
535 /* Read in the master statfs inode */ 819 /* Read in the master statfs inode */
536 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); 820 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
537 if (IS_ERR(sdp->sd_statfs_inode)) { 821 if (IS_ERR(sdp->sd_statfs_inode)) {
538 error = PTR_ERR(sdp->sd_statfs_inode); 822 error = PTR_ERR(sdp->sd_statfs_inode);
539 fs_err(sdp, "can't read in statfs inode: %d\n", error); 823 fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
541 } 825 }
542 826
543 /* Read in the resource index inode */ 827 /* Read in the resource index inode */
544 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); 828 sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
545 if (IS_ERR(sdp->sd_rindex)) { 829 if (IS_ERR(sdp->sd_rindex)) {
546 error = PTR_ERR(sdp->sd_rindex); 830 error = PTR_ERR(sdp->sd_rindex);
547 fs_err(sdp, "can't get resource index inode: %d\n", error); 831 fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
552 sdp->sd_rindex_uptodate = 0; 836 sdp->sd_rindex_uptodate = 0;
553 837
554 /* Read in the quota inode */ 838 /* Read in the quota inode */
555 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); 839 sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
556 if (IS_ERR(sdp->sd_quota_inode)) { 840 if (IS_ERR(sdp->sd_quota_inode)) {
557 error = PTR_ERR(sdp->sd_quota_inode); 841 error = PTR_ERR(sdp->sd_quota_inode);
558 fs_err(sdp, "can't get quota file inode: %d\n", error); 842 fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
571 iput(sdp->sd_inum_inode); 855 iput(sdp->sd_inum_inode);
572fail_journal: 856fail_journal:
573 init_journal(sdp, UNDO); 857 init_journal(sdp, UNDO);
574fail_master:
575 iput(sdp->sd_master_dir);
576fail: 858fail:
577 return error; 859 return error;
578} 860}
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
583 char buf[30]; 865 char buf[30];
584 int error = 0; 866 int error = 0;
585 struct gfs2_inode *ip; 867 struct gfs2_inode *ip;
868 struct inode *master = sdp->sd_master_dir->d_inode;
586 869
587 if (sdp->sd_args.ar_spectator) 870 if (sdp->sd_args.ar_spectator)
588 return 0; 871 return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
590 if (undo) 873 if (undo)
591 goto fail_qc_gh; 874 goto fail_qc_gh;
592 875
593 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); 876 pn = gfs2_lookup_simple(master, "per_node");
594 if (IS_ERR(pn)) { 877 if (IS_ERR(pn)) {
595 error = PTR_ERR(pn); 878 error = PTR_ERR(pn);
596 fs_err(sdp, "can't find per_node directory: %d\n", error); 879 fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
800 goto fail; 1083 goto fail;
801 } 1084 }
802 1085
803 init_vfs(sb, SDF_NOATIME); 1086 sb->s_magic = GFS2_MAGIC;
1087 sb->s_op = &gfs2_super_ops;
1088 sb->s_export_op = &gfs2_export_ops;
1089 sb->s_time_gran = 1;
1090 sb->s_maxbytes = MAX_LFS_FILESIZE;
804 1091
805 /* Set up the buffer cache and fill in some fake block size values 1092 /* Set up the buffer cache and fill in some fake block size values
806 to allow us to read-in the on-disk superblock. */ 1093 to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
828 if (error) 1115 if (error)
829 goto fail_lm; 1116 goto fail_lm;
830 1117
831 error = init_sb(sdp, silent, DO); 1118 error = init_sb(sdp, silent);
832 if (error) 1119 if (error)
833 goto fail_locking; 1120 goto fail_locking;
834 1121
@@ -869,7 +1156,11 @@ fail_per_node:
869fail_inodes: 1156fail_inodes:
870 init_inodes(sdp, UNDO); 1157 init_inodes(sdp, UNDO);
871fail_sb: 1158fail_sb:
872 init_sb(sdp, 0, UNDO); 1159 if (sdp->sd_root_dir)
1160 dput(sdp->sd_root_dir);
1161 if (sdp->sd_master_dir)
1162 dput(sdp->sd_master_dir);
1163 sb->s_root = NULL;
873fail_locking: 1164fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 1165 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 1166fail_lm:
@@ -887,151 +1178,63 @@ fail:
887} 1178}
888 1179
889static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1180static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
890 const char *dev_name, void *data, struct vfsmount *mnt) 1181 const char *dev_name, void *data, struct vfsmount *mnt)
891{ 1182{
892 struct super_block *sb; 1183 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
893 struct gfs2_sbd *sdp;
894 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
895 if (error)
896 goto out;
897 sb = mnt->mnt_sb;
898 sdp = sb->s_fs_info;
899 sdp->sd_gfs2mnt = mnt;
900out:
901 return error;
902} 1184}
903 1185
904static int fill_super_meta(struct super_block *sb, struct super_block *new, 1186static struct super_block *get_gfs2_sb(const char *dev_name)
905 void *data, int silent)
906{ 1187{
907 struct gfs2_sbd *sdp = sb->s_fs_info; 1188 struct super_block *sb;
908 struct inode *inode;
909 int error = 0;
910
911 new->s_fs_info = sdp;
912 sdp->sd_vfs_meta = sb;
913
914 init_vfs(new, SDF_NOATIME);
915
916 /* Get the master inode */
917 inode = igrab(sdp->sd_master_dir);
918
919 new->s_root = d_alloc_root(inode);
920 if (!new->s_root) {
921 fs_err(sdp, "can't get root dentry\n");
922 error = -ENOMEM;
923 iput(inode);
924 } else
925 new->s_root->d_op = &gfs2_dops;
926
927 return error;
928}
929
930static int set_bdev_super(struct super_block *s, void *data)
931{
932 s->s_bdev = data;
933 s->s_dev = s->s_bdev->bd_dev;
934 return 0;
935}
936
937static int test_bdev_super(struct super_block *s, void *data)
938{
939 return s->s_bdev == data;
940}
941
942static struct super_block* get_gfs2_sb(const char *dev_name)
943{
944 struct kstat stat;
945 struct nameidata nd; 1189 struct nameidata nd;
946 struct super_block *sb = NULL, *s;
947 int error; 1190 int error;
948 1191
949 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 1192 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
950 if (error) { 1193 if (error) {
951 printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", 1194 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
952 dev_name); 1195 dev_name, error);
953 goto out; 1196 return NULL;
954 }
955 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
956
957 list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
958 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
959 (S_ISDIR(stat.mode) &&
960 s == nd.path.dentry->d_inode->i_sb)) {
961 sb = s;
962 goto free_nd;
963 }
964 } 1197 }
965 1198 sb = nd.path.dentry->d_inode->i_sb;
966 printk(KERN_WARNING "GFS2: Unrecognized block device or " 1199 if (sb && (sb->s_type == &gfs2_fs_type))
967 "mount point %s\n", dev_name); 1200 atomic_inc(&sb->s_active);
968 1201 else
969free_nd: 1202 sb = NULL;
970 path_put(&nd.path); 1203 path_put(&nd.path);
971out:
972 return sb; 1204 return sb;
973} 1205}
974 1206
975static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1207static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
976 const char *dev_name, void *data, struct vfsmount *mnt) 1208 const char *dev_name, void *data, struct vfsmount *mnt)
977{ 1209{
978 int error = 0; 1210 struct super_block *sb = NULL;
979 struct super_block *sb = NULL, *new;
980 struct gfs2_sbd *sdp; 1211 struct gfs2_sbd *sdp;
981 1212
982 sb = get_gfs2_sb(dev_name); 1213 sb = get_gfs2_sb(dev_name);
983 if (!sb) { 1214 if (!sb) {
984 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1215 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
985 error = -ENOENT; 1216 return -ENOENT;
986 goto error;
987 } 1217 }
988 sdp = sb->s_fs_info; 1218 sdp = sb->s_fs_info;
989 if (sdp->sd_vfs_meta) { 1219 mnt->mnt_sb = sb;
990 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 1220 mnt->mnt_root = dget(sdp->sd_master_dir);
991 error = -EBUSY; 1221 return 0;
992 goto error;
993 }
994 down(&sb->s_bdev->bd_mount_sem);
995 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
996 up(&sb->s_bdev->bd_mount_sem);
997 if (IS_ERR(new)) {
998 error = PTR_ERR(new);
999 goto error;
1000 }
1001 new->s_flags = flags;
1002 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
1003 sb_set_blocksize(new, sb->s_blocksize);
1004 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
1005 if (error) {
1006 up_write(&new->s_umount);
1007 deactivate_super(new);
1008 goto error;
1009 }
1010
1011 new->s_flags |= MS_ACTIVE;
1012
1013 /* Grab a reference to the gfs2 mount point */
1014 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
1015 return simple_set_mnt(mnt, new);
1016error:
1017 return error;
1018} 1222}
1019 1223
1020static void gfs2_kill_sb(struct super_block *sb) 1224static void gfs2_kill_sb(struct super_block *sb)
1021{ 1225{
1022 if (sb->s_fs_info) { 1226 struct gfs2_sbd *sdp = sb->s_fs_info;
1023 gfs2_delete_debugfs_file(sb->s_fs_info); 1227 if (sdp) {
1024 gfs2_meta_syncfs(sb->s_fs_info); 1228 gfs2_meta_syncfs(sdp);
1229 dput(sdp->sd_root_dir);
1230 dput(sdp->sd_master_dir);
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1025 } 1233 }
1234 shrink_dcache_sb(sb);
1026 kill_block_super(sb); 1235 kill_block_super(sb);
1027} 1236 if (sdp)
1028 1237 gfs2_delete_debugfs_file(sdp);
1029static void gfs2_kill_sb_meta(struct super_block *sb)
1030{
1031 struct gfs2_sbd *sdp = sb->s_fs_info;
1032 generic_shutdown_super(sb);
1033 sdp->sd_vfs_meta = NULL;
1034 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
1035} 1238}
1036 1239
1037struct file_system_type gfs2_fs_type = { 1240struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
1046 .name = "gfs2meta", 1249 .name = "gfs2meta",
1047 .fs_flags = FS_REQUIRES_DEV, 1250 .fs_flags = FS_REQUIRES_DEV,
1048 .get_sb = gfs2_get_sb_meta, 1251 .get_sb = gfs2_get_sb_meta,
1049 .kill_sb = gfs2_kill_sb_meta,
1050 .owner = THIS_MODULE, 1252 .owner = THIS_MODULE,
1051}; 1253};
1052 1254
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a778..534e1e2c65ca 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
161 161
162 error = gfs2_glock_nq_m(2, ghs); 162 error = gfs2_glock_nq(ghs); /* parent */
163 if (error) 163 if (error)
164 goto out; 164 goto out_parent;
165
166 error = gfs2_glock_nq(ghs + 1); /* child */
167 if (error)
168 goto out_child;
165 169
166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 170 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 171 if (error)
@@ -245,8 +249,10 @@ out_alloc:
245 if (alloc_required) 249 if (alloc_required)
246 gfs2_alloc_put(dip); 250 gfs2_alloc_put(dip);
247out_gunlock: 251out_gunlock:
248 gfs2_glock_dq_m(2, ghs); 252 gfs2_glock_dq(ghs + 1);
249out: 253out_child:
254 gfs2_glock_dq(ghs);
255out_parent:
250 gfs2_holder_uninit(ghs); 256 gfs2_holder_uninit(ghs);
251 gfs2_holder_uninit(ghs + 1); 257 gfs2_holder_uninit(ghs + 1);
252 if (!error) { 258 if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
302 308
303 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 309 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
304 if (error) 310 if (error)
305 goto out_rgrp; 311 goto out_gunlock;
306 312
307 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 313 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
308 if (error) 314 if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
316 322
317out_end_trans: 323out_end_trans:
318 gfs2_trans_end(sdp); 324 gfs2_trans_end(sdp);
325out_gunlock:
319 gfs2_glock_dq(ghs + 2); 326 gfs2_glock_dq(ghs + 2);
320out_rgrp: 327out_rgrp:
321 gfs2_holder_uninit(ghs + 2); 328 gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
485 struct gfs2_holder ri_gh; 492 struct gfs2_holder ri_gh;
486 int error; 493 int error;
487 494
488
489 error = gfs2_rindex_hold(sdp, &ri_gh); 495 error = gfs2_rindex_hold(sdp, &ri_gh);
490 if (error) 496 if (error)
491 return error; 497 return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
495 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 501 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
496 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 502 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
497 503
498 error = gfs2_glock_nq_m(3, ghs); 504 error = gfs2_glock_nq(ghs); /* parent */
499 if (error) 505 if (error)
500 goto out; 506 goto out_parent;
507
508 error = gfs2_glock_nq(ghs + 1); /* child */
509 if (error)
510 goto out_child;
511
512 error = gfs2_glock_nq(ghs + 2); /* rgrp */
513 if (error)
514 goto out_rgrp;
501 515
502 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 516 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
503 if (error) 517 if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
523 gfs2_trans_end(sdp); 537 gfs2_trans_end(sdp);
524 538
525out_gunlock: 539out_gunlock:
526 gfs2_glock_dq_m(3, ghs); 540 gfs2_glock_dq(ghs + 2);
527out: 541out_rgrp:
528 gfs2_holder_uninit(ghs);
529 gfs2_holder_uninit(ghs + 1);
530 gfs2_holder_uninit(ghs + 2); 542 gfs2_holder_uninit(ghs + 2);
543 gfs2_glock_dq(ghs + 1);
544out_child:
545 gfs2_holder_uninit(ghs + 1);
546 gfs2_glock_dq(ghs);
547out_parent:
548 gfs2_holder_uninit(ghs);
531 gfs2_glock_dq_uninit(&ri_gh); 549 gfs2_glock_dq_uninit(&ri_gh);
532 return error; 550 return error;
533} 551}
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
571 return 0; 589 return 0;
572} 590}
573 591
592/*
593 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
594 * @this: move this
595 * @to: to here
596 *
597 * Follow @to back to the root and make sure we don't encounter @this
598 * Assumes we already hold the rename lock.
599 *
600 * Returns: errno
601 */
602
603static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
604{
605 struct inode *dir = &to->i_inode;
606 struct super_block *sb = dir->i_sb;
607 struct inode *tmp;
608 struct qstr dotdot;
609 int error = 0;
610
611 gfs2_str2qstr(&dotdot, "..");
612
613 igrab(dir);
614
615 for (;;) {
616 if (dir == &this->i_inode) {
617 error = -EINVAL;
618 break;
619 }
620 if (dir == sb->s_root->d_inode) {
621 error = 0;
622 break;
623 }
624
625 tmp = gfs2_lookupi(dir, &dotdot, 1);
626 if (IS_ERR(tmp)) {
627 error = PTR_ERR(tmp);
628 break;
629 }
630
631 iput(dir);
632 dir = tmp;
633 }
634
635 iput(dir);
636
637 return error;
638}
639
574/** 640/**
575 * gfs2_rename - Rename a file 641 * gfs2_rename - Rename a file
576 * @odir: Parent directory of old file name 642 * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
589 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 655 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
590 struct gfs2_inode *nip = NULL; 656 struct gfs2_inode *nip = NULL;
591 struct gfs2_sbd *sdp = GFS2_SB(odir); 657 struct gfs2_sbd *sdp = GFS2_SB(odir);
592 struct gfs2_holder ghs[5], r_gh; 658 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
593 struct gfs2_rgrpd *nrgd; 659 struct gfs2_rgrpd *nrgd;
594 unsigned int num_gh; 660 unsigned int num_gh;
595 int dir_rename = 0; 661 int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
603 return 0; 669 return 0;
604 } 670 }
605 671
606 /* Make sure we aren't trying to move a dirctory into it's subdir */
607
608 if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
609 dir_rename = 1;
610 672
611 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, 673 if (odip != ndip) {
612 &r_gh); 674 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
675 0, &r_gh);
613 if (error) 676 if (error)
614 goto out; 677 goto out;
615 678
616 error = gfs2_ok_to_move(ip, ndip); 679 if (S_ISDIR(ip->i_inode.i_mode)) {
617 if (error) 680 dir_rename = 1;
618 goto out_gunlock_r; 681 /* don't move a dirctory into it's subdir */
682 error = gfs2_ok_to_move(ip, ndip);
683 if (error)
684 goto out_gunlock_r;
685 }
619 } 686 }
620 687
621 num_gh = 1; 688 num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
639 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 706 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
640 } 707 }
641 708
642 error = gfs2_glock_nq_m(num_gh, ghs); 709 for (x = 0; x < num_gh; x++) {
643 if (error) 710 error = gfs2_glock_nq(ghs + x);
644 goto out_uninit; 711 if (error)
712 goto out_gunlock;
713 }
645 714
646 /* Check out the old directory */ 715 /* Check out the old directory */
647 716
@@ -804,12 +873,12 @@ out_alloc:
804 if (alloc_required) 873 if (alloc_required)
805 gfs2_alloc_put(ndip); 874 gfs2_alloc_put(ndip);
806out_gunlock: 875out_gunlock:
807 gfs2_glock_dq_m(num_gh, ghs); 876 while (x--) {
808out_uninit: 877 gfs2_glock_dq(ghs + x);
809 for (x = 0; x < num_gh; x++)
810 gfs2_holder_uninit(ghs + x); 878 gfs2_holder_uninit(ghs + x);
879 }
811out_gunlock_r: 880out_gunlock_r:
812 if (dir_rename) 881 if (r_gh.gh_gl)
813 gfs2_glock_dq_uninit(&r_gh); 882 gfs2_glock_dq_uninit(&r_gh);
814out: 883out:
815 return error; 884 return error;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/time.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
25#include "incore.h" 26#include "incore.h"
@@ -38,6 +39,7 @@
38#include "dir.h" 39#include "dir.h"
39#include "eattr.h" 40#include "eattr.h"
40#include "bmap.h" 41#include "bmap.h"
42#include "meta_io.h"
41 43
42/** 44/**
43 * gfs2_write_inode - Make sure the inode is stable on the disk 45 * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
50static int gfs2_write_inode(struct inode *inode, int sync) 52static int gfs2_write_inode(struct inode *inode, int sync)
51{ 53{
52 struct gfs2_inode *ip = GFS2_I(inode); 54 struct gfs2_inode *ip = GFS2_I(inode);
53 55 struct gfs2_sbd *sdp = GFS2_SB(inode);
54 /* Check this is a "normal" inode */ 56 struct gfs2_holder gh;
55 if (test_bit(GIF_USER, &ip->i_flags)) { 57 struct buffer_head *bh;
56 if (current->flags & PF_MEMALLOC) 58 struct timespec atime;
57 return 0; 59 struct gfs2_dinode *di;
58 if (sync) 60 int ret = 0;
59 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 61
62 /* Check this is a "normal" inode, etc */
63 if (!test_bit(GIF_USER, &ip->i_flags) ||
64 (current->flags & PF_MEMALLOC))
65 return 0;
66 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
67 if (ret)
68 goto do_flush;
69 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
70 if (ret)
71 goto do_unlock;
72 ret = gfs2_meta_inode_buffer(ip, &bh);
73 if (ret == 0) {
74 di = (struct gfs2_dinode *)bh->b_data;
75 atime.tv_sec = be64_to_cpu(di->di_atime);
76 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
77 if (timespec_compare(&inode->i_atime, &atime) > 0) {
78 gfs2_trans_add_bh(ip->i_gl, bh, 1);
79 gfs2_dinode_out(ip, bh->b_data);
80 }
81 brelse(bh);
60 } 82 }
83 gfs2_trans_end(sdp);
84do_unlock:
85 gfs2_glock_dq_uninit(&gh);
86do_flush:
87 if (sync != 0)
88 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
89 return ret;
90}
61 91
62 return 0; 92/**
93 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
94 * @sdp: the filesystem
95 *
96 * Returns: errno
97 */
98
99static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
100{
101 struct gfs2_holder t_gh;
102 int error;
103
104 gfs2_quota_sync(sdp);
105 gfs2_statfs_sync(sdp);
106
107 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
108 &t_gh);
109 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
110 return error;
111
112 gfs2_meta_syncfs(sdp);
113 gfs2_log_shutdown(sdp);
114
115 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
116
117 if (t_gh.gh_gl)
118 gfs2_glock_dq_uninit(&t_gh);
119
120 gfs2_quota_cleanup(sdp);
121
122 return error;
63} 123}
64 124
65/** 125/**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
73 struct gfs2_sbd *sdp = sb->s_fs_info; 133 struct gfs2_sbd *sdp = sb->s_fs_info;
74 int error; 134 int error;
75 135
76 if (!sdp)
77 return;
78
79 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
80 return; /* Nothing to do */
81
82 /* Unfreeze the filesystem, if we need to */ 136 /* Unfreeze the filesystem, if we need to */
83 137
84 mutex_lock(&sdp->sd_freeze_lock); 138 mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
101 155
102 /* Release stuff */ 156 /* Release stuff */
103 157
104 iput(sdp->sd_master_dir);
105 iput(sdp->sd_jindex); 158 iput(sdp->sd_jindex);
106 iput(sdp->sd_inum_inode); 159 iput(sdp->sd_inum_inode);
107 iput(sdp->sd_statfs_inode); 160 iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
152 * 205 *
153 * Flushes the log to disk. 206 * Flushes the log to disk.
154 */ 207 */
208
155static int gfs2_sync_fs(struct super_block *sb, int wait) 209static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 210{
157 sb->s_dirt = 0; 211 sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
270 } 324 }
271 } 325 }
272 326
273 if (*flags & (MS_NOATIME | MS_NODIRATIME))
274 set_bit(SDF_NOATIME, &sdp->sd_flags);
275 else
276 clear_bit(SDF_NOATIME, &sdp->sd_flags);
277
278 /* Don't let the VFS update atimes. GFS2 handles this itself. */
279 *flags |= MS_NOATIME | MS_NODIRATIME;
280
281 return error; 327 return error;
282} 328}
283 329
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
295 * inode's blocks, or alternatively pass the baton on to another 341 * inode's blocks, or alternatively pass the baton on to another
296 * node for later deallocation. 342 * node for later deallocation.
297 */ 343 */
344
298static void gfs2_drop_inode(struct inode *inode) 345static void gfs2_drop_inode(struct inode *inode)
299{ 346{
300 struct gfs2_inode *ip = GFS2_I(inode); 347 struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
333 } 380 }
334} 381}
335 382
383static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
384{
385 do {
386 if (d1 == d2)
387 return 1;
388 d1 = d1->d_parent;
389 } while (!IS_ROOT(d1));
390 return 0;
391}
392
336/** 393/**
337 * gfs2_show_options - Show mount options for /proc/mounts 394 * gfs2_show_options - Show mount options for /proc/mounts
338 * @s: seq_file structure 395 * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
346 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 403 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
347 struct gfs2_args *args = &sdp->sd_args; 404 struct gfs2_args *args = &sdp->sd_args;
348 405
406 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
407 seq_printf(s, ",meta");
349 if (args->ar_lockproto[0]) 408 if (args->ar_lockproto[0])
350 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 409 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
351 if (args->ar_locktable[0]) 410 if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
414 * conversion on the iopen lock, but we can change that later. This 473 * conversion on the iopen lock, but we can change that later. This
415 * is safe, just less efficient. 474 * is safe, just less efficient.
416 */ 475 */
476
417static void gfs2_delete_inode(struct inode *inode) 477static void gfs2_delete_inode(struct inode *inode)
418{ 478{
419 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 479 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
478 clear_inode(inode); 538 clear_inode(inode);
479} 539}
480 540
481
482
483static struct inode *gfs2_alloc_inode(struct super_block *sb) 541static struct inode *gfs2_alloc_inode(struct super_block *sb)
484{ 542{
485 struct gfs2_inode *ip; 543 struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc2..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
33#include "trans.h" 33#include "trans.h"
34#include "util.h" 34#include "util.h"
35 35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_demote_secs = 300;
55 gt->gt_incore_log_blocks = 1024;
56 gt->gt_log_flush_secs = 60;
57 gt->gt_recoverd_secs = 60;
58 gt->gt_logd_secs = 1;
59 gt->gt_quotad_secs = 5;
60 gt->gt_quota_simul_sync = 64;
61 gt->gt_quota_warn_period = 10;
62 gt->gt_quota_scale_num = 1;
63 gt->gt_quota_scale_den = 1;
64 gt->gt_quota_cache_secs = 300;
65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_stall_secs = 600;
70 gt->gt_complain_secs = 10;
71 gt->gt_statfs_quantum = 30;
72 gt->gt_statfs_slow = 0;
73}
74
75/**
76 * gfs2_check_sb - Check superblock
77 * @sdp: the filesystem
78 * @sb: The superblock
79 * @silent: Don't print a message if the check fails
80 *
81 * Checks the version code of the FS is one that we understand how to
82 * read and that the sizes of the various on-disk structures have not
83 * changed.
84 */
85
86int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
87{
88 unsigned int x;
89
90 if (sb->sb_magic != GFS2_MAGIC ||
91 sb->sb_type != GFS2_METATYPE_SB) {
92 if (!silent)
93 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
94 return -EINVAL;
95 }
96
97 /* If format numbers match exactly, we're done. */
98
99 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
100 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
101 return 0;
102
103 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
104 for (x = 0; gfs2_old_fs_formats[x]; x++)
105 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
106 break;
107
108 if (!gfs2_old_fs_formats[x]) {
109 printk(KERN_WARNING
110 "GFS2: code version (%u, %u) is incompatible "
111 "with ondisk format (%u, %u)\n",
112 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
113 sb->sb_fs_format, sb->sb_multihost_format);
114 printk(KERN_WARNING
115 "GFS2: I don't know how to upgrade this FS\n");
116 return -EINVAL;
117 }
118 }
119
120 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
121 for (x = 0; gfs2_old_multihost_formats[x]; x++)
122 if (gfs2_old_multihost_formats[x] ==
123 sb->sb_multihost_format)
124 break;
125
126 if (!gfs2_old_multihost_formats[x]) {
127 printk(KERN_WARNING
128 "GFS2: code version (%u, %u) is incompatible "
129 "with ondisk format (%u, %u)\n",
130 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
131 sb->sb_fs_format, sb->sb_multihost_format);
132 printk(KERN_WARNING
133 "GFS2: I don't know how to upgrade this FS\n");
134 return -EINVAL;
135 }
136 }
137
138 if (!sdp->sd_args.ar_upgrade) {
139 printk(KERN_WARNING
140 "GFS2: code version (%u, %u) is incompatible "
141 "with ondisk format (%u, %u)\n",
142 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
143 sb->sb_fs_format, sb->sb_multihost_format);
144 printk(KERN_INFO
145 "GFS2: Use the \"upgrade\" mount option to upgrade "
146 "the FS\n");
147 printk(KERN_INFO "GFS2: See the manual for more details\n");
148 return -EINVAL;
149 }
150
151 return 0;
152}
153
154
155static void end_bio_io_page(struct bio *bio, int error)
156{
157 struct page *page = bio->bi_private;
158
159 if (!error)
160 SetPageUptodate(page);
161 else
162 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
163 unlock_page(page);
164}
165
166static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
167{
168 const struct gfs2_sb *str = buf;
169
170 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
171 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
172 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
173 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
174 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
175 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
176 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
177 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
178 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
179 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
180 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
181
182 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
183 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
184}
185
186/**
187 * gfs2_read_super - Read the gfs2 super block from disk
188 * @sdp: The GFS2 super block
189 * @sector: The location of the super block
190 * @error: The error code to return
191 *
192 * This uses the bio functions to read the super block from disk
193 * because we want to be 100% sure that we never read cached data.
194 * A super block is read twice only during each GFS2 mount and is
195 * never written to by the filesystem. The first time its read no
196 * locks are held, and the only details which are looked at are those
197 * relating to the locking protocol. Once locking is up and working,
198 * the sb is read again under the lock to establish the location of
199 * the master directory (contains pointers to journals etc) and the
200 * root directory.
201 *
202 * Returns: 0 on success or error
203 */
204
205int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
206{
207 struct super_block *sb = sdp->sd_vfs;
208 struct gfs2_sb *p;
209 struct page *page;
210 struct bio *bio;
211
212 page = alloc_page(GFP_NOFS);
213 if (unlikely(!page))
214 return -ENOBUFS;
215
216 ClearPageUptodate(page);
217 ClearPageDirty(page);
218 lock_page(page);
219
220 bio = bio_alloc(GFP_NOFS, 1);
221 if (unlikely(!bio)) {
222 __free_page(page);
223 return -ENOBUFS;
224 }
225
226 bio->bi_sector = sector * (sb->s_blocksize >> 9);
227 bio->bi_bdev = sb->s_bdev;
228 bio_add_page(bio, page, PAGE_SIZE, 0);
229
230 bio->bi_end_io = end_bio_io_page;
231 bio->bi_private = page;
232 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
233 wait_on_page_locked(page);
234 bio_put(bio);
235 if (!PageUptodate(page)) {
236 __free_page(page);
237 return -EIO;
238 }
239 p = kmap(page);
240 gfs2_sb_in(&sdp->sd_sb, p);
241 kunmap(page);
242 __free_page(page);
243 return 0;
244}
245
246/**
247 * gfs2_read_sb - Read super block
248 * @sdp: The GFS2 superblock
249 * @gl: the glock for the superblock (assumed to be held)
250 * @silent: Don't print message if mount fails
251 *
252 */
253
254int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
255{
256 u32 hash_blocks, ind_blocks, leaf_blocks;
257 u32 tmp_blocks;
258 unsigned int x;
259 int error;
260
261 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
262 if (error) {
263 if (!silent)
264 fs_err(sdp, "can't read superblock\n");
265 return error;
266 }
267
268 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
269 if (error)
270 return error;
271
272 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
273 GFS2_BASIC_BLOCK_SHIFT;
274 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
275 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
276 sizeof(struct gfs2_dinode)) / sizeof(u64);
277 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
278 sizeof(struct gfs2_meta_header)) / sizeof(u64);
279 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
280 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
281 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
282 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
283 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
284 sizeof(struct gfs2_meta_header)) /
285 sizeof(struct gfs2_quota_change);
286
287 /* Compute maximum reservation required to add a entry to a directory */
288
289 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
290 sdp->sd_jbsize);
291
292 ind_blocks = 0;
293 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
294 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
295 ind_blocks += tmp_blocks;
296 }
297
298 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
299
300 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
301
302 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
303 sizeof(struct gfs2_dinode);
304 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
305 for (x = 2;; x++) {
306 u64 space, d;
307 u32 m;
308
309 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
310 d = space;
311 m = do_div(d, sdp->sd_inptrs);
312
313 if (d != sdp->sd_heightsize[x - 1] || m)
314 break;
315 sdp->sd_heightsize[x] = space;
316 }
317 sdp->sd_max_height = x;
318 sdp->sd_heightsize[x] = ~0;
319 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
320
321 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
322 sizeof(struct gfs2_dinode);
323 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
324 for (x = 2;; x++) {
325 u64 space, d;
326 u32 m;
327
328 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
329 d = space;
330 m = do_div(d, sdp->sd_inptrs);
331
332 if (d != sdp->sd_jheightsize[x - 1] || m)
333 break;
334 sdp->sd_jheightsize[x] = space;
335 }
336 sdp->sd_max_jheight = x;
337 sdp->sd_jheightsize[x] = ~0;
338 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
339
340 return 0;
341}
342
343/** 36/**
344 * gfs2_jindex_hold - Grab a lock on the jindex 37 * gfs2_jindex_hold - Grab a lock on the jindex
345 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
581 return error; 274 return error;
582} 275}
583 276
584/**
585 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
586 * @sdp: the filesystem
587 *
588 * Returns: errno
589 */
590
591int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
592{
593 struct gfs2_holder t_gh;
594 int error;
595
596 gfs2_quota_sync(sdp);
597 gfs2_statfs_sync(sdp);
598
599 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
600 &t_gh);
601 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
602 return error;
603
604 gfs2_meta_syncfs(sdp);
605 gfs2_log_shutdown(sdp);
606
607 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
608
609 if (t_gh.gh_gl)
610 gfs2_glock_dq_uninit(&t_gh);
611
612 gfs2_quota_cleanup(sdp);
613
614 return error;
615}
616
617static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 277static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
618{ 278{
619 const struct gfs2_statfs_change *str = buf; 279 const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20void gfs2_lm_unmount(struct gfs2_sbd *sdp); 15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
21 16
22static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 17static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
40 struct gfs2_inode **ipp); 35 struct gfs2_inode **ipp);
41 36
42int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 37int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
43int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
44 38
45int gfs2_statfs_init(struct gfs2_sbd *sdp); 39int gfs2_statfs_init(struct gfs2_sbd *sdp);
46void gfs2_statfs_change(struct gfs2_sbd *sdp, 40void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 269ARGS_ATTR(suiddir, "%d\n");
270ARGS_ATTR(data, "%d\n"); 270ARGS_ATTR(data, "%d\n");
271 271
272/* one oddball doesn't fit the macro mold */
273static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
274{
275 return snprintf(buf, PAGE_SIZE, "%d\n",
276 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
277}
278static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
279
280static struct attribute *args_attrs[] = { 272static struct attribute *args_attrs[] = {
281 &args_attr_lockproto.attr, 273 &args_attr_lockproto.attr,
282 &args_attr_locktable.attr, 274 &args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
292 &args_attr_quota.attr, 284 &args_attr_quota.attr,
293 &args_attr_suiddir.attr, 285 &args_attr_suiddir.attr,
294 &args_attr_data.attr, 286 &args_attr_data.attr,
295 &args_attr_noatime.attr,
296 NULL, 287 NULL,
297}; 288};
298 289
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
407TUNE_ATTR(log_flush_secs, 0); 398TUNE_ATTR(log_flush_secs, 0);
408TUNE_ATTR(quota_warn_period, 0); 399TUNE_ATTR(quota_warn_period, 0);
409TUNE_ATTR(quota_quantum, 0); 400TUNE_ATTR(quota_quantum, 0);
410TUNE_ATTR(atime_quantum, 0);
411TUNE_ATTR(max_readahead, 0); 401TUNE_ATTR(max_readahead, 0);
412TUNE_ATTR(complain_secs, 0); 402TUNE_ATTR(complain_secs, 0);
413TUNE_ATTR(statfs_slow, 0); 403TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
427 &tune_attr_log_flush_secs.attr, 417 &tune_attr_log_flush_secs.attr,
428 &tune_attr_quota_warn_period.attr, 418 &tune_attr_quota_warn_period.attr,
429 &tune_attr_quota_quantum.attr, 419 &tune_attr_quota_quantum.attr,
430 &tune_attr_atime_quantum.attr,
431 &tune_attr_max_readahead.attr, 420 &tune_attr_max_readahead.attr,
432 &tune_attr_complain_secs.attr, 421 &tune_attr_complain_secs.attr,
433 &tune_attr_statfs_slow.attr, 422 &tune_attr_statfs_slow.attr,
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index ba851576ebb1..6d98f116ca03 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
190 190
191 fd->search_key->cat.ParID = rec.thread.ParID; 191 fd->search_key->cat.ParID = rec.thread.ParID;
192 len = fd->search_key->cat.CName.len = rec.thread.CName.len; 192 len = fd->search_key->cat.CName.len = rec.thread.CName.len;
193 if (len > HFS_NAMELEN) {
194 printk(KERN_ERR "hfs: bad catalog namelength\n");
195 return -EIO;
196 }
193 memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); 197 memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
194 return hfs_brec_find(fd); 198 return hfs_brec_find(fd);
195} 199}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4abb1047c689..3c7c7637719c 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -173,7 +173,7 @@ enum {
173 opt_err 173 opt_err
174}; 174};
175 175
176static match_table_t tokens = { 176static const match_table_t tokens = {
177 { opt_uid, "uid=%u" }, 177 { opt_uid, "uid=%u" },
178 { opt_gid, "gid=%u" }, 178 { opt_gid, "gid=%u" },
179 { opt_umask, "umask=%o" }, 179 { opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d128a25b74d2..ea30afc2a03c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
32 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 32 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
33 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 33 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
34 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); 34 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
35 if (IS_ERR(page)) {
36 start = size;
37 goto out;
38 }
35 pptr = kmap(page); 39 pptr = kmap(page);
36 curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; 40 curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
37 i = offset % 32; 41 i = offset % 32;
@@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
73 break; 77 break;
74 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, 78 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
75 NULL); 79 NULL);
80 if (IS_ERR(page)) {
81 start = size;
82 goto out;
83 }
76 curr = pptr = kmap(page); 84 curr = pptr = kmap(page);
77 if ((size ^ offset) / PAGE_CACHE_BITS) 85 if ((size ^ offset) / PAGE_CACHE_BITS)
78 end = pptr + PAGE_CACHE_BITS / 32; 86 end = pptr + PAGE_CACHE_BITS / 32;
@@ -120,6 +128,10 @@ found:
120 offset += PAGE_CACHE_BITS; 128 offset += PAGE_CACHE_BITS;
121 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, 129 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
122 NULL); 130 NULL);
131 if (IS_ERR(page)) {
132 start = size;
133 goto out;
134 }
123 pptr = kmap(page); 135 pptr = kmap(page);
124 curr = pptr; 136 curr = pptr;
125 end = pptr + PAGE_CACHE_BITS / 32; 137 end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index ba117c445e78..f6874acb2cf2 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
168 return -EIO; 168 return -EIO;
169 } 169 }
170 170
171 if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
172 printk(KERN_ERR "hfs: catalog name length corrupted\n");
173 return -EIO;
174 }
175
171 hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), 176 hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
172 &tmp.thread.nodeName); 177 &tmp.thread.nodeName);
173 return hfs_brec_find(fd); 178 return hfs_brec_find(fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9997cbf8beb5..9699c56d323f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -25,7 +25,7 @@ enum {
25 opt_force, opt_err 25 opt_force, opt_err
26}; 26};
27 27
28static match_table_t tokens = { 28static const match_table_t tokens = {
29 { opt_creator, "creator=%s" }, 29 { opt_creator, "creator=%s" },
30 { opt_type, "type=%s" }, 30 { opt_type, "type=%s" },
31 { opt_umask, "umask=%o" }, 31 { opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index e834e578c93f..eb74531a0a8e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
356 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 356 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
357 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 357 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
358 sb->s_flags |= MS_RDONLY; 358 sb->s_flags |= MS_RDONLY;
359 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { 359 } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
360 printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " 360 printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
361 "use the force option at your own risk, mounting read-only.\n"); 361 "use the force option at your own risk, mounting read-only.\n");
362 sb->s_flags |= MS_RDONLY; 362 sb->s_flags |= MS_RDONLY;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b8ae9c90ada0..29ad461d568f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -215,7 +215,7 @@ enum {
215 Opt_timeshift, Opt_err, 215 Opt_timeshift, Opt_err,
216}; 216};
217 217
218static match_table_t tokens = { 218static const match_table_t tokens = {
219 {Opt_help, "help"}, 219 {Opt_help, "help"},
220 {Opt_uid, "uid=%u"}, 220 {Opt_uid, "uid=%u"},
221 {Opt_gid, "gid=%u"}, 221 {Opt_gid, "gid=%u"},
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3f58923fb39b..61edc701b0e6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -57,7 +57,7 @@ enum {
57 Opt_err, 57 Opt_err,
58}; 58};
59 59
60static match_table_t tokens = { 60static const match_table_t tokens = {
61 {Opt_size, "size=%s"}, 61 {Opt_size, "size=%s"},
62 {Opt_nr_inodes, "nr_inodes=%s"}, 62 {Opt_nr_inodes, "nr_inodes=%s"},
63 {Opt_mode, "mode=%o"}, 63 {Opt_mode, "mode=%o"},
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a253..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
323} 323}
324 324
325/* 325/*
326 * remove_kevent - cleans up and ultimately frees the given kevent 326 * remove_kevent - cleans up the given kevent
327 * 327 *
328 * Caller must hold dev->ev_mutex. 328 * Caller must hold dev->ev_mutex.
329 */ 329 */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
334 334
335 dev->event_count--; 335 dev->event_count--;
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
337}
337 338
339/*
340 * free_kevent - frees the given kevent.
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
338 kfree(kevent->name); 344 kfree(kevent->name);
339 kmem_cache_free(event_cachep, kevent); 345 kmem_cache_free(event_cachep, kevent);
340} 346}
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
350 struct inotify_kernel_event *kevent; 356 struct inotify_kernel_event *kevent;
351 kevent = inotify_dev_get_event(dev); 357 kevent = inotify_dev_get_event(dev);
352 remove_kevent(dev, kevent); 358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
353 } 360 }
354} 361}
355 362
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
433 dev = file->private_data; 440 dev = file->private_data;
434 441
435 while (1) { 442 while (1) {
436 int events;
437 443
438 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
439 445
440 mutex_lock(&dev->ev_mutex); 446 mutex_lock(&dev->ev_mutex);
441 events = !list_empty(&dev->events); 447 if (!list_empty(&dev->events)) {
442 mutex_unlock(&dev->ev_mutex);
443 if (events) {
444 ret = 0; 448 ret = 0;
445 break; 449 break;
446 } 450 }
451 mutex_unlock(&dev->ev_mutex);
447 452
448 if (file->f_flags & O_NONBLOCK) { 453 if (file->f_flags & O_NONBLOCK) {
449 ret = -EAGAIN; 454 ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
462 if (ret) 467 if (ret)
463 return ret; 468 return ret;
464 469
465 mutex_lock(&dev->ev_mutex);
466 while (1) { 470 while (1) {
467 struct inotify_kernel_event *kevent; 471 struct inotify_kernel_event *kevent;
468 472
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
481 } 485 }
482 break; 486 break;
483 } 487 }
488 remove_kevent(dev, kevent);
489
490 /*
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
484 495
485 if (copy_to_user(buf, &kevent->event, event_size)) { 496 if (copy_to_user(buf, &kevent->event, event_size)) {
486 ret = -EFAULT; 497 ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
498 count -= kevent->event.len; 509 count -= kevent->event.len;
499 } 510 }
500 511
501 remove_kevent(dev, kevent); 512 free_kevent(kevent);
513
514 mutex_lock(&dev->ev_mutex);
502 } 515 }
503 mutex_unlock(&dev->ev_mutex); 516 mutex_unlock(&dev->ev_mutex);
504 517
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d3..d152856c371b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h>
17#include <linux/buffer_head.h>
16 18
17#include <asm/ioctls.h> 19#include <asm/ioctls.h>
18 20
21/* So that the fiemap access checks can't overflow on 32 bit machines. */
22#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
23
19/** 24/**
20 * vfs_ioctl - call filesystem specific ioctl methods 25 * vfs_ioctl - call filesystem specific ioctl methods
21 * @filp: open file to invoke ioctl method on 26 * @filp: open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
71 return put_user(res, p); 76 return put_user(res, p);
72} 77}
73 78
79/**
80 * fiemap_fill_next_extent - Fiemap helper function
81 * @fieinfo: Fiemap context passed into ->fiemap
82 * @logical: Extent logical start offset, in bytes
83 * @phys: Extent physical start offset, in bytes
84 * @len: Extent length, in bytes
85 * @flags: FIEMAP_EXTENT flags that describe this extent
86 *
87 * Called from file system ->fiemap callback. Will populate extent
88 * info as passed in via arguments and copy to user memory. On
89 * success, extent count on fieinfo is incremented.
90 *
91 * Returns 0 on success, -errno on error, 1 if this was the last
92 * extent that will fit in user array.
93 */
94#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
95#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
96#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
97int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
98 u64 phys, u64 len, u32 flags)
99{
100 struct fiemap_extent extent;
101 struct fiemap_extent *dest = fieinfo->fi_extents_start;
102
103 /* only count the extents */
104 if (fieinfo->fi_extents_max == 0) {
105 fieinfo->fi_extents_mapped++;
106 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
107 }
108
109 if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
110 return 1;
111
112 if (flags & SET_UNKNOWN_FLAGS)
113 flags |= FIEMAP_EXTENT_UNKNOWN;
114 if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
115 flags |= FIEMAP_EXTENT_ENCODED;
116 if (flags & SET_NOT_ALIGNED_FLAGS)
117 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
118
119 memset(&extent, 0, sizeof(extent));
120 extent.fe_logical = logical;
121 extent.fe_physical = phys;
122 extent.fe_length = len;
123 extent.fe_flags = flags;
124
125 dest += fieinfo->fi_extents_mapped;
126 if (copy_to_user(dest, &extent, sizeof(extent)))
127 return -EFAULT;
128
129 fieinfo->fi_extents_mapped++;
130 if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
131 return 1;
132 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
133}
134EXPORT_SYMBOL(fiemap_fill_next_extent);
135
136/**
137 * fiemap_check_flags - check validity of requested flags for fiemap
138 * @fieinfo: Fiemap context passed into ->fiemap
139 * @fs_flags: Set of fiemap flags that the file system understands
140 *
141 * Called from file system ->fiemap callback. This will compute the
142 * intersection of valid fiemap flags and those that the fs supports. That
143 * value is then compared against the user supplied flags. In case of bad user
144 * flags, the invalid values will be written into the fieinfo structure, and
145 * -EBADR is returned, which tells ioctl_fiemap() to return those values to
146 * userspace. For this reason, a return code of -EBADR should be preserved.
147 *
148 * Returns 0 on success, -EBADR on bad flags.
149 */
150int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
151{
152 u32 incompat_flags;
153
154 incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
155 if (incompat_flags) {
156 fieinfo->fi_flags = incompat_flags;
157 return -EBADR;
158 }
159 return 0;
160}
161EXPORT_SYMBOL(fiemap_check_flags);
162
163static int fiemap_check_ranges(struct super_block *sb,
164 u64 start, u64 len, u64 *new_len)
165{
166 *new_len = len;
167
168 if (len == 0)
169 return -EINVAL;
170
171 if (start > sb->s_maxbytes)
172 return -EFBIG;
173
174 /*
175 * Shrink request scope to what the fs can actually handle.
176 */
177 if ((len > sb->s_maxbytes) ||
178 (sb->s_maxbytes - len) < start)
179 *new_len = sb->s_maxbytes - start;
180
181 return 0;
182}
183
184static int ioctl_fiemap(struct file *filp, unsigned long arg)
185{
186 struct fiemap fiemap;
187 struct fiemap_extent_info fieinfo = { 0, };
188 struct inode *inode = filp->f_path.dentry->d_inode;
189 struct super_block *sb = inode->i_sb;
190 u64 len;
191 int error;
192
193 if (!inode->i_op->fiemap)
194 return -EOPNOTSUPP;
195
196 if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
197 sizeof(struct fiemap)))
198 return -EFAULT;
199
200 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
201 return -EINVAL;
202
203 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
204 &len);
205 if (error)
206 return error;
207
208 fieinfo.fi_flags = fiemap.fm_flags;
209 fieinfo.fi_extents_max = fiemap.fm_extent_count;
210 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
211
212 if (fiemap.fm_extent_count != 0 &&
213 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
214 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
215 return -EFAULT;
216
217 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
218 filemap_write_and_wait(inode->i_mapping);
219
220 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
221 fiemap.fm_flags = fieinfo.fi_flags;
222 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
223 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
224 error = -EFAULT;
225
226 return error;
227}
228
229#ifdef CONFIG_BLOCK
230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233
234/*
235 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function
238 *
239 * This does FIEMAP for block based inodes. Basically it will just loop
240 * through get_block until we hit the number of extents we want to map, or we
241 * go past the end of the file and hit a hole.
242 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size
246 */
247int generic_block_fiemap(struct inode *inode,
248 struct fiemap_extent_info *fieinfo, u64 start,
249 u64 len, get_block_t *get_block)
250{
251 struct buffer_head tmp;
252 unsigned int start_blk;
253 long long length = 0, map_len = 0;
254 u64 logical = 0, phys = 0, size = 0;
255 u32 flags = FIEMAP_EXTENT_MERGED;
256 int ret = 0;
257
258 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
259 return ret;
260
261 start_blk = logical_to_blk(inode, start);
262
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length;
268
269 do {
270 /*
271 * we set b_size to the total size we want so it will map as
272 * many contiguous blocks as possible at once
273 */
274 memset(&tmp, 0, sizeof(struct buffer_head));
275 tmp.b_size = map_len;
276
277 ret = get_block(inode, start_blk, &tmp, 0);
278 if (ret)
279 break;
280
281 /* HOLE */
282 if (!buffer_mapped(&tmp)) {
283 /*
284 * first hole after going past the EOF, this is our
285 * last extent
286 */
287 if (length <= 0) {
288 flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
289 ret = fiemap_fill_next_extent(fieinfo, logical,
290 phys, size,
291 flags);
292 break;
293 }
294
295 length -= blk_to_logical(inode, 1);
296
297 /* if we have holes up to/past EOF then we're done */
298 if (length <= 0)
299 break;
300
301 start_blk++;
302 } else {
303 if (length <= 0 && size) {
304 ret = fiemap_fill_next_extent(fieinfo, logical,
305 phys, size,
306 flags);
307 if (ret)
308 break;
309 }
310
311 logical = blk_to_logical(inode, start_blk);
312 phys = blk_to_logical(inode, tmp.b_blocknr);
313 size = tmp.b_size;
314 flags = FIEMAP_EXTENT_MERGED;
315
316 length -= tmp.b_size;
317 start_blk += logical_to_blk(inode, size);
318
319 /*
320 * if we are past the EOF we need to loop again to see
321 * if there is a hole so we can mark this extent as the
322 * last one, and if not keep mapping things until we
323 * find a hole, or we run out of slots in the extent
324 * array
325 */
326 if (length <= 0)
327 continue;
328
329 ret = fiemap_fill_next_extent(fieinfo, logical, phys,
330 size, flags);
331 if (ret)
332 break;
333 }
334 cond_resched();
335 } while (1);
336
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1)
341 ret = 0;
342
343 return ret;
344}
345EXPORT_SYMBOL(generic_block_fiemap);
346
347#endif /* CONFIG_BLOCK */
348
74static int file_ioctl(struct file *filp, unsigned int cmd, 349static int file_ioctl(struct file *filp, unsigned int cmd,
75 unsigned long arg) 350 unsigned long arg)
76{ 351{
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
80 switch (cmd) { 355 switch (cmd) {
81 case FIBMAP: 356 case FIBMAP:
82 return ioctl_fibmap(filp, p); 357 return ioctl_fibmap(filp, p);
358 case FS_IOC_FIEMAP:
359 return ioctl_fiemap(filp, arg);
83 case FIGETBSZ: 360 case FIGETBSZ:
84 return put_user(inode->i_sb->s_blocksize, p); 361 return put_user(inode->i_sb->s_blocksize, p);
85 case FIONREAD: 362 case FIONREAD:
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 26948a6033b6..3f8af0f1505b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -310,7 +310,7 @@ enum {
310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, 310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
311}; 311};
312 312
313static match_table_t tokens = { 313static const match_table_t tokens = {
314 {Opt_norock, "norock"}, 314 {Opt_norock, "norock"},
315 {Opt_nojoliet, "nojoliet"}, 315 {Opt_nojoliet, "nojoliet"},
316 {Opt_unhide, "unhide"}, 316 {Opt_unhide, "unhide"},
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8a..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25 26
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 94 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
95 96
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
126 128
127 /* 129 /*
128 * Test again, another process may have checkpointed while we 130 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 131 * were waiting for the checkpoint lock. If there are no
132 * outstanding transactions there is nothing to checkpoint and
133 * we can't make progress. Abort the journal in this case.
130 */ 134 */
131 spin_lock(&journal->j_state_lock); 135 spin_lock(&journal->j_state_lock);
136 spin_lock(&journal->j_list_lock);
132 nblocks = jbd_space_needed(journal); 137 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 138 if (__jbd2_log_space_left(journal) < nblocks) {
139 int chkpt = journal->j_checkpoint_transactions != NULL;
140
141 spin_unlock(&journal->j_list_lock);
134 spin_unlock(&journal->j_state_lock); 142 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 143 if (chkpt) {
144 jbd2_log_do_checkpoint(journal);
145 } else {
146 printk(KERN_ERR "%s: no transactions\n",
147 __func__);
148 jbd2_journal_abort(journal, 0);
149 }
150
136 spin_lock(&journal->j_state_lock); 151 spin_lock(&journal->j_state_lock);
152 } else {
153 spin_unlock(&journal->j_list_lock);
137 } 154 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 155 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 156 }
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
162 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
163 * Called with j_list_lock held. 183 * Called with j_list_lock held.
164 */ 184 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 186{
167 struct journal_head *jh; 187 struct journal_head *jh;
168 struct buffer_head *bh; 188 struct buffer_head *bh;
169 tid_t this_tid; 189 tid_t this_tid;
170 int released = 0; 190 int released = 0;
191 int ret = 0;
171 192
172 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
173restart: 194restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
177 return; 198 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
195 goto restart; 216 goto restart;
196 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
197 /* 221 /*
198 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
203 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh); 228 __brelse(bh);
205 } 229 }
230
231 return ret;
206} 232}
207 233
208#define NR_BATCH 64 234#define NR_BATCH 64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
227 * 253 *
228 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
230 * 257 *
231 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
258 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
259 ret = 1; 286 ret = 1;
260 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
262 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
263 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
265 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
266 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
267 __brelse(bh); 297 __brelse(bh);
268 ret = 1;
269 } else { 298 } else {
270 /* 299 /*
271 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
298 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
299 * 328 *
300 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
301 */ 331 */
302int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
303{ 333{
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
313 * journal straight away. 343 * journal straight away.
314 */ 344 */
315 result = jbd2_cleanup_journal_tail(journal); 345 result = jbd2_cleanup_journal_tail(journal);
346 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
347 journal->j_devname, result);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 348 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 349 if (result <= 0)
318 return result; 350 return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
321 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
322 * and write it. 354 * and write it.
323 */ 355 */
356 result = 0;
324 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
325 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
326 goto out; 359 goto out;
@@ -339,7 +372,7 @@ restart:
339 int batch_count = 0; 372 int batch_count = 0;
340 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
341 struct journal_head *jh; 374 struct journal_head *jh;
342 int retry = 0; 375 int retry = 0, err;
343 376
344 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
345 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
353 } 386 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
356 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
358 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
377 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
378 * list. Let's clean up the second one 413 * list. Let's clean up the second one
379 */ 414 */
380 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
381 } 418 }
382out: 419out:
383 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
384 result = jbd2_cleanup_journal_tail(journal);
385 if (result < 0) 421 if (result < 0)
386 return result; 422 jbd2_journal_abort(journal, result);
387 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
388} 427}
389 428
390/* 429/*
@@ -400,8 +439,9 @@ out:
400 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
401 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
402 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
403 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
404 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written-back to the filesystem.
405 */ 445 */
406 446
407int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
410 tid_t first_tid; 450 tid_t first_tid;
411 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
412 452
453 if (is_journal_aborted(journal))
454 return 1;
455
413 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
414 * the log block it starts at. 457 * the log block it starts at.
415 * 458 *
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95ec..8b119e16aa36 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
126 127
127 JBUFFER_TRACE(descriptor, "submit commit block"); 128 JBUFFER_TRACE(descriptor, "submit commit block");
128 lock_buffer(bh); 129 lock_buffer(bh);
129 get_bh(bh); 130 clear_buffer_dirty(bh);
130 set_buffer_dirty(bh);
131 set_buffer_uptodate(bh); 131 set_buffer_uptodate(bh);
132 bh->b_end_io = journal_end_buffer_io_sync; 132 bh->b_end_io = journal_end_buffer_io_sync;
133 133
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
147 * to remember if we sent a barrier request 147 * to remember if we sent a barrier request
148 */ 148 */
149 if (ret == -EOPNOTSUPP && barrier_done) { 149 if (ret == -EOPNOTSUPP && barrier_done) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING 150 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 151 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", 152 "disabling barriers\n", journal->j_devname);
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock); 153 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JBD2_BARRIER; 154 journal->j_flags &= ~JBD2_BARRIER;
158 spin_unlock(&journal->j_state_lock); 155 spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
160 /* And try again, without the barrier */ 157 /* And try again, without the barrier */
161 lock_buffer(bh); 158 lock_buffer(bh);
162 set_buffer_uptodate(bh); 159 set_buffer_uptodate(bh);
163 set_buffer_dirty(bh); 160 clear_buffer_dirty(bh);
164 ret = submit_bh(WRITE, bh); 161 ret = submit_bh(WRITE, bh);
165 } 162 }
166 *cbh = bh; 163 *cbh = bh;
@@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
371 commit_transaction = journal->j_running_transaction; 368 commit_transaction = journal->j_running_transaction;
372 J_ASSERT(commit_transaction->t_state == T_RUNNING); 369 J_ASSERT(commit_transaction->t_state == T_RUNNING);
373 370
371 trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 journal->j_devname, commit_transaction->t_tid);
374 jbd_debug(1, "JBD: starting commit of transaction %d\n", 373 jbd_debug(1, "JBD: starting commit of transaction %d\n",
375 commit_transaction->t_tid); 374 commit_transaction->t_tid);
376 375
@@ -505,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
505 jh = commit_transaction->t_buffers; 504 jh = commit_transaction->t_buffers;
506 505
507 /* If we're in abort mode, we just un-journal the buffer and 506 /* If we're in abort mode, we just un-journal the buffer and
508 release it for background writing. */ 507 release it. */
509 508
510 if (is_journal_aborted(journal)) { 509 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 511 JBUFFER_TRACE(jh, "journal is aborting: refile");
512 jbd2_journal_refile_buffer(journal, jh); 512 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 513 /* If that was the last one, we need to clean up
@@ -681,11 +681,11 @@ start_journal_io:
681 */ 681 */
682 err = journal_finish_inode_data_buffers(journal, commit_transaction); 682 err = journal_finish_inode_data_buffers(journal, commit_transaction);
683 if (err) { 683 if (err) {
684 char b[BDEVNAME_SIZE];
685
686 printk(KERN_WARNING 684 printk(KERN_WARNING
687 "JBD2: Detected IO errors while flushing file data " 685 "JBD2: Detected IO errors while flushing file data "
688 "on %s\n", bdevname(journal->j_fs_dev, b)); 686 "on %s\n", journal->j_devname);
687 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
688 jbd2_journal_abort(journal, err);
689 err = 0; 689 err = 0;
690 } 690 }
691 691
@@ -786,6 +786,9 @@ wait_for_iobuf:
786 /* AKPM: bforget here */ 786 /* AKPM: bforget here */
787 } 787 }
788 788
789 if (err)
790 jbd2_journal_abort(journal, err);
791
789 jbd_debug(3, "JBD: commit phase 5\n"); 792 jbd_debug(3, "JBD: commit phase 5\n");
790 793
791 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 794 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -884,6 +887,8 @@ restart_loop:
884 if (buffer_jbddirty(bh)) { 887 if (buffer_jbddirty(bh)) {
885 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 888 JBUFFER_TRACE(jh, "add to new checkpointing trans");
886 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 889 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
890 if (is_journal_aborted(journal))
891 clear_buffer_jbddirty(bh);
887 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 892 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
888 __jbd2_journal_refile_buffer(jh); 893 __jbd2_journal_refile_buffer(jh);
889 jbd_unlock_bh_state(bh); 894 jbd_unlock_bh_state(bh);
@@ -990,6 +995,12 @@ restart_loop:
990 } 995 }
991 spin_unlock(&journal->j_list_lock); 996 spin_unlock(&journal->j_list_lock);
992 997
998 if (journal->j_commit_callback)
999 journal->j_commit_callback(journal, commit_transaction);
1000
1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1002 journal->j_devname, commit_transaction->t_tid,
1003 journal->j_tail_sequence);
993 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1004 jbd_debug(1, "JBD: commit %d complete, head %d\n",
994 journal->j_commit_sequence, journal->j_tail_sequence); 1005 journal->j_commit_sequence, journal->j_tail_sequence);
995 1006
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4edb..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
597 if (ret) 597 if (ret)
598 *retp = ret; 598 *retp = ret;
599 else { 599 else {
600 char b[BDEVNAME_SIZE];
601
602 printk(KERN_ALERT "%s: journal block not found " 600 printk(KERN_ALERT "%s: journal block not found "
603 "at offset %lu on %s\n", 601 "at offset %lu on %s\n",
604 __func__, 602 __func__, blocknr, journal->j_devname);
605 blocknr,
606 bdevname(journal->j_dev, b));
607 err = -EIO; 603 err = -EIO;
608 __journal_abort_soft(journal, err); 604 __journal_abort_soft(journal, err);
609 } 605 }
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
901 897
902static void jbd2_stats_proc_init(journal_t *journal) 898static void jbd2_stats_proc_init(journal_t *journal)
903{ 899{
904 char name[BDEVNAME_SIZE]; 900 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
905
906 bdevname(journal->j_dev, name);
907 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
908 if (journal->j_proc_entry) { 901 if (journal->j_proc_entry) {
909 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 902 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
910 &jbd2_seq_history_fops, journal); 903 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
915 908
916static void jbd2_stats_proc_exit(journal_t *journal) 909static void jbd2_stats_proc_exit(journal_t *journal)
917{ 910{
918 char name[BDEVNAME_SIZE];
919
920 bdevname(journal->j_dev, name);
921 remove_proc_entry("info", journal->j_proc_entry); 911 remove_proc_entry("info", journal->j_proc_entry);
922 remove_proc_entry("history", journal->j_proc_entry); 912 remove_proc_entry("history", journal->j_proc_entry);
923 remove_proc_entry(name, proc_jbd2_stats); 913 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
924} 914}
925 915
926static void journal_init_stats(journal_t *journal) 916static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1018{ 1008{
1019 journal_t *journal = journal_init_common(); 1009 journal_t *journal = journal_init_common();
1020 struct buffer_head *bh; 1010 struct buffer_head *bh;
1011 char *p;
1021 int n; 1012 int n;
1022 1013
1023 if (!journal) 1014 if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1039 journal->j_fs_dev = fs_dev; 1030 journal->j_fs_dev = fs_dev;
1040 journal->j_blk_offset = start; 1031 journal->j_blk_offset = start;
1041 journal->j_maxlen = len; 1032 journal->j_maxlen = len;
1033 bdevname(journal->j_dev, journal->j_devname);
1034 p = journal->j_devname;
1035 while ((p = strchr(p, '/')))
1036 *p = '!';
1042 jbd2_stats_proc_init(journal); 1037 jbd2_stats_proc_init(journal);
1043 1038
1044 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1061{ 1056{
1062 struct buffer_head *bh; 1057 struct buffer_head *bh;
1063 journal_t *journal = journal_init_common(); 1058 journal_t *journal = journal_init_common();
1059 char *p;
1064 int err; 1060 int err;
1065 int n; 1061 int n;
1066 unsigned long long blocknr; 1062 unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1070 1066
1071 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1067 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
1072 journal->j_inode = inode; 1068 journal->j_inode = inode;
1069 bdevname(journal->j_dev, journal->j_devname);
1070 p = journal->j_devname;
1071 while ((p = strchr(p, '/')))
1072 *p = '!';
1073 p = journal->j_devname + strlen(journal->j_devname);
1074 sprintf(p, ":%lu", journal->j_inode->i_ino);
1073 jbd_debug(1, 1075 jbd_debug(1,
1074 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1076 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1075 journal, inode->i_sb->s_id, inode->i_ino, 1077 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1253 goto out; 1255 goto out;
1254 } 1256 }
1255 1257
1258 if (buffer_write_io_error(bh)) {
1259 /*
1260 * Oh, dear. A previous attempt to write the journal
1261 * superblock failed. This could happen because the
1262 * USB device was yanked out. Or it could happen to
1263 * be a transient write error and maybe the block will
1264 * be remapped. Nothing we can do but to retry the
1265 * write and hope for the best.
1266 */
1267 printk(KERN_ERR "JBD2: previous I/O error detected "
1268 "for journal superblock update for %s.\n",
1269 journal->j_devname);
1270 clear_buffer_write_io_error(bh);
1271 set_buffer_uptodate(bh);
1272 }
1273
1256 spin_lock(&journal->j_state_lock); 1274 spin_lock(&journal->j_state_lock);
1257 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1275 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1258 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1276 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1264 1282
1265 BUFFER_TRACE(bh, "marking dirty"); 1283 BUFFER_TRACE(bh, "marking dirty");
1266 mark_buffer_dirty(bh); 1284 mark_buffer_dirty(bh);
1267 if (wait) 1285 if (wait) {
1268 sync_dirty_buffer(bh); 1286 sync_dirty_buffer(bh);
1269 else 1287 if (buffer_write_io_error(bh)) {
1288 printk(KERN_ERR "JBD2: I/O error detected "
1289 "when updating journal superblock for %s.\n",
1290 journal->j_devname);
1291 clear_buffer_write_io_error(bh);
1292 set_buffer_uptodate(bh);
1293 }
1294 } else
1270 ll_rw_block(SWRITE, 1, &bh); 1295 ll_rw_block(SWRITE, 1, &bh);
1271 1296
1272out: 1297out:
@@ -1426,9 +1451,12 @@ recovery_error:
1426 * 1451 *
1427 * Release a journal_t structure once it is no longer in use by the 1452 * Release a journal_t structure once it is no longer in use by the
1428 * journaled object. 1453 * journaled object.
1454 * Return <0 if we couldn't clean up the journal.
1429 */ 1455 */
1430void jbd2_journal_destroy(journal_t *journal) 1456int jbd2_journal_destroy(journal_t *journal)
1431{ 1457{
1458 int err = 0;
1459
1432 /* Wait for the commit thread to wake up and die. */ 1460 /* Wait for the commit thread to wake up and die. */
1433 journal_kill_thread(journal); 1461 journal_kill_thread(journal);
1434 1462
@@ -1451,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
1451 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1479 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1452 spin_unlock(&journal->j_list_lock); 1480 spin_unlock(&journal->j_list_lock);
1453 1481
1454 /* We can now mark the journal as empty. */
1455 journal->j_tail = 0;
1456 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1457 if (journal->j_sb_buffer) { 1482 if (journal->j_sb_buffer) {
1458 jbd2_journal_update_superblock(journal, 1); 1483 if (!is_journal_aborted(journal)) {
1484 /* We can now mark the journal as empty. */
1485 journal->j_tail = 0;
1486 journal->j_tail_sequence =
1487 ++journal->j_transaction_sequence;
1488 jbd2_journal_update_superblock(journal, 1);
1489 } else {
1490 err = -EIO;
1491 }
1459 brelse(journal->j_sb_buffer); 1492 brelse(journal->j_sb_buffer);
1460 } 1493 }
1461 1494
@@ -1467,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
1467 jbd2_journal_destroy_revoke(journal); 1500 jbd2_journal_destroy_revoke(journal);
1468 kfree(journal->j_wbuf); 1501 kfree(journal->j_wbuf);
1469 kfree(journal); 1502 kfree(journal);
1503
1504 return err;
1470} 1505}
1471 1506
1472 1507
@@ -1692,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
1692 spin_lock(&journal->j_list_lock); 1727 spin_lock(&journal->j_list_lock);
1693 while (!err && journal->j_checkpoint_transactions != NULL) { 1728 while (!err && journal->j_checkpoint_transactions != NULL) {
1694 spin_unlock(&journal->j_list_lock); 1729 spin_unlock(&journal->j_list_lock);
1730 mutex_lock(&journal->j_checkpoint_mutex);
1695 err = jbd2_log_do_checkpoint(journal); 1731 err = jbd2_log_do_checkpoint(journal);
1732 mutex_unlock(&journal->j_checkpoint_mutex);
1696 spin_lock(&journal->j_list_lock); 1733 spin_lock(&journal->j_list_lock);
1697 } 1734 }
1698 spin_unlock(&journal->j_list_lock); 1735 spin_unlock(&journal->j_list_lock);
1736
1737 if (is_journal_aborted(journal))
1738 return -EIO;
1739
1699 jbd2_cleanup_journal_tail(journal); 1740 jbd2_cleanup_journal_tail(journal);
1700 1741
1701 /* Finally, mark the journal as really needing no recovery. 1742 /* Finally, mark the journal as really needing no recovery.
@@ -1717,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
1717 J_ASSERT(journal->j_head == journal->j_tail); 1758 J_ASSERT(journal->j_head == journal->j_tail);
1718 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1759 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1719 spin_unlock(&journal->j_state_lock); 1760 spin_unlock(&journal->j_state_lock);
1720 return err; 1761 return 0;
1721} 1762}
1722 1763
1723/** 1764/**
@@ -1761,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1761} 1802}
1762 1803
1763/* 1804/*
1764 * journal_dev_name: format a character string to describe on what
1765 * device this journal is present.
1766 */
1767
1768static const char *journal_dev_name(journal_t *journal, char *buffer)
1769{
1770 struct block_device *bdev;
1771
1772 if (journal->j_inode)
1773 bdev = journal->j_inode->i_sb->s_bdev;
1774 else
1775 bdev = journal->j_dev;
1776
1777 return bdevname(bdev, buffer);
1778}
1779
1780/*
1781 * Journal abort has very specific semantics, which we describe 1805 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1806 * for journal abort.
1783 * 1807 *
@@ -1793,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1793void __jbd2_journal_abort_hard(journal_t *journal) 1817void __jbd2_journal_abort_hard(journal_t *journal)
1794{ 1818{
1795 transaction_t *transaction; 1819 transaction_t *transaction;
1796 char b[BDEVNAME_SIZE];
1797 1820
1798 if (journal->j_flags & JBD2_ABORT) 1821 if (journal->j_flags & JBD2_ABORT)
1799 return; 1822 return;
1800 1823
1801 printk(KERN_ERR "Aborting journal on device %s.\n", 1824 printk(KERN_ERR "Aborting journal on device %s.\n",
1802 journal_dev_name(journal, b)); 1825 journal->j_devname);
1803 1826
1804 spin_lock(&journal->j_state_lock); 1827 spin_lock(&journal->j_state_lock);
1805 journal->j_flags |= JBD2_ABORT; 1828 journal->j_flags |= JBD2_ABORT;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
225 */ 225 */
226int jbd2_journal_recover(journal_t *journal) 226int jbd2_journal_recover(journal_t *journal)
227{ 227{
228 int err; 228 int err, err2;
229 journal_superblock_t * sb; 229 journal_superblock_t * sb;
230 230
231 struct recovery_info info; 231 struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
263 journal->j_transaction_sequence = ++info.end_transaction; 263 journal->j_transaction_sequence = ++info.end_transaction;
264 264
265 jbd2_journal_clear_revoke(journal); 265 jbd2_journal_clear_revoke(journal);
266 sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
267 if (!err)
268 err = err2;
269
267 return err; 270 return err;
268} 271}
269 272
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5d540588fa9..39b7805a599a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
52 transaction->t_expires = jiffies + journal->j_commit_interval; 52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 53 spin_lock_init(&transaction->t_handle_lock);
54 INIT_LIST_HEAD(&transaction->t_inode_list); 54 INIT_LIST_HEAD(&transaction->t_inode_list);
55 INIT_LIST_HEAD(&transaction->t_private_list);
55 56
56 /* Set up the commit timer for the new transaction. */ 57 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 58 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdde..4c41db91eaa4 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
12#ifndef _JFFS2_FS_I 12#ifndef _JFFS2_FS_I
13#define _JFFS2_FS_I 13#define _JFFS2_FS_I
14 14
15#include <linux/version.h>
16#include <linux/rbtree.h> 15#include <linux/rbtree.h>
17#include <linux/posix_acl.h> 16#include <linux/posix_acl.h>
18#include <linux/mutex.h> 17#include <linux/mutex.h>
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3630718be395..0dae345e481b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -199,7 +199,7 @@ enum {
199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask 199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
200}; 200};
201 201
202static match_table_t tokens = { 202static const match_table_t tokens = {
203 {Opt_integrity, "integrity"}, 203 {Opt_integrity, "integrity"},
204 {Opt_nointegrity, "nointegrity"}, 204 {Opt_nointegrity, "nointegrity"},
205 {Opt_iocharset, "iocharset=%s"}, 205 {Opt_iocharset, "iocharset=%s"},
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
8 svcproc.o svcsubs.o mon.o xdr.o 8 svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; 54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
55 int status; 55 int status;
56 56
57 status = lockd_up(nlm_init->protocol); 57 status = lockd_up();
58 if (status < 0) 58 if (status < 0)
59 return ERR_PTR(status); 59 return ERR_PTR(status);
60 60
61 host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, 61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 62 nlm_init->protocol, nlm_version,
63 nlm_init->hostname, 63 nlm_init->hostname);
64 strlen(nlm_init->hostname));
65 if (host == NULL) { 64 if (host == NULL) {
66 lockd_down(); 65 lockd_down();
67 return ERR_PTR(-ENOLCK); 66 return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
142/* 141/*
143 * The server lockd has called us back to tell us the lock was granted 142 * The server lockd has called us back to tell us the lock was granted
144 */ 143 */
145__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) 144__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
146{ 145{
147 const struct file_lock *fl = &lock->fl; 146 const struct file_lock *fl = &lock->fl;
148 const struct nfs_fh *fh = &lock->fh; 147 const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
166 */ 165 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 166 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 167 continue;
169 if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) 168 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
170 continue; 169 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 170 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 171 continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
216 /* This one ensures that our parent doesn't terminate while the 215 /* This one ensures that our parent doesn't terminate while the
217 * reclaim is in progress */ 216 * reclaim is in progress */
218 lock_kernel(); 217 lock_kernel();
219 lockd_up(0); /* note: this cannot fail as lockd is already running */ 218 lockd_up(); /* note: this cannot fail as lockd is already running */
220 219
221 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 220 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
222 221
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
1/*
2 * Common code for control of lockd and nfsv4 grace periods.
3 */
4
5#include <linux/module.h>
6#include <linux/lockd/bind.h>
7
8static LIST_HEAD(grace_list);
9static DEFINE_SPINLOCK(grace_lock);
10
11/**
12 * locks_start_grace
13 * @lm: who this grace period is for
14 *
15 * A grace period is a period during which locks should not be given
16 * out. Currently grace periods are only enforced by the two lock
17 * managers (lockd and nfsd), using the locks_in_grace() function to
18 * check when they are in a grace period.
19 *
20 * This function is called to start a grace period.
21 */
22void locks_start_grace(struct lock_manager *lm)
23{
24 spin_lock(&grace_lock);
25 list_add(&lm->list, &grace_list);
26 spin_unlock(&grace_lock);
27}
28EXPORT_SYMBOL_GPL(locks_start_grace);
29
30/**
31 * locks_end_grace
32 * @lm: who this grace period is for
33 *
34 * Call this function to state that the given lock manager is ready to
35 * resume regular locking. The grace period will not end until all lock
36 * managers that called locks_start_grace() also call locks_end_grace().
37 * Note that callers count on it being safe to call this more than once,
38 * and the second call should be a no-op.
39 */
40void locks_end_grace(struct lock_manager *lm)
41{
42 spin_lock(&grace_lock);
43 list_del_init(&lm->list);
44 spin_unlock(&grace_lock);
45}
46EXPORT_SYMBOL_GPL(locks_end_grace);
47
48/**
49 * locks_in_grace
50 *
51 * Lock managers call this function to determine when it is OK for them
52 * to answer ordinary lock requests, and when they should accept only
53 * lock reclaims.
54 */
55int locks_in_grace(void)
56{
57 return !list_empty(&grace_list);
58}
59EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/in6.h>
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
17#include <linux/lockd/sm_inter.h> 18#include <linux/lockd/sm_inter.h>
18#include <linux/mutex.h> 19#include <linux/mutex.h>
19 20
21#include <net/ipv6.h>
20 22
21#define NLMDBG_FACILITY NLMDBG_HOSTCACHE 23#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
22#define NLM_HOST_NRHASH 32 24#define NLM_HOST_NRHASH 32
23#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
24#define NLM_HOST_REBIND (60 * HZ) 25#define NLM_HOST_REBIND (60 * HZ)
25#define NLM_HOST_EXPIRE (300 * HZ) 26#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 27#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
30static int nrhosts; 31static int nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 32static DEFINE_MUTEX(nlm_host_mutex);
32 33
33
34static void nlm_gc_hosts(void); 34static void nlm_gc_hosts(void);
35static struct nsm_handle * __nsm_find(const struct sockaddr_in *, 35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const char *, unsigned int, int); 36 const size_t salen,
37static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, 37 const char *hostname,
38 const char *hostname, 38 const size_t hostname_len,
39 unsigned int hostname_len); 39 const int create);
40
41struct nlm_lookup_host_info {
42 const int server; /* search for server|client */
43 const struct sockaddr *sap; /* address to search for */
44 const size_t salen; /* its length */
45 const unsigned short protocol; /* transport to search for*/
46 const u32 version; /* NLM version to search for */
47 const char *hostname; /* remote's hostname */
48 const size_t hostname_len; /* its length */
49 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* its length */
51};
52
53/*
54 * Hash function must work well on big- and little-endian platforms
55 */
56static unsigned int __nlm_hash32(const __be32 n)
57{
58 unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
59 return hash ^ (hash >> 8);
60}
61
62static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
63{
64 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
65 return __nlm_hash32(sin->sin_addr.s_addr);
66}
67
68static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
69{
70 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
71 const struct in6_addr addr = sin6->sin6_addr;
72 return __nlm_hash32(addr.s6_addr32[0]) ^
73 __nlm_hash32(addr.s6_addr32[1]) ^
74 __nlm_hash32(addr.s6_addr32[2]) ^
75 __nlm_hash32(addr.s6_addr32[3]);
76}
77
78static unsigned int nlm_hash_address(const struct sockaddr *sap)
79{
80 unsigned int hash;
81
82 switch (sap->sa_family) {
83 case AF_INET:
84 hash = __nlm_hash_addr4(sap);
85 break;
86 case AF_INET6:
87 hash = __nlm_hash_addr6(sap);
88 break;
89 default:
90 hash = 0;
91 }
92 return hash & (NLM_HOST_NRHASH - 1);
93}
94
95static void nlm_clear_port(struct sockaddr *sap)
96{
97 switch (sap->sa_family) {
98 case AF_INET:
99 ((struct sockaddr_in *)sap)->sin_port = 0;
100 break;
101 case AF_INET6:
102 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
103 break;
104 }
105}
106
107static void nlm_display_address(const struct sockaddr *sap,
108 char *buf, const size_t len)
109{
110 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
111 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
112
113 switch (sap->sa_family) {
114 case AF_UNSPEC:
115 snprintf(buf, len, "unspecified");
116 break;
117 case AF_INET:
118 snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
119 break;
120 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, NIPQUAD_FMT,
123 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
124 else
125 snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
126 break;
127 default:
128 snprintf(buf, len, "unsupported address family");
129 break;
130 }
131}
40 132
41/* 133/*
42 * Common host lookup routine for server & client 134 * Common host lookup routine for server & client
43 */ 135 */
44static struct nlm_host *nlm_lookup_host(int server, 136static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
45 const struct sockaddr_in *sin,
46 int proto, u32 version,
47 const char *hostname,
48 unsigned int hostname_len,
49 const struct sockaddr_in *ssin)
50{ 137{
51 struct hlist_head *chain; 138 struct hlist_head *chain;
52 struct hlist_node *pos; 139 struct hlist_node *pos;
53 struct nlm_host *host; 140 struct nlm_host *host;
54 struct nsm_handle *nsm = NULL; 141 struct nsm_handle *nsm = NULL;
55 int hash;
56
57 dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
58 ", p=%d, v=%u, my role=%s, name=%.*s)\n",
59 NIPQUAD(ssin->sin_addr.s_addr),
60 NIPQUAD(sin->sin_addr.s_addr), proto, version,
61 server? "server" : "client",
62 hostname_len,
63 hostname? hostname : "<none>");
64 142
65
66 hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
67
68 /* Lock hash table */
69 mutex_lock(&nlm_host_mutex); 143 mutex_lock(&nlm_host_mutex);
70 144
71 if (time_after_eq(jiffies, next_gc)) 145 if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
78 * different NLM rpc_clients into one single nlm_host object. 152 * different NLM rpc_clients into one single nlm_host object.
79 * This would allow us to have one nlm_host per address. 153 * This would allow us to have one nlm_host per address.
80 */ 154 */
81 chain = &nlm_hosts[hash]; 155 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
82 hlist_for_each_entry(host, pos, chain, h_hash) { 156 hlist_for_each_entry(host, pos, chain, h_hash) {
83 if (!nlm_cmp_addr(&host->h_addr, sin)) 157 if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
84 continue; 158 continue;
85 159
86 /* See if we have an NSM handle for this client */ 160 /* See if we have an NSM handle for this client */
87 if (!nsm) 161 if (!nsm)
88 nsm = host->h_nsmhandle; 162 nsm = host->h_nsmhandle;
89 163
90 if (host->h_proto != proto) 164 if (host->h_proto != ni->protocol)
91 continue; 165 continue;
92 if (host->h_version != version) 166 if (host->h_version != ni->version)
93 continue; 167 continue;
94 if (host->h_server != server) 168 if (host->h_server != ni->server)
95 continue; 169 continue;
96 if (!nlm_cmp_addr(&host->h_saddr, ssin)) 170 if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
97 continue; 171 continue;
98 172
99 /* Move to head of hash chain. */ 173 /* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
101 hlist_add_head(&host->h_hash, chain); 175 hlist_add_head(&host->h_hash, chain);
102 176
103 nlm_get_host(host); 177 nlm_get_host(host);
178 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
179 host->h_name, host->h_addrbuf);
104 goto out; 180 goto out;
105 } 181 }
106 if (nsm)
107 atomic_inc(&nsm->sm_count);
108
109 host = NULL;
110 182
111 /* Sadly, the host isn't in our hash table yet. See if 183 /*
112 * we have an NSM handle for it. If not, create one. 184 * The host wasn't in our hash table. If we don't
185 * have an NSM handle for it yet, create one.
113 */ 186 */
114 if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) 187 if (nsm)
115 goto out; 188 atomic_inc(&nsm->sm_count);
189 else {
190 host = NULL;
191 nsm = nsm_find(ni->sap, ni->salen,
192 ni->hostname, ni->hostname_len, 1);
193 if (!nsm) {
194 dprintk("lockd: nlm_lookup_host failed; "
195 "no nsm handle\n");
196 goto out;
197 }
198 }
116 199
117 host = kzalloc(sizeof(*host), GFP_KERNEL); 200 host = kzalloc(sizeof(*host), GFP_KERNEL);
118 if (!host) { 201 if (!host) {
119 nsm_release(nsm); 202 nsm_release(nsm);
203 dprintk("lockd: nlm_lookup_host failed; no memory\n");
120 goto out; 204 goto out;
121 } 205 }
122 host->h_name = nsm->sm_name; 206 host->h_name = nsm->sm_name;
123 host->h_addr = *sin; 207 memcpy(nlm_addr(host), ni->sap, ni->salen);
124 host->h_addr.sin_port = 0; /* ouch! */ 208 host->h_addrlen = ni->salen;
125 host->h_saddr = *ssin; 209 nlm_clear_port(nlm_addr(host));
126 host->h_version = version; 210 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
127 host->h_proto = proto; 211 host->h_version = ni->version;
212 host->h_proto = ni->protocol;
128 host->h_rpcclnt = NULL; 213 host->h_rpcclnt = NULL;
129 mutex_init(&host->h_mutex); 214 mutex_init(&host->h_mutex);
130 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 215 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
135 host->h_state = 0; /* pseudo NSM state */ 220 host->h_state = 0; /* pseudo NSM state */
136 host->h_nsmstate = 0; /* real NSM state */ 221 host->h_nsmstate = 0; /* real NSM state */
137 host->h_nsmhandle = nsm; 222 host->h_nsmhandle = nsm;
138 host->h_server = server; 223 host->h_server = ni->server;
139 hlist_add_head(&host->h_hash, chain); 224 hlist_add_head(&host->h_hash, chain);
140 INIT_LIST_HEAD(&host->h_lockowners); 225 INIT_LIST_HEAD(&host->h_lockowners);
141 spin_lock_init(&host->h_lock); 226 spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
143 INIT_LIST_HEAD(&host->h_reclaim); 228 INIT_LIST_HEAD(&host->h_reclaim);
144 229
145 nrhosts++; 230 nrhosts++;
231
232 nlm_display_address((struct sockaddr *)&host->h_addr,
233 host->h_addrbuf, sizeof(host->h_addrbuf));
234 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
235 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
236
237 dprintk("lockd: nlm_lookup_host created host %s\n",
238 host->h_name);
239
146out: 240out:
147 mutex_unlock(&nlm_host_mutex); 241 mutex_unlock(&nlm_host_mutex);
148 return host; 242 return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
170 kfree(host); 264 kfree(host);
171} 265}
172 266
173/* 267/**
174 * Find an NLM server handle in the cache. If there is none, create it. 268 * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
269 * @sap: network address of server
270 * @salen: length of server address
271 * @protocol: transport protocol to use
272 * @version: NLM protocol version
273 * @hostname: '\0'-terminated hostname of server
274 *
275 * Returns an nlm_host structure that matches the passed-in
276 * [server address, transport protocol, NLM version, server hostname].
277 * If one doesn't already exist in the host cache, a new handle is
278 * created and returned.
175 */ 279 */
176struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, 280struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
177 int proto, u32 version, 281 const size_t salen,
178 const char *hostname, 282 const unsigned short protocol,
179 unsigned int hostname_len) 283 const u32 version, const char *hostname)
180{ 284{
181 struct sockaddr_in ssin = {0}; 285 const struct sockaddr source = {
182 286 .sa_family = AF_UNSPEC,
183 return nlm_lookup_host(0, sin, proto, version, 287 };
184 hostname, hostname_len, &ssin); 288 struct nlm_lookup_host_info ni = {
289 .server = 0,
290 .sap = sap,
291 .salen = salen,
292 .protocol = protocol,
293 .version = version,
294 .hostname = hostname,
295 .hostname_len = strlen(hostname),
296 .src_sap = &source,
297 .src_len = sizeof(source),
298 };
299
300 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
301 (hostname ? hostname : "<none>"), version,
302 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
303
304 return nlm_lookup_host(&ni);
185} 305}
186 306
187/* 307/**
188 * Find an NLM client handle in the cache. If there is none, create it. 308 * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
309 * @rqstp: incoming NLM request
310 * @hostname: name of client host
311 * @hostname_len: length of client hostname
312 *
313 * Returns an nlm_host structure that matches the [client address,
314 * transport protocol, NLM version, client hostname] of the passed-in
315 * NLM request. If one doesn't already exist in the host cache, a
316 * new handle is created and returned.
317 *
318 * Before possibly creating a new nlm_host, construct a sockaddr
319 * for a specific source address in case the local system has
320 * multiple network addresses. The family of the address in
321 * rq_daddr is guaranteed to be the same as the family of the
322 * address in rq_addr, so it's safe to use the same family for
323 * the source address.
189 */ 324 */
190struct nlm_host * 325struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
191nlmsvc_lookup_host(struct svc_rqst *rqstp, 326 const char *hostname,
192 const char *hostname, unsigned int hostname_len) 327 const size_t hostname_len)
193{ 328{
194 struct sockaddr_in ssin = {0}; 329 struct sockaddr_in sin = {
330 .sin_family = AF_INET,
331 };
332 struct sockaddr_in6 sin6 = {
333 .sin6_family = AF_INET6,
334 };
335 struct nlm_lookup_host_info ni = {
336 .server = 1,
337 .sap = svc_addr(rqstp),
338 .salen = rqstp->rq_addrlen,
339 .protocol = rqstp->rq_prot,
340 .version = rqstp->rq_vers,
341 .hostname = hostname,
342 .hostname_len = hostname_len,
343 .src_len = rqstp->rq_addrlen,
344 };
345
346 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
347 (int)hostname_len, hostname, rqstp->rq_vers,
348 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
349
350 switch (ni.sap->sa_family) {
351 case AF_INET:
352 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
353 ni.src_sap = (struct sockaddr *)&sin;
354 break;
355 case AF_INET6:
356 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
357 ni.src_sap = (struct sockaddr *)&sin6;
358 break;
359 default:
360 return NULL;
361 }
195 362
196 ssin.sin_addr = rqstp->rq_daddr.addr; 363 return nlm_lookup_host(&ni);
197 return nlm_lookup_host(1, svc_addr_in(rqstp),
198 rqstp->rq_prot, rqstp->rq_vers,
199 hostname, hostname_len, &ssin);
200} 364}
201 365
202/* 366/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
207{ 371{
208 struct rpc_clnt *clnt; 372 struct rpc_clnt *clnt;
209 373
210 dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", 374 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
211 NIPQUAD(host->h_saddr.sin_addr), 375 host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
212 NIPQUAD(host->h_addr.sin_addr));
213 376
214 /* Lock host handle */ 377 /* Lock host handle */
215 mutex_lock(&host->h_mutex); 378 mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
221 if (time_after_eq(jiffies, host->h_nextrebind)) { 384 if (time_after_eq(jiffies, host->h_nextrebind)) {
222 rpc_force_rebind(clnt); 385 rpc_force_rebind(clnt);
223 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 386 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
224 dprintk("lockd: next rebind in %ld jiffies\n", 387 dprintk("lockd: next rebind in %lu jiffies\n",
225 host->h_nextrebind - jiffies); 388 host->h_nextrebind - jiffies);
226 } 389 }
227 } else { 390 } else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
234 }; 397 };
235 struct rpc_create_args args = { 398 struct rpc_create_args args = {
236 .protocol = host->h_proto, 399 .protocol = host->h_proto,
237 .address = (struct sockaddr *)&host->h_addr, 400 .address = nlm_addr(host),
238 .addrsize = sizeof(host->h_addr), 401 .addrsize = host->h_addrlen,
239 .saddress = (struct sockaddr *)&host->h_saddr, 402 .saddress = nlm_srcaddr(host),
240 .timeout = &timeparms, 403 .timeout = &timeparms,
241 .servername = host->h_name, 404 .servername = host->h_name,
242 .program = &nlm_program, 405 .program = &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
324 struct nsm_handle *nsm; 487 struct nsm_handle *nsm;
325 struct nlm_host *host; 488 struct nlm_host *host;
326 489
327 dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", 490 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
328 hostname, NIPQUAD(sin->sin_addr)); 491 hostname, hostname_len, 0);
329 492 if (nsm == NULL) {
330 /* Find the NSM handle for this peer */ 493 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
331 if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) 494 hostname_len, hostname);
332 return; 495 return;
496 }
497
498 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
499 hostname_len, hostname, nsm->sm_addrbuf);
333 500
334 /* When reclaiming locks on this peer, make sure that 501 /* When reclaiming locks on this peer, make sure that
335 * we set up a new notification */ 502 * we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
461static LIST_HEAD(nsm_handles); 628static LIST_HEAD(nsm_handles);
462static DEFINE_SPINLOCK(nsm_lock); 629static DEFINE_SPINLOCK(nsm_lock);
463 630
464static struct nsm_handle * 631static struct nsm_handle *nsm_find(const struct sockaddr *sap,
465__nsm_find(const struct sockaddr_in *sin, 632 const size_t salen,
466 const char *hostname, unsigned int hostname_len, 633 const char *hostname,
467 int create) 634 const size_t hostname_len,
635 const int create)
468{ 636{
469 struct nsm_handle *nsm = NULL; 637 struct nsm_handle *nsm = NULL;
470 struct nsm_handle *pos; 638 struct nsm_handle *pos;
471 639
472 if (!sin) 640 if (!sap)
473 return NULL; 641 return NULL;
474 642
475 if (hostname && memchr(hostname, '/', hostname_len) != NULL) { 643 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
476 if (printk_ratelimit()) { 644 if (printk_ratelimit()) {
477 printk(KERN_WARNING "Invalid hostname \"%.*s\" " 645 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
478 "in NFS lock request\n", 646 "in NFS lock request\n",
479 hostname_len, hostname); 647 (int)hostname_len, hostname);
480 } 648 }
481 return NULL; 649 return NULL;
482 } 650 }
@@ -489,7 +657,7 @@ retry:
489 if (strlen(pos->sm_name) != hostname_len 657 if (strlen(pos->sm_name) != hostname_len
490 || memcmp(pos->sm_name, hostname, hostname_len)) 658 || memcmp(pos->sm_name, hostname, hostname_len))
491 continue; 659 continue;
492 } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) 660 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
493 continue; 661 continue;
494 atomic_inc(&pos->sm_count); 662 atomic_inc(&pos->sm_count);
495 kfree(nsm); 663 kfree(nsm);
@@ -509,10 +677,13 @@ retry:
509 if (nsm == NULL) 677 if (nsm == NULL)
510 return NULL; 678 return NULL;
511 679
512 nsm->sm_addr = *sin; 680 memcpy(nsm_addr(nsm), sap, salen);
681 nsm->sm_addrlen = salen;
513 nsm->sm_name = (char *) (nsm + 1); 682 nsm->sm_name = (char *) (nsm + 1);
514 memcpy(nsm->sm_name, hostname, hostname_len); 683 memcpy(nsm->sm_name, hostname, hostname_len);
515 nsm->sm_name[hostname_len] = '\0'; 684 nsm->sm_name[hostname_len] = '\0';
685 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
686 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
516 atomic_set(&nsm->sm_count, 1); 687 atomic_set(&nsm->sm_count, 1);
517 goto retry; 688 goto retry;
518 689
@@ -521,13 +692,6 @@ found:
521 return nsm; 692 return nsm;
522} 693}
523 694
524static struct nsm_handle *
525nsm_find(const struct sockaddr_in *sin, const char *hostname,
526 unsigned int hostname_len)
527{
528 return __nsm_find(sin, hostname, hostname_len, 1);
529}
530
531/* 695/*
532 * Release an NSM handle 696 * Release an NSM handle
533 */ 697 */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
51 51
52 memset(&args, 0, sizeof(args)); 52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name; 53 args.mon_name = nsm->sm_name;
54 args.addr = nsm->sm_addr.sin_addr.s_addr; 54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM; 55 args.prog = NLM_PROGRAM;
56 args.vers = 3; 56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY; 57 args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9df..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 51static unsigned int nlmsvc_users;
52static struct task_struct *nlmsvc_task; 52static struct task_struct *nlmsvc_task;
53static struct svc_rqst *nlmsvc_rqst; 53static struct svc_rqst *nlmsvc_rqst;
54int nlmsvc_grace_period;
55unsigned long nlmsvc_timeout; 54unsigned long nlmsvc_timeout;
56 55
57/* 56/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
85 return nlm_timeout * 5 * HZ; 84 return nlm_timeout * 5 * HZ;
86} 85}
87 86
88unsigned long get_nfs_grace_period(void) 87static struct lock_manager lockd_manager = {
89{ 88};
90 unsigned long lockdgrace = get_lockd_grace_period();
91 unsigned long nfsdgrace = 0;
92
93 if (nlmsvc_ops)
94 nfsdgrace = nlmsvc_ops->get_grace_period();
95
96 return max(lockdgrace, nfsdgrace);
97}
98EXPORT_SYMBOL(get_nfs_grace_period);
99 89
100static unsigned long set_grace_period(void) 90static void grace_ender(struct work_struct *not_used)
101{ 91{
102 nlmsvc_grace_period = 1; 92 locks_end_grace(&lockd_manager);
103 return get_nfs_grace_period() + jiffies;
104} 93}
105 94
106static inline void clear_grace_period(void) 95static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
96
97static void set_grace_period(void)
107{ 98{
108 nlmsvc_grace_period = 0; 99 unsigned long grace_period = get_lockd_grace_period();
100
101 locks_start_grace(&lockd_manager);
102 cancel_delayed_work_sync(&grace_period_end);
103 schedule_delayed_work(&grace_period_end, grace_period);
109} 104}
110 105
111/* 106/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
116{ 111{
117 int err = 0, preverr = 0; 112 int err = 0, preverr = 0;
118 struct svc_rqst *rqstp = vrqstp; 113 struct svc_rqst *rqstp = vrqstp;
119 unsigned long grace_period_expire;
120 114
121 /* try_to_freeze() is called from svc_recv() */ 115 /* try_to_freeze() is called from svc_recv() */
122 set_freezable(); 116 set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
139 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
140 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
141 135
142 grace_period_expire = set_grace_period(); 136 set_grace_period();
143 137
144 /* 138 /*
145 * The main request loop. We don't terminate until the last 139 * The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
153 flush_signals(current); 147 flush_signals(current);
154 if (nlmsvc_ops) { 148 if (nlmsvc_ops) {
155 nlmsvc_invalidate_all(); 149 nlmsvc_invalidate_all();
156 grace_period_expire = set_grace_period(); 150 set_grace_period();
157 } 151 }
158 continue; 152 continue;
159 } 153 }
160 154
161 /* 155 timeout = nlmsvc_retry_blocked();
162 * Retry any blocked locks that have been notified by
163 * the VFS. Don't do this during grace period.
164 * (Theoretically, there shouldn't even be blocked locks
165 * during grace period).
166 */
167 if (!nlmsvc_grace_period) {
168 timeout = nlmsvc_retry_blocked();
169 } else if (time_before(grace_period_expire, jiffies))
170 clear_grace_period();
171 156
172 /* 157 /*
173 * Find a socket with data available and call its 158 * Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
195 svc_process(rqstp); 180 svc_process(rqstp);
196 } 181 }
197 flush_signals(current); 182 flush_signals(current);
183 cancel_delayed_work_sync(&grace_period_end);
198 if (nlmsvc_ops) 184 if (nlmsvc_ops)
199 nlmsvc_invalidate_all(); 185 nlmsvc_invalidate_all();
200 nlm_shutdown_hosts(); 186 nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
203} 189}
204 190
205/* 191/*
206 * Make any sockets that are needed but not present. 192 * Ensure there are active UDP and TCP listeners for lockd.
207 * If nlm_udpport or nlm_tcpport were set as module 193 *
208 * options, make those sockets unconditionally 194 * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
195 * local services (such as rpc.statd) still require UDP, and
196 * some NFS servers do not yet support NLM over TCP.
197 *
198 * Returns zero if all listeners are available; otherwise a
199 * negative errno value is returned.
209 */ 200 */
210static int make_socks(struct svc_serv *serv, int proto) 201static int make_socks(struct svc_serv *serv)
211{ 202{
212 static int warned; 203 static int warned;
213 struct svc_xprt *xprt; 204 struct svc_xprt *xprt;
214 int err = 0; 205 int err = 0;
215 206
216 if (proto == IPPROTO_UDP || nlm_udpport) { 207 xprt = svc_find_xprt(serv, "udp", 0, 0);
217 xprt = svc_find_xprt(serv, "udp", 0, 0); 208 if (!xprt)
218 if (!xprt) 209 err = svc_create_xprt(serv, "udp", nlm_udpport,
219 err = svc_create_xprt(serv, "udp", nlm_udpport, 210 SVC_SOCK_DEFAULTS);
220 SVC_SOCK_DEFAULTS); 211 else
221 else 212 svc_xprt_put(xprt);
222 svc_xprt_put(xprt); 213 if (err >= 0) {
223 }
224 if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
225 xprt = svc_find_xprt(serv, "tcp", 0, 0); 214 xprt = svc_find_xprt(serv, "tcp", 0, 0);
226 if (!xprt) 215 if (!xprt)
227 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 216 err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
241/* 230/*
242 * Bring up the lockd process if it's not already up. 231 * Bring up the lockd process if it's not already up.
243 */ 232 */
244int 233int lockd_up(void)
245lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
246{ 234{
247 struct svc_serv *serv; 235 struct svc_serv *serv;
248 int error = 0; 236 int error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
251 /* 239 /*
252 * Check whether we're already up and running. 240 * Check whether we're already up and running.
253 */ 241 */
254 if (nlmsvc_rqst) { 242 if (nlmsvc_rqst)
255 if (proto)
256 error = make_socks(nlmsvc_rqst->rq_server, proto);
257 goto out; 243 goto out;
258 }
259 244
260 /* 245 /*
261 * Sanity check: if there's no pid, 246 * Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
266 "lockd_up: no pid, %d users??\n", nlmsvc_users); 251 "lockd_up: no pid, %d users??\n", nlmsvc_users);
267 252
268 error = -ENOMEM; 253 error = -ENOMEM;
269 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); 254 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
270 if (!serv) { 255 if (!serv) {
271 printk(KERN_WARNING "lockd_up: create service failed\n"); 256 printk(KERN_WARNING "lockd_up: create service failed\n");
272 goto out; 257 goto out;
273 } 258 }
274 259
275 if ((error = make_socks(serv, proto)) < 0) 260 error = make_socks(serv);
261 if (error < 0)
276 goto destroy_and_out; 262 goto destroy_and_out;
277 263
278 /* 264 /*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a714f64515b..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
88 dprintk("lockd: TEST4 called\n"); 88 dprintk("lockd: TEST4 called\n");
89 resp->cookie = argp->cookie; 89 resp->cookie = argp->cookie;
90 90
91 /* Don't accept test requests during grace period */
92 if (nlmsvc_grace_period) {
93 resp->status = nlm_lck_denied_grace_period;
94 return rc;
95 }
96
97 /* Obtain client and file */ 91 /* Obtain client and file */
98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 92 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
99 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 93 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
122 116
123 resp->cookie = argp->cookie; 117 resp->cookie = argp->cookie;
124 118
125 /* Don't accept new lock requests during grace period */
126 if (nlmsvc_grace_period && !argp->reclaim) {
127 resp->status = nlm_lck_denied_grace_period;
128 return rc;
129 }
130
131 /* Obtain client and file */ 119 /* Obtain client and file */
132 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 120 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
133 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 121 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
146 134
147 /* Now try to lock the file */ 135 /* Now try to lock the file */
148 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, 136 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
149 argp->block, &argp->cookie); 137 argp->block, &argp->cookie,
138 argp->reclaim);
150 if (resp->status == nlm_drop_reply) 139 if (resp->status == nlm_drop_reply)
151 rc = rpc_drop_reply; 140 rc = rpc_drop_reply;
152 else 141 else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
169 resp->cookie = argp->cookie; 158 resp->cookie = argp->cookie;
170 159
171 /* Don't accept requests during grace period */ 160 /* Don't accept requests during grace period */
172 if (nlmsvc_grace_period) { 161 if (locks_in_grace()) {
173 resp->status = nlm_lck_denied_grace_period; 162 resp->status = nlm_lck_denied_grace_period;
174 return rpc_success; 163 return rpc_success;
175 } 164 }
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
202 resp->cookie = argp->cookie; 191 resp->cookie = argp->cookie;
203 192
204 /* Don't accept new lock requests during grace period */ 193 /* Don't accept new lock requests during grace period */
205 if (nlmsvc_grace_period) { 194 if (locks_in_grace()) {
206 resp->status = nlm_lck_denied_grace_period; 195 resp->status = nlm_lck_denied_grace_period;
207 return rpc_success; 196 return rpc_success;
208 } 197 }
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
231 resp->cookie = argp->cookie; 220 resp->cookie = argp->cookie;
232 221
233 dprintk("lockd: GRANTED called\n"); 222 dprintk("lockd: GRANTED called\n");
234 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 223 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
235 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 224 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
236 return rpc_success; 225 return rpc_success;
237} 226}
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
341 resp->cookie = argp->cookie; 330 resp->cookie = argp->cookie;
342 331
343 /* Don't accept new lock requests during grace period */ 332 /* Don't accept new lock requests during grace period */
344 if (nlmsvc_grace_period && !argp->reclaim) { 333 if (locks_in_grace() && !argp->reclaim) {
345 resp->status = nlm_lck_denied_grace_period; 334 resp->status = nlm_lck_denied_grace_period;
346 return rpc_success; 335 return rpc_success;
347 } 336 }
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
374 resp->cookie = argp->cookie; 363 resp->cookie = argp->cookie;
375 364
376 /* Don't accept requests during grace period */ 365 /* Don't accept requests during grace period */
377 if (nlmsvc_grace_period) { 366 if (locks_in_grace()) {
378 resp->status = nlm_lck_denied_grace_period; 367 resp->status = nlm_lck_denied_grace_period;
379 return rpc_success; 368 return rpc_success;
380 } 369 }
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
432{ 421{
433 struct sockaddr_in saddr; 422 struct sockaddr_in saddr;
434 423
435 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
436
437 dprintk("lockd: SM_NOTIFY called\n"); 424 dprintk("lockd: SM_NOTIFY called\n");
438 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 425
439 || ntohs(saddr.sin_port) >= 1024) { 426 if (!nlm_privileged_requester(rqstp)) {
440 char buf[RPC_MAX_ADDRBUFLEN]; 427 char buf[RPC_MAX_ADDRBUFLEN];
441 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 428 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
442 svc_print_addr(rqstp, buf, sizeof(buf))); 429 svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318d..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
360__be32 360__be32
361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, 361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
362 struct nlm_host *host, struct nlm_lock *lock, int wait, 362 struct nlm_host *host, struct nlm_lock *lock, int wait,
363 struct nlm_cookie *cookie) 363 struct nlm_cookie *cookie, int reclaim)
364{ 364{
365 struct nlm_block *block = NULL; 365 struct nlm_block *block = NULL;
366 int error; 366 int error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
406 goto out; 406 goto out;
407 } 407 }
408 408
409 if (locks_in_grace() && !reclaim) {
410 ret = nlm_lck_denied_grace_period;
411 goto out;
412 }
413 if (reclaim && !locks_in_grace()) {
414 ret = nlm_lck_denied_grace_period;
415 goto out;
416 }
417
409 if (!wait) 418 if (!wait)
410 lock->fl.fl_flags &= ~FL_SLEEP; 419 lock->fl.fl_flags &= ~FL_SLEEP;
411 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); 420 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
502 goto out; 511 goto out;
503 } 512 }
504 513
514 if (locks_in_grace()) {
515 ret = nlm_lck_denied_grace_period;
516 goto out;
517 }
505 error = vfs_test_lock(file->f_file, &lock->fl); 518 error = vfs_test_lock(file->f_file, &lock->fl);
506 if (error == FILE_LOCK_DEFERRED) { 519 if (error == FILE_LOCK_DEFERRED) {
507 ret = nlmsvc_defer_lock_rqst(rqstp, block); 520 ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
582 (long long)lock->fl.fl_start, 595 (long long)lock->fl.fl_start,
583 (long long)lock->fl.fl_end); 596 (long long)lock->fl.fl_end);
584 597
598 if (locks_in_grace())
599 return nlm_lck_denied_grace_period;
600
585 mutex_lock(&file->f_mutex); 601 mutex_lock(&file->f_mutex);
586 block = nlmsvc_lookup_block(file, lock); 602 block = nlmsvc_lookup_block(file, lock);
587 mutex_unlock(&file->f_mutex); 603 mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76262c1986f2..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
117 dprintk("lockd: TEST called\n"); 117 dprintk("lockd: TEST called\n");
118 resp->cookie = argp->cookie; 118 resp->cookie = argp->cookie;
119 119
120 /* Don't accept test requests during grace period */
121 if (nlmsvc_grace_period) {
122 resp->status = nlm_lck_denied_grace_period;
123 return rc;
124 }
125
126 /* Obtain client and file */ 120 /* Obtain client and file */
127 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 121 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
128 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 122 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
152 146
153 resp->cookie = argp->cookie; 147 resp->cookie = argp->cookie;
154 148
155 /* Don't accept new lock requests during grace period */
156 if (nlmsvc_grace_period && !argp->reclaim) {
157 resp->status = nlm_lck_denied_grace_period;
158 return rc;
159 }
160
161 /* Obtain client and file */ 149 /* Obtain client and file */
162 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 150 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
163 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 151 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
176 164
177 /* Now try to lock the file */ 165 /* Now try to lock the file */
178 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, 166 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
179 argp->block, &argp->cookie)); 167 argp->block, &argp->cookie,
168 argp->reclaim));
180 if (resp->status == nlm_drop_reply) 169 if (resp->status == nlm_drop_reply)
181 rc = rpc_drop_reply; 170 rc = rpc_drop_reply;
182 else 171 else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
199 resp->cookie = argp->cookie; 188 resp->cookie = argp->cookie;
200 189
201 /* Don't accept requests during grace period */ 190 /* Don't accept requests during grace period */
202 if (nlmsvc_grace_period) { 191 if (locks_in_grace()) {
203 resp->status = nlm_lck_denied_grace_period; 192 resp->status = nlm_lck_denied_grace_period;
204 return rpc_success; 193 return rpc_success;
205 } 194 }
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
232 resp->cookie = argp->cookie; 221 resp->cookie = argp->cookie;
233 222
234 /* Don't accept new lock requests during grace period */ 223 /* Don't accept new lock requests during grace period */
235 if (nlmsvc_grace_period) { 224 if (locks_in_grace()) {
236 resp->status = nlm_lck_denied_grace_period; 225 resp->status = nlm_lck_denied_grace_period;
237 return rpc_success; 226 return rpc_success;
238 } 227 }
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
261 resp->cookie = argp->cookie; 250 resp->cookie = argp->cookie;
262 251
263 dprintk("lockd: GRANTED called\n"); 252 dprintk("lockd: GRANTED called\n");
264 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 253 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
265 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 254 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
266 return rpc_success; 255 return rpc_success;
267} 256}
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
373 resp->cookie = argp->cookie; 362 resp->cookie = argp->cookie;
374 363
375 /* Don't accept new lock requests during grace period */ 364 /* Don't accept new lock requests during grace period */
376 if (nlmsvc_grace_period && !argp->reclaim) { 365 if (locks_in_grace() && !argp->reclaim) {
377 resp->status = nlm_lck_denied_grace_period; 366 resp->status = nlm_lck_denied_grace_period;
378 return rpc_success; 367 return rpc_success;
379 } 368 }
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
406 resp->cookie = argp->cookie; 395 resp->cookie = argp->cookie;
407 396
408 /* Don't accept requests during grace period */ 397 /* Don't accept requests during grace period */
409 if (nlmsvc_grace_period) { 398 if (locks_in_grace()) {
410 resp->status = nlm_lck_denied_grace_period; 399 resp->status = nlm_lck_denied_grace_period;
411 return rpc_success; 400 return rpc_success;
412 } 401 }
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
464{ 453{
465 struct sockaddr_in saddr; 454 struct sockaddr_in saddr;
466 455
467 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
468
469 dprintk("lockd: SM_NOTIFY called\n"); 456 dprintk("lockd: SM_NOTIFY called\n");
470 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 457
471 || ntohs(saddr.sin_port) >= 1024) { 458 if (!nlm_privileged_requester(rqstp)) {
472 char buf[RPC_MAX_ADDRBUFLEN]; 459 char buf[RPC_MAX_ADDRBUFLEN];
473 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 460 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
474 svc_print_addr(rqstp, buf, sizeof(buf))); 461 svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b373..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
418static int 418static int
419nlmsvc_match_ip(void *datap, struct nlm_host *host) 419nlmsvc_match_ip(void *datap, struct nlm_host *host)
420{ 420{
421 return nlm_cmp_addr(&host->h_saddr, datap); 421 return nlm_cmp_addr(nlm_srcaddr(host), datap);
422} 422}
423 423
424/** 424/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
351 argp->state = ntohl(*p++); 351 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 352 /* Preserve the address in network byte order */
353 argp->addr = *p++; 353 argp->addr = *p++;
354 argp->vers = *p++;
355 argp->proto = *p++;
356 return xdr_argsize_check(rqstp, p); 354 return xdr_argsize_check(rqstp, p);
357} 355}
358 356
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
358 argp->state = ntohl(*p++); 358 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 359 /* Preserve the address in network byte order */
360 argp->addr = *p++; 360 argp->addr = *p++;
361 argp->vers = *p++;
362 argp->proto = *p++;
363 return xdr_argsize_check(rqstp, p); 361 return xdr_argsize_check(rqstp, p);
364} 362}
365 363
diff --git a/fs/mpage.c b/fs/mpage.c
index dbcc7af76a15..552b80b3facc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -6,7 +6,7 @@
6 * Contains functions related to preparing and submitting BIOs which contain 6 * Contains functions related to preparing and submitting BIOs which contain
7 * multiple pagecache pages. 7 * multiple pagecache pages.
8 * 8 *
9 * 15May2002 akpm@zip.com.au 9 * 15May2002 Andrew Morton
10 * Initial version 10 * Initial version
11 * 27Jun2002 axboe@suse.de 11 * 27Jun2002 axboe@suse.de
12 * use bio_add_page() to build bio's just the right size 12 * use bio_add_page() to build bio's just the right size
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..6a09760c5960 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
105 mutex_lock(&nfs_callback_mutex); 105 mutex_lock(&nfs_callback_mutex);
106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
107 goto out; 107 goto out;
108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); 108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
109 AF_INET, NULL);
109 ret = -ENOMEM; 110 ret = -ENOMEM;
110 if (!serv) 111 if (!serv)
111 goto out_err; 112 goto out_err;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5ee23e7058b3..7547600b6174 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
675 server->nfs_client = clp; 675 server->nfs_client = clp;
676 676
677 /* Initialise the client representation from the mount data */ 677 /* Initialise the client representation from the mount data */
678 server->flags = data->flags & NFS_MOUNT_FLAGMASK; 678 server->flags = data->flags;
679 679
680 if (data->rsize) 680 if (data->rsize)
681 server->rsize = nfs_block_size(data->rsize, NULL); 681 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
850 INIT_LIST_HEAD(&server->client_link); 850 INIT_LIST_HEAD(&server->client_link);
851 INIT_LIST_HEAD(&server->master_link); 851 INIT_LIST_HEAD(&server->master_link);
852 852
853 init_waitqueue_head(&server->active_wq);
854 atomic_set(&server->active, 0); 853 atomic_set(&server->active, 0);
855 854
856 server->io_stats = nfs_alloc_iostats(); 855 server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
1073 goto error; 1072 goto error;
1074 1073
1075 /* Initialise the client representation from the mount data */ 1074 /* Initialise the client representation from the mount data */
1076 server->flags = data->flags & NFS_MOUNT_FLAGMASK; 1075 server->flags = data->flags;
1077 server->caps |= NFS_CAP_ATOMIC_OPEN; 1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1078 1077
1079 if (data->rsize) 1078 if (data->rsize)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 74f92b717f78..2ab70d46ecbc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -156,6 +156,7 @@ typedef struct {
156 decode_dirent_t decode; 156 decode_dirent_t decode;
157 int plus; 157 int plus;
158 unsigned long timestamp; 158 unsigned long timestamp;
159 unsigned long gencount;
159 int timestamp_valid; 160 int timestamp_valid;
160} nfs_readdir_descriptor_t; 161} nfs_readdir_descriptor_t;
161 162
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
177 struct file *file = desc->file; 178 struct file *file = desc->file;
178 struct inode *inode = file->f_path.dentry->d_inode; 179 struct inode *inode = file->f_path.dentry->d_inode;
179 struct rpc_cred *cred = nfs_file_cred(file); 180 struct rpc_cred *cred = nfs_file_cred(file);
180 unsigned long timestamp; 181 unsigned long timestamp, gencount;
181 int error; 182 int error;
182 183
183 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", 184 dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
186 187
187 again: 188 again:
188 timestamp = jiffies; 189 timestamp = jiffies;
190 gencount = nfs_inc_attr_generation_counter();
189 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, 191 error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
190 NFS_SERVER(inode)->dtsize, desc->plus); 192 NFS_SERVER(inode)->dtsize, desc->plus);
191 if (error < 0) { 193 if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
199 goto error; 201 goto error;
200 } 202 }
201 desc->timestamp = timestamp; 203 desc->timestamp = timestamp;
204 desc->gencount = gencount;
202 desc->timestamp_valid = 1; 205 desc->timestamp_valid = 1;
203 SetPageUptodate(page); 206 SetPageUptodate(page);
204 /* Ensure consistent page alignment of the data. 207 /* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
224 if (IS_ERR(p)) 227 if (IS_ERR(p))
225 return PTR_ERR(p); 228 return PTR_ERR(p);
226 desc->ptr = p; 229 desc->ptr = p;
227 if (desc->timestamp_valid) 230 if (desc->timestamp_valid) {
228 desc->entry->fattr->time_start = desc->timestamp; 231 desc->entry->fattr->time_start = desc->timestamp;
229 else 232 desc->entry->fattr->gencount = desc->gencount;
233 } else
230 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; 234 desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
231 return 0; 235 return 0;
232} 236}
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
471 struct rpc_cred *cred = nfs_file_cred(file); 475 struct rpc_cred *cred = nfs_file_cred(file);
472 struct page *page = NULL; 476 struct page *page = NULL;
473 int status; 477 int status;
474 unsigned long timestamp; 478 unsigned long timestamp, gencount;
475 479
476 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", 480 dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
477 (unsigned long long)*desc->dir_cookie); 481 (unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
482 goto out; 486 goto out;
483 } 487 }
484 timestamp = jiffies; 488 timestamp = jiffies;
489 gencount = nfs_inc_attr_generation_counter();
485 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, 490 status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
486 *desc->dir_cookie, page, 491 *desc->dir_cookie, page,
487 NFS_SERVER(inode)->dtsize, 492 NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
490 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 495 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
491 if (status >= 0) { 496 if (status >= 0) {
492 desc->timestamp = timestamp; 497 desc->timestamp = timestamp;
498 desc->gencount = gencount;
493 desc->timestamp_valid = 1; 499 desc->timestamp_valid = 1;
494 if ((status = dir_decode(desc)) == 0) 500 if ((status = dir_decode(desc)) == 0)
495 desc->entry->prev_cookie = *desc->dir_cookie; 501 desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
655 */ 661 */
656void nfs_force_lookup_revalidate(struct inode *dir) 662void nfs_force_lookup_revalidate(struct inode *dir)
657{ 663{
658 NFS_I(dir)->cache_change_attribute = jiffies; 664 NFS_I(dir)->cache_change_attribute++;
659} 665}
660 666
661/* 667/*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
667{ 673{
668 if (IS_ROOT(dentry)) 674 if (IS_ROOT(dentry))
669 return 1; 675 return 1;
676 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
677 return 0;
670 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 678 if (!nfs_verify_change_attribute(dir, dentry->d_time))
671 return 0; 679 return 0;
672 /* Revalidate nfsi->cache_change_attribute before we declare a match */ 680 /* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
750 /* Don't revalidate a negative dentry if we're creating a new file */ 758 /* Don't revalidate a negative dentry if we're creating a new file */
751 if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) 759 if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
752 return 0; 760 return 0;
761 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
762 return 1;
753 return !nfs_check_verifier(dir, dentry); 763 return !nfs_check_verifier(dir, dentry);
754} 764}
755 765
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 78460657f5cb..d319b49f8f06 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
188 /* origin == SEEK_END => we must revalidate the cached file length */ 188 /* origin == SEEK_END => we must revalidate the cached file length */
189 if (origin == SEEK_END) { 189 if (origin == SEEK_END) {
190 struct inode *inode = filp->f_mapping->host; 190 struct inode *inode = filp->f_mapping->host;
191
191 int retval = nfs_revalidate_file_size(inode, filp); 192 int retval = nfs_revalidate_file_size(inode, filp);
192 if (retval < 0) 193 if (retval < 0)
193 return (loff_t)retval; 194 return (loff_t)retval;
194 } 195
195 lock_kernel(); /* BKL needed? */ 196 spin_lock(&inode->i_lock);
196 loff = generic_file_llseek_unlocked(filp, offset, origin); 197 loff = generic_file_llseek_unlocked(filp, offset, origin);
197 unlock_kernel(); 198 spin_unlock(&inode->i_lock);
199 } else
200 loff = generic_file_llseek_unlocked(filp, offset, origin);
198 return loff; 201 return loff;
199} 202}
200 203
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
699 filp->f_path.dentry->d_name.name, 702 filp->f_path.dentry->d_name.name,
700 fl->fl_type, fl->fl_flags); 703 fl->fl_type, fl->fl_flags);
701 704
702 /*
703 * No BSD flocks over NFS allowed.
704 * Note: we could try to fake a POSIX lock request here by
705 * using ((u32) filp | 0x80000000) or some such as the pid.
706 * Not sure whether that would be unique, though, or whether
707 * that would break in other places.
708 */
709 if (!(fl->fl_flags & FL_FLOCK)) 705 if (!(fl->fl_flags & FL_FLOCK))
710 return -ENOLCK; 706 return -ENOLCK;
711 707
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 52daefa2f521..b9195c02a863 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
305 init_special_inode(inode, inode->i_mode, fattr->rdev); 305 init_special_inode(inode, inode->i_mode, fattr->rdev);
306 306
307 nfsi->read_cache_jiffies = fattr->time_start; 307 nfsi->read_cache_jiffies = fattr->time_start;
308 nfsi->last_updated = now; 308 nfsi->attr_gencount = fattr->gencount;
309 nfsi->cache_change_attribute = now;
310 inode->i_atime = fattr->atime; 309 inode->i_atime = fattr->atime;
311 inode->i_mtime = fattr->mtime; 310 inode->i_mtime = fattr->mtime;
312 inode->i_ctime = fattr->ctime; 311 inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
453void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) 452void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
454{ 453{
455 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { 454 if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
455 spin_lock(&inode->i_lock);
456 if ((attr->ia_valid & ATTR_MODE) != 0) { 456 if ((attr->ia_valid & ATTR_MODE) != 0) {
457 int mode = attr->ia_mode & S_IALLUGO; 457 int mode = attr->ia_mode & S_IALLUGO;
458 mode |= inode->i_mode & ~S_IALLUGO; 458 mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
462 inode->i_uid = attr->ia_uid; 462 inode->i_uid = attr->ia_uid;
463 if ((attr->ia_valid & ATTR_GID) != 0) 463 if ((attr->ia_valid & ATTR_GID) != 0)
464 inode->i_gid = attr->ia_gid; 464 inode->i_gid = attr->ia_gid;
465 spin_lock(&inode->i_lock);
466 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 465 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
467 spin_unlock(&inode->i_lock); 466 spin_unlock(&inode->i_lock);
468 } 467 }
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
472 } 471 }
473} 472}
474 473
475static int nfs_wait_schedule(void *word)
476{
477 if (signal_pending(current))
478 return -ERESTARTSYS;
479 schedule();
480 return 0;
481}
482
483/*
484 * Wait for the inode to get unlocked.
485 */
486static int nfs_wait_on_inode(struct inode *inode)
487{
488 struct nfs_inode *nfsi = NFS_I(inode);
489 int error;
490
491 error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
492 nfs_wait_schedule, TASK_KILLABLE);
493
494 return error;
495}
496
497static void nfs_wake_up_inode(struct inode *inode)
498{
499 struct nfs_inode *nfsi = NFS_I(inode);
500
501 clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
502 smp_mb__after_clear_bit();
503 wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
504}
505
506int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 474int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
507{ 475{
508 struct inode *inode = dentry->d_inode; 476 struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
697 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", 665 dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
698 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 666 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
699 667
700 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
701 if (is_bad_inode(inode)) 668 if (is_bad_inode(inode))
702 goto out_nowait; 669 goto out;
703 if (NFS_STALE(inode)) 670 if (NFS_STALE(inode))
704 goto out_nowait;
705
706 status = nfs_wait_on_inode(inode);
707 if (status < 0)
708 goto out; 671 goto out;
709 672
710 status = -ESTALE;
711 if (NFS_STALE(inode)) 673 if (NFS_STALE(inode))
712 goto out; 674 goto out;
713 675
676 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
714 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 677 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
715 if (status != 0) { 678 if (status != 0) {
716 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 679 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
724 goto out; 687 goto out;
725 } 688 }
726 689
727 spin_lock(&inode->i_lock); 690 status = nfs_refresh_inode(inode, &fattr);
728 status = nfs_update_inode(inode, &fattr);
729 if (status) { 691 if (status) {
730 spin_unlock(&inode->i_lock);
731 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 692 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
732 inode->i_sb->s_id, 693 inode->i_sb->s_id,
733 (long long)NFS_FILEID(inode), status); 694 (long long)NFS_FILEID(inode), status);
734 goto out; 695 goto out;
735 } 696 }
736 spin_unlock(&inode->i_lock);
737 697
738 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 698 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
739 nfs_zap_acl_cache(inode); 699 nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
743 (long long)NFS_FILEID(inode)); 703 (long long)NFS_FILEID(inode));
744 704
745 out: 705 out:
746 nfs_wake_up_inode(inode);
747
748 out_nowait:
749 return status; 706 return status;
750} 707}
751 708
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
908 return -EIO; 865 return -EIO;
909 } 866 }
910 867
911 /* Do atomic weak cache consistency updates */
912 nfs_wcc_update_inode(inode, fattr);
913
914 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 868 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
915 nfsi->change_attr != fattr->change_attr) 869 nfsi->change_attr != fattr->change_attr)
916 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 870 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
939 893
940 if (invalid != 0) 894 if (invalid != 0)
941 nfsi->cache_validity |= invalid; 895 nfsi->cache_validity |= invalid;
942 else
943 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
944 | NFS_INO_INVALID_ATIME
945 | NFS_INO_REVAL_PAGECACHE);
946 896
947 nfsi->read_cache_jiffies = fattr->time_start; 897 nfsi->read_cache_jiffies = fattr->time_start;
948 return 0; 898 return 0;
949} 899}
950 900
901static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
902{
903 return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
904}
905
906static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
907{
908 return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
909}
910
911static unsigned long nfs_attr_generation_counter;
912
913static unsigned long nfs_read_attr_generation_counter(void)
914{
915 smp_rmb();
916 return nfs_attr_generation_counter;
917}
918
919unsigned long nfs_inc_attr_generation_counter(void)
920{
921 unsigned long ret;
922 smp_rmb();
923 ret = ++nfs_attr_generation_counter;
924 smp_wmb();
925 return ret;
926}
927
928void nfs_fattr_init(struct nfs_fattr *fattr)
929{
930 fattr->valid = 0;
931 fattr->time_start = jiffies;
932 fattr->gencount = nfs_inc_attr_generation_counter();
933}
934
935/**
936 * nfs_inode_attrs_need_update - check if the inode attributes need updating
937 * @inode - pointer to inode
938 * @fattr - attributes
939 *
940 * Attempt to divine whether or not an RPC call reply carrying stale
941 * attributes got scheduled after another call carrying updated ones.
942 *
943 * To do so, the function first assumes that a more recent ctime means
944 * that the attributes in fattr are newer, however it also attempt to
945 * catch the case where ctime either didn't change, or went backwards
946 * (if someone reset the clock on the server) by looking at whether
947 * or not this RPC call was started after the inode was last updated.
948 * Note also the check for wraparound of 'attr_gencount'
949 *
950 * The function returns 'true' if it thinks the attributes in 'fattr' are
951 * more recent than the ones cached in the inode.
952 *
953 */
954static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
955{
956 const struct nfs_inode *nfsi = NFS_I(inode);
957
958 return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
959 nfs_ctime_need_update(inode, fattr) ||
960 nfs_size_need_update(inode, fattr) ||
961 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
962}
963
964static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
965{
966 if (nfs_inode_attrs_need_update(inode, fattr))
967 return nfs_update_inode(inode, fattr);
968 return nfs_check_inode_attributes(inode, fattr);
969}
970
951/** 971/**
952 * nfs_refresh_inode - try to update the inode attribute cache 972 * nfs_refresh_inode - try to update the inode attribute cache
953 * @inode - pointer to inode 973 * @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
960 */ 980 */
961int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) 981int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
962{ 982{
963 struct nfs_inode *nfsi = NFS_I(inode);
964 int status; 983 int status;
965 984
966 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 985 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
967 return 0; 986 return 0;
968 spin_lock(&inode->i_lock); 987 spin_lock(&inode->i_lock);
969 if (time_after(fattr->time_start, nfsi->last_updated)) 988 status = nfs_refresh_inode_locked(inode, fattr);
970 status = nfs_update_inode(inode, fattr);
971 else
972 status = nfs_check_inode_attributes(inode, fattr);
973
974 spin_unlock(&inode->i_lock); 989 spin_unlock(&inode->i_lock);
975 return status; 990 return status;
976} 991}
977 992
993static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
994{
995 struct nfs_inode *nfsi = NFS_I(inode);
996
997 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
998 if (S_ISDIR(inode->i_mode))
999 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1000 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1001 return 0;
1002 return nfs_refresh_inode_locked(inode, fattr);
1003}
1004
978/** 1005/**
979 * nfs_post_op_update_inode - try to update the inode attribute cache 1006 * nfs_post_op_update_inode - try to update the inode attribute cache
980 * @inode - pointer to inode 1007 * @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
991 */ 1018 */
992int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) 1019int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
993{ 1020{
994 struct nfs_inode *nfsi = NFS_I(inode); 1021 int status;
995 1022
996 spin_lock(&inode->i_lock); 1023 spin_lock(&inode->i_lock);
997 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1024 status = nfs_post_op_update_inode_locked(inode, fattr);
998 if (S_ISDIR(inode->i_mode))
999 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1000 spin_unlock(&inode->i_lock); 1025 spin_unlock(&inode->i_lock);
1001 return nfs_refresh_inode(inode, fattr); 1026 return status;
1002} 1027}
1003 1028
1004/** 1029/**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1014 */ 1039 */
1015int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) 1040int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
1016{ 1041{
1042 int status;
1043
1044 spin_lock(&inode->i_lock);
1045 /* Don't do a WCC update if these attributes are already stale */
1046 if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
1047 !nfs_inode_attrs_need_update(inode, fattr)) {
1048 fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
1049 goto out_noforce;
1050 }
1017 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1051 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
1018 (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 1052 (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
1019 fattr->pre_change_attr = NFS_I(inode)->change_attr; 1053 fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
1026 fattr->pre_size = i_size_read(inode); 1060 fattr->pre_size = i_size_read(inode);
1027 fattr->valid |= NFS_ATTR_WCC; 1061 fattr->valid |= NFS_ATTR_WCC;
1028 } 1062 }
1029 return nfs_post_op_update_inode(inode, fattr); 1063out_noforce:
1064 status = nfs_post_op_update_inode_locked(inode, fattr);
1065 spin_unlock(&inode->i_lock);
1066 return status;
1030} 1067}
1031 1068
1032/* 1069/*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1092 } 1129 }
1093 /* If ctime has changed we should definitely clear access+acl caches */ 1130 /* If ctime has changed we should definitely clear access+acl caches */
1094 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 1131 if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
1095 invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1132 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1096 } else if (nfsi->change_attr != fattr->change_attr) { 1133 } else if (nfsi->change_attr != fattr->change_attr) {
1097 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1134 dprintk("NFS: change_attr change on server for file %s/%ld\n",
1098 inode->i_sb->s_id, inode->i_ino); 1135 inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1126 inode->i_gid != fattr->gid) 1163 inode->i_gid != fattr->gid)
1127 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1164 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1128 1165
1166 if (inode->i_nlink != fattr->nlink)
1167 invalid |= NFS_INO_INVALID_ATTR;
1168
1129 inode->i_mode = fattr->mode; 1169 inode->i_mode = fattr->mode;
1130 inode->i_nlink = fattr->nlink; 1170 inode->i_nlink = fattr->nlink;
1131 inode->i_uid = fattr->uid; 1171 inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1145 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1185 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1146 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1186 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1147 nfsi->attrtimeo_timestamp = now; 1187 nfsi->attrtimeo_timestamp = now;
1148 nfsi->last_updated = now; 1188 nfsi->attr_gencount = nfs_inc_attr_generation_counter();
1149 } else { 1189 } else {
1150 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { 1190 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
1151 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1191 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
1152 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1192 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
1153 nfsi->attrtimeo_timestamp = now; 1193 nfsi->attrtimeo_timestamp = now;
1154 } 1194 }
1155 /*
1156 * Avoid jiffy wraparound issues with nfsi->last_updated
1157 */
1158 if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
1159 nfsi->last_updated = nfsi->read_cache_jiffies;
1160 } 1195 }
1161 invalid &= ~NFS_INO_INVALID_ATTR; 1196 invalid &= ~NFS_INO_INVALID_ATTR;
1162 /* Don't invalidate the data if we were to blame */ 1197 /* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 24241fcbb98d..d212ee41caf2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
153void nfs_zap_acl_cache(struct inode *inode); 153void nfs_zap_acl_cache(struct inode *inode);
154 154
155/* super.c */ 155/* super.c */
156void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
156extern struct file_system_type nfs_xdev_fs_type; 157extern struct file_system_type nfs_xdev_fs_type;
157#ifdef CONFIG_NFS_V4 158#ifdef CONFIG_NFS_V4
158extern struct file_system_type nfs4_xdev_fs_type; 159extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
163 164
164extern int __init register_nfs_fs(void); 165extern int __init register_nfs_fs(void);
165extern void __exit unregister_nfs_fs(void); 166extern void __exit unregister_nfs_fs(void);
166extern void nfs_sb_active(struct nfs_server *server); 167extern void nfs_sb_active(struct super_block *sb);
167extern void nfs_sb_deactive(struct nfs_server *server); 168extern void nfs_sb_deactive(struct super_block *sb);
168 169
169/* namespace.c */ 170/* namespace.c */
170extern char *nfs_path(const char *base, 171extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
276 PAGE_SIZE - 1) >> PAGE_SHIFT; 277 PAGE_SIZE - 1) >> PAGE_SHIFT;
277} 278}
278 279
280#define IPV6_SCOPE_DELIMITER '%'
281
282/*
283 * Set the port number in an address. Be agnostic about the address
284 * family.
285 */
286static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
287{
288 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
289 struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
290
291 switch (sap->sa_family) {
292 case AF_INET:
293 ap->sin_port = htons(port);
294 break;
295 case AF_INET6:
296 ap6->sin6_port = htons(port);
297 break;
298 }
299}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 779d2eb649c5..086a6830d785 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/sched.h> 15#include <linux/sunrpc/sched.h>
16#include <linux/nfs_fs.h> 16#include <linux/nfs_fs.h>
17#include "internal.h"
17 18
18#ifdef RPC_DEBUG 19#ifdef RPC_DEBUG
19# define NFSDBG_FACILITY NFSDBG_MOUNT 20# define NFSDBG_FACILITY NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
98 99
99out_mnt_err: 100out_mnt_err:
100 dprintk("NFS: MNT server returned result %d\n", result.status); 101 dprintk("NFS: MNT server returned result %d\n", result.status);
101 status = -EACCES; 102 status = nfs_stat_to_errno(result.status);
102 goto out; 103 goto out;
103} 104}
104 105
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 66df08dd1caf..64a288ee046d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
105 105
106 dprintk("--> nfs_follow_mountpoint()\n"); 106 dprintk("--> nfs_follow_mountpoint()\n");
107 107
108 BUG_ON(IS_ROOT(dentry)); 108 err = -ESTALE;
109 if (IS_ROOT(dentry))
110 goto out_err;
111
109 dprintk("%s: enter\n", __func__); 112 dprintk("%s: enter\n", __func__);
110 dput(nd->path.dentry); 113 dput(nd->path.dentry);
111 nd->path.dentry = dget(dentry); 114 nd->path.dentry = dget(dentry);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
189 struct nfs_clone_mount *mountdata) 192 struct nfs_clone_mount *mountdata)
190{ 193{
191#ifdef CONFIG_NFS_V4 194#ifdef CONFIG_NFS_V4
192 struct vfsmount *mnt = NULL; 195 struct vfsmount *mnt = ERR_PTR(-EINVAL);
193 switch (server->nfs_client->rpc_ops->version) { 196 switch (server->nfs_client->rpc_ops->version) {
194 case 2: 197 case 2:
195 case 3: 198 case 3:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 423842f51ac9..cef62557c87d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
229 229
230 dprintk("NFS call getacl\n"); 230 dprintk("NFS call getacl\n");
231 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; 231 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
232 nfs_fattr_init(&fattr);
232 status = rpc_call_sync(server->client_acl, &msg, 0); 233 status = rpc_call_sync(server->client_acl, &msg, 0);
233 dprintk("NFS reply getacl: %d\n", status); 234 dprintk("NFS reply getacl: %d\n", status);
234 235
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
322 323
323 dprintk("NFS call setacl\n"); 324 dprintk("NFS call setacl\n");
324 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 325 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
326 nfs_fattr_init(&fattr);
325 status = rpc_call_sync(server->client_acl, &msg, 0); 327 status = rpc_call_sync(server->client_acl, &msg, 0);
326 nfs_access_zap_cache(inode); 328 nfs_access_zap_cache(inode);
327 nfs_zap_acl_cache(inode); 329 nfs_zap_acl_cache(inode);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1e750e4574a9..c55be7a7679e 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
699} 699}
700 700
701static int 701static int
702nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, 702do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
703 struct nfs_fsinfo *info) 703 struct nfs_fsinfo *info)
704{ 704{
705 struct rpc_message msg = { 705 struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
711 711
712 dprintk("NFS call fsinfo\n"); 712 dprintk("NFS call fsinfo\n");
713 nfs_fattr_init(info->fattr); 713 nfs_fattr_init(info->fattr);
714 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 714 status = rpc_call_sync(client, &msg, 0);
715 dprintk("NFS reply fsinfo: %d\n", status); 715 dprintk("NFS reply fsinfo: %d\n", status);
716 return status; 716 return status;
717} 717}
718 718
719/*
720 * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
721 * nfs_create_server
722 */
723static int
724nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
725 struct nfs_fsinfo *info)
726{
727 int status;
728
729 status = do_proc_fsinfo(server->client, fhandle, info);
730 if (status && server->nfs_client->cl_rpcclient != server->client)
731 status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
732 return status;
733}
734
719static int 735static int
720nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 736nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
721 struct nfs_pathconf *info) 737 struct nfs_pathconf *info)
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b112857301f7..30befc39b3c6 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
93 return 0; 93 return 0;
94} 94}
95 95
96/* 96static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
97 * Check if the string represents a "valid" IPv4 address 97 char *page, char *page2,
98 */ 98 const struct nfs4_fs_location *location)
99static inline int valid_ipaddr4(const char *buf)
100{ 99{
101 int rc, count, in[4]; 100 struct vfsmount *mnt = ERR_PTR(-ENOENT);
102 101 char *mnt_path;
103 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); 102 int page2len;
104 if (rc != 4) 103 unsigned int s;
105 return -EINVAL; 104
106 for (count = 0; count < 4; count++) { 105 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
107 if (in[count] > 255) 106 if (IS_ERR(mnt_path))
108 return -EINVAL; 107 return mnt;
108 mountdata->mnt_path = mnt_path;
109 page2 += strlen(mnt_path) + 1;
110 page2len = PAGE_SIZE - strlen(mnt_path) - 1;
111
112 for (s = 0; s < location->nservers; s++) {
113 const struct nfs4_string *buf = &location->servers[s];
114 struct sockaddr_storage addr;
115
116 if (buf->len <= 0 || buf->len >= PAGE_SIZE)
117 continue;
118
119 mountdata->addr = (struct sockaddr *)&addr;
120
121 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
122 continue;
123 nfs_parse_ip_address(buf->data, buf->len,
124 mountdata->addr, &mountdata->addrlen);
125 if (mountdata->addr->sa_family == AF_UNSPEC)
126 continue;
127 nfs_set_port(mountdata->addr, NFS_PORT);
128
129 strncpy(page2, buf->data, page2len);
130 page2[page2len] = '\0';
131 mountdata->hostname = page2;
132
133 snprintf(page, PAGE_SIZE, "%s:%s",
134 mountdata->hostname,
135 mountdata->mnt_path);
136
137 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
138 if (!IS_ERR(mnt))
139 break;
109 } 140 }
110 return 0; 141 return mnt;
111} 142}
112 143
113/** 144/**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
128 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 159 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
129 }; 160 };
130 char *page = NULL, *page2 = NULL; 161 char *page = NULL, *page2 = NULL;
131 unsigned int s;
132 int loc, error; 162 int loc, error;
133 163
134 if (locations == NULL || locations->nlocations <= 0) 164 if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
152 goto out; 182 goto out;
153 } 183 }
154 184
155 loc = 0; 185 for (loc = 0; loc < locations->nlocations; loc++) {
156 while (loc < locations->nlocations && IS_ERR(mnt)) {
157 const struct nfs4_fs_location *location = &locations->locations[loc]; 186 const struct nfs4_fs_location *location = &locations->locations[loc];
158 char *mnt_path;
159 187
160 if (location == NULL || location->nservers <= 0 || 188 if (location == NULL || location->nservers <= 0 ||
161 location->rootpath.ncomponents == 0) { 189 location->rootpath.ncomponents == 0)
162 loc++;
163 continue; 190 continue;
164 }
165 191
166 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 192 mnt = try_location(&mountdata, page, page2, location);
167 if (IS_ERR(mnt_path)) { 193 if (!IS_ERR(mnt))
168 loc++; 194 break;
169 continue;
170 }
171 mountdata.mnt_path = mnt_path;
172
173 s = 0;
174 while (s < location->nservers) {
175 struct sockaddr_in addr = {
176 .sin_family = AF_INET,
177 .sin_port = htons(NFS_PORT),
178 };
179
180 if (location->servers[s].len <= 0 ||
181 valid_ipaddr4(location->servers[s].data) < 0) {
182 s++;
183 continue;
184 }
185
186 mountdata.hostname = location->servers[s].data;
187 addr.sin_addr.s_addr = in_aton(mountdata.hostname),
188 mountdata.addr = (struct sockaddr *)&addr;
189 mountdata.addrlen = sizeof(addr);
190
191 snprintf(page, PAGE_SIZE, "%s:%s",
192 mountdata.hostname,
193 mountdata.mnt_path);
194
195 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
196 if (!IS_ERR(mnt)) {
197 break;
198 }
199 s++;
200 }
201 loc++;
202 } 195 }
203 196
204out: 197out:
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 46763d1cd397..8478fc25daee 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -127,7 +127,7 @@ enum {
127 Opt_err 127 Opt_err
128}; 128};
129 129
130static match_table_t __initdata tokens = { 130static match_table_t __initconst tokens = {
131 {Opt_port, "port=%u"}, 131 {Opt_port, "port=%u"},
132 {Opt_rsize, "rsize=%u"}, 132 {Opt_rsize, "rsize=%u"},
133 {Opt_wsize, "wsize=%u"}, 133 {Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4dbb84df1b68..193465210d7c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
65 65
66 dprintk("%s: call getattr\n", __func__); 66 dprintk("%s: call getattr\n", __func__);
67 nfs_fattr_init(fattr); 67 nfs_fattr_init(fattr);
68 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 68 status = rpc_call_sync(server->client, &msg, 0);
69 /* Retry with default authentication if different */
70 if (status && server->nfs_client->cl_rpcclient != server->client)
71 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
69 dprintk("%s: reply getattr: %d\n", __func__, status); 72 dprintk("%s: reply getattr: %d\n", __func__, status);
70 if (status) 73 if (status)
71 return status; 74 return status;
72 dprintk("%s: call statfs\n", __func__); 75 dprintk("%s: call statfs\n", __func__);
73 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; 76 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
74 msg.rpc_resp = &fsinfo; 77 msg.rpc_resp = &fsinfo;
75 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); 78 status = rpc_call_sync(server->client, &msg, 0);
79 /* Retry with default authentication if different */
80 if (status && server->nfs_client->cl_rpcclient != server->client)
81 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
76 dprintk("%s: reply statfs: %d\n", __func__, status); 82 dprintk("%s: reply statfs: %d\n", __func__, status);
77 if (status) 83 if (status)
78 return status; 84 return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f7..8b28b95c9e44 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -91,6 +91,7 @@ enum {
91 /* Mount options that take string arguments */ 91 /* Mount options that take string arguments */
92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, 92 Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
93 Opt_addr, Opt_mountaddr, Opt_clientaddr, 93 Opt_addr, Opt_mountaddr, Opt_clientaddr,
94 Opt_lookupcache,
94 95
95 /* Special mount options */ 96 /* Special mount options */
96 Opt_userspace, Opt_deprecated, Opt_sloppy, 97 Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -98,7 +99,7 @@ enum {
98 Opt_err 99 Opt_err
99}; 100};
100 101
101static match_table_t nfs_mount_option_tokens = { 102static const match_table_t nfs_mount_option_tokens = {
102 { Opt_userspace, "bg" }, 103 { Opt_userspace, "bg" },
103 { Opt_userspace, "fg" }, 104 { Opt_userspace, "fg" },
104 { Opt_userspace, "retry=%s" }, 105 { Opt_userspace, "retry=%s" },
@@ -154,6 +155,8 @@ static match_table_t nfs_mount_option_tokens = {
154 { Opt_mounthost, "mounthost=%s" }, 155 { Opt_mounthost, "mounthost=%s" },
155 { Opt_mountaddr, "mountaddr=%s" }, 156 { Opt_mountaddr, "mountaddr=%s" },
156 157
158 { Opt_lookupcache, "lookupcache=%s" },
159
157 { Opt_err, NULL } 160 { Opt_err, NULL }
158}; 161};
159 162
@@ -163,7 +166,7 @@ enum {
163 Opt_xprt_err 166 Opt_xprt_err
164}; 167};
165 168
166static match_table_t nfs_xprt_protocol_tokens = { 169static const match_table_t nfs_xprt_protocol_tokens = {
167 { Opt_xprt_udp, "udp" }, 170 { Opt_xprt_udp, "udp" },
168 { Opt_xprt_tcp, "tcp" }, 171 { Opt_xprt_tcp, "tcp" },
169 { Opt_xprt_rdma, "rdma" }, 172 { Opt_xprt_rdma, "rdma" },
@@ -180,7 +183,7 @@ enum {
180 Opt_sec_err 183 Opt_sec_err
181}; 184};
182 185
183static match_table_t nfs_secflavor_tokens = { 186static const match_table_t nfs_secflavor_tokens = {
184 { Opt_sec_none, "none" }, 187 { Opt_sec_none, "none" },
185 { Opt_sec_none, "null" }, 188 { Opt_sec_none, "null" },
186 { Opt_sec_sys, "sys" }, 189 { Opt_sec_sys, "sys" },
@@ -200,6 +203,22 @@ static match_table_t nfs_secflavor_tokens = {
200 { Opt_sec_err, NULL } 203 { Opt_sec_err, NULL }
201}; 204};
202 205
206enum {
207 Opt_lookupcache_all, Opt_lookupcache_positive,
208 Opt_lookupcache_none,
209
210 Opt_lookupcache_err
211};
212
213static match_table_t nfs_lookupcache_tokens = {
214 { Opt_lookupcache_all, "all" },
215 { Opt_lookupcache_positive, "pos" },
216 { Opt_lookupcache_positive, "positive" },
217 { Opt_lookupcache_none, "none" },
218
219 { Opt_lookupcache_err, NULL }
220};
221
203 222
204static void nfs_umount_begin(struct super_block *); 223static void nfs_umount_begin(struct super_block *);
205static int nfs_statfs(struct dentry *, struct kstatfs *); 224static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
209static int nfs_xdev_get_sb(struct file_system_type *fs_type, 228static int nfs_xdev_get_sb(struct file_system_type *fs_type,
210 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 229 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
211static void nfs_kill_super(struct super_block *); 230static void nfs_kill_super(struct super_block *);
212static void nfs_put_super(struct super_block *);
213static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); 231static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
214 232
215static struct file_system_type nfs_fs_type = { 233static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
232 .alloc_inode = nfs_alloc_inode, 250 .alloc_inode = nfs_alloc_inode,
233 .destroy_inode = nfs_destroy_inode, 251 .destroy_inode = nfs_destroy_inode,
234 .write_inode = nfs_write_inode, 252 .write_inode = nfs_write_inode,
235 .put_super = nfs_put_super,
236 .statfs = nfs_statfs, 253 .statfs = nfs_statfs,
237 .clear_inode = nfs_clear_inode, 254 .clear_inode = nfs_clear_inode,
238 .umount_begin = nfs_umount_begin, 255 .umount_begin = nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
337 unregister_filesystem(&nfs_fs_type); 354 unregister_filesystem(&nfs_fs_type);
338} 355}
339 356
340void nfs_sb_active(struct nfs_server *server) 357void nfs_sb_active(struct super_block *sb)
341{ 358{
342 atomic_inc(&server->active); 359 struct nfs_server *server = NFS_SB(sb);
343}
344 360
345void nfs_sb_deactive(struct nfs_server *server) 361 if (atomic_inc_return(&server->active) == 1)
346{ 362 atomic_inc(&sb->s_active);
347 if (atomic_dec_and_test(&server->active))
348 wake_up(&server->active_wq);
349} 363}
350 364
351static void nfs_put_super(struct super_block *sb) 365void nfs_sb_deactive(struct super_block *sb)
352{ 366{
353 struct nfs_server *server = NFS_SB(sb); 367 struct nfs_server *server = NFS_SB(sb);
354 /* 368
355 * Make sure there are no outstanding ops to this server. 369 if (atomic_dec_and_test(&server->active))
356 * If so, wait for them to finish before allowing the 370 deactivate_super(sb);
357 * unmount to continue.
358 */
359 wait_event(server->active_wq, atomic_read(&server->active) == 0);
360} 371}
361 372
362/* 373/*
@@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb)
664} 675}
665 676
666/* 677/*
667 * Set the port number in an address. Be agnostic about the address family.
668 */
669static void nfs_set_port(struct sockaddr *sap, unsigned short port)
670{
671 switch (sap->sa_family) {
672 case AF_INET: {
673 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
674 ap->sin_port = htons(port);
675 break;
676 }
677 case AF_INET6: {
678 struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
679 ap->sin6_port = htons(port);
680 break;
681 }
682 }
683}
684
685/*
686 * Sanity-check a server address provided by the mount command. 678 * Sanity-check a server address provided by the mount command.
687 * 679 *
688 * Address family must be initialized, and address must not be 680 * Address family must be initialized, and address must not be
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
724 *addr_len = 0; 716 *addr_len = 0;
725} 717}
726 718
727#define IPV6_SCOPE_DELIMITER '%'
728
729#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 719#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
730static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, 720static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
731 const char *delim, 721 const char *delim,
732 struct sockaddr_in6 *sin6) 722 struct sockaddr_in6 *sin6)
733{ 723{
734 char *p; 724 char *p;
735 size_t len; 725 size_t len;
736 726
737 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) 727 if ((string + str_len) == delim)
738 return ; 728 return 1;
729
739 if (*delim != IPV6_SCOPE_DELIMITER) 730 if (*delim != IPV6_SCOPE_DELIMITER)
740 return; 731 return 0;
732
733 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
734 return 0;
741 735
742 len = (string + str_len) - delim - 1; 736 len = (string + str_len) - delim - 1;
743 p = kstrndup(delim + 1, len, GFP_KERNEL); 737 p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
750 scope_id = dev->ifindex; 744 scope_id = dev->ifindex;
751 dev_put(dev); 745 dev_put(dev);
752 } else { 746 } else {
753 /* scope_id is set to zero on error */ 747 if (strict_strtoul(p, 10, &scope_id) == 0) {
754 strict_strtoul(p, 10, &scope_id); 748 kfree(p);
749 return 0;
750 }
755 } 751 }
756 752
757 kfree(p); 753 kfree(p);
754
758 sin6->sin6_scope_id = scope_id; 755 sin6->sin6_scope_id = scope_id;
759 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); 756 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
757 return 1;
760 } 758 }
759
760 return 0;
761} 761}
762 762
763static void nfs_parse_ipv6_address(char *string, size_t str_len, 763static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
773 773
774 sin6->sin6_family = AF_INET6; 774 sin6->sin6_family = AF_INET6;
775 *addr_len = sizeof(*sin6); 775 *addr_len = sizeof(*sin6);
776 if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { 776 if (in6_pton(string, str_len, addr,
777 nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); 777 IPV6_SCOPE_DELIMITER, &delim) != 0) {
778 return; 778 if (nfs_parse_ipv6_scope_id(string, str_len,
779 delim, sin6) != 0)
780 return;
779 } 781 }
780 } 782 }
781 783
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
798 * If there is a problem constructing the new sockaddr, set the address 800 * If there is a problem constructing the new sockaddr, set the address
799 * family to AF_UNSPEC. 801 * family to AF_UNSPEC.
800 */ 802 */
801static void nfs_parse_ip_address(char *string, size_t str_len, 803void nfs_parse_ip_address(char *string, size_t str_len,
802 struct sockaddr *sap, size_t *addr_len) 804 struct sockaddr *sap, size_t *addr_len)
803{ 805{
804 unsigned int i, colons; 806 unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
1258 &mnt->mount_server.addrlen); 1260 &mnt->mount_server.addrlen);
1259 kfree(string); 1261 kfree(string);
1260 break; 1262 break;
1263 case Opt_lookupcache:
1264 string = match_strdup(args);
1265 if (string == NULL)
1266 goto out_nomem;
1267 token = match_token(string,
1268 nfs_lookupcache_tokens, args);
1269 kfree(string);
1270 switch (token) {
1271 case Opt_lookupcache_all:
1272 mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
1273 break;
1274 case Opt_lookupcache_positive:
1275 mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
1276 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
1277 break;
1278 case Opt_lookupcache_none:
1279 mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
1280 break;
1281 default:
1282 errors++;
1283 dfprintk(MOUNT, "NFS: invalid "
1284 "lookupcache argument\n");
1285 };
1286 break;
1261 1287
1262 /* 1288 /*
1263 * Special options 1289 * Special options
@@ -1279,6 +1305,12 @@ static int nfs_parse_mount_options(char *raw,
1279 } 1305 }
1280 } 1306 }
1281 1307
1308 if (errors > 0) {
1309 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
1310 errors, (errors == 1 ? "" : "s"));
1311 if (!sloppy)
1312 return 0;
1313 }
1282 return 1; 1314 return 1;
1283 1315
1284out_nomem: 1316out_nomem:
@@ -1552,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
1552 * Translate to nfs_parsed_mount_data, which nfs_fill_super 1584 * Translate to nfs_parsed_mount_data, which nfs_fill_super
1553 * can deal with. 1585 * can deal with.
1554 */ 1586 */
1555 args->flags = data->flags; 1587 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1556 args->rsize = data->rsize; 1588 args->rsize = data->rsize;
1557 args->wsize = data->wsize; 1589 args->wsize = data->wsize;
1558 args->timeo = data->timeo; 1590 args->timeo = data->timeo;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index f089e5839d7d..ecc295347775 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata)
99 99
100 nfs_dec_sillycount(data->dir); 100 nfs_dec_sillycount(data->dir);
101 nfs_free_unlinkdata(data); 101 nfs_free_unlinkdata(data);
102 nfs_sb_deactive(NFS_SB(sb)); 102 nfs_sb_deactive(sb);
103} 103}
104 104
105static const struct rpc_call_ops nfs_unlink_ops = { 105static const struct rpc_call_ops nfs_unlink_ops = {
@@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
118 .rpc_message = &msg, 118 .rpc_message = &msg,
119 .callback_ops = &nfs_unlink_ops, 119 .callback_ops = &nfs_unlink_ops,
120 .callback_data = data, 120 .callback_data = data,
121 .workqueue = nfsiod_workqueue,
121 .flags = RPC_TASK_ASYNC, 122 .flags = RPC_TASK_ASYNC,
122 }; 123 };
123 struct rpc_task *task; 124 struct rpc_task *task;
@@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
149 nfs_dec_sillycount(dir); 150 nfs_dec_sillycount(dir);
150 return 0; 151 return 0;
151 } 152 }
152 nfs_sb_active(NFS_SERVER(dir)); 153 nfs_sb_active(dir->i_sb);
153 data->args.fh = NFS_FH(dir); 154 data->args.fh = NFS_FH(dir);
154 nfs_fattr_init(&data->res.dir_attr); 155 nfs_fattr_init(&data->res.dir_attr);
155 156
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3229e217c773..9f9845859fc1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1427 .bdi = mapping->backing_dev_info, 1427 .bdi = mapping->backing_dev_info,
1428 .sync_mode = WB_SYNC_NONE, 1428 .sync_mode = WB_SYNC_NONE,
1429 .nr_to_write = LONG_MAX, 1429 .nr_to_write = LONG_MAX,
1430 .range_start = 0,
1431 .range_end = LLONG_MAX,
1430 .for_writepages = 1, 1432 .for_writepages = 1,
1431 .range_cyclic = 1,
1432 }; 1433 };
1433 int ret; 1434 int ret;
1434 1435
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77c..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
70static struct nlmsvc_binding nfsd_nlm_ops = { 70static struct nlmsvc_binding nfsd_nlm_ops = {
71 .fopen = nlm_fopen, /* open file for locking */ 71 .fopen = nlm_fopen, /* open file for locking */
72 .fclose = nlm_fclose, /* close file */ 72 .fclose = nlm_fclose, /* close file */
73 .get_grace_period = get_nfs4_grace_period,
74}; 73};
75 74
76void 75void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cfc..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
63 SVCFH_fmt(&argp->fh)); 63 SVCFH_fmt(&argp->fh));
64 64
65 fh_copy(&resp->fh, &argp->fh); 65 fh_copy(&resp->fh, &argp->fh);
66 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 66 nfserr = fh_verify(rqstp, &resp->fh, 0,
67 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
67 if (nfserr) 68 if (nfserr)
68 RETURN_STATUS(nfserr); 69 RETURN_STATUS(nfserr);
69 70
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
530 dprintk("nfsd: FSSTAT(3) %s\n", 531 dprintk("nfsd: FSSTAT(3) %s\n",
531 SVCFH_fmt(&argp->fh)); 532 SVCFH_fmt(&argp->fh));
532 533
533 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 534 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
534 fh_put(&argp->fh); 535 fh_put(&argp->fh);
535 RETURN_STATUS(nfserr); 536 RETURN_STATUS(nfserr);
536} 537}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
558 resp->f_maxfilesize = ~(u32) 0; 559 resp->f_maxfilesize = ~(u32) 0;
559 resp->f_properties = NFS3_FSF_DEFAULT; 560 resp->f_properties = NFS3_FSF_DEFAULT;
560 561
561 nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); 562 nfserr = fh_verify(rqstp, &argp->fh, 0,
563 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
562 564
563 /* Check special features of the file system. May request 565 /* Check special features of the file system. May request
564 * different read/write sizes for file systems known to have 566 * different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab8..54b8b4140c8f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
443 * enough space for either: 443 * enough space for either:
444 */ 444 */
445 alloc = sizeof(struct posix_ace_state_array) 445 alloc = sizeof(struct posix_ace_state_array)
446 + cnt*sizeof(struct posix_ace_state); 446 + cnt*sizeof(struct posix_user_ace_state);
447 state->users = kzalloc(alloc, GFP_KERNEL); 447 state->users = kzalloc(alloc, GFP_KERNEL);
448 if (!state->users) 448 if (!state->users)
449 return -ENOMEM; 449 return -ENOMEM;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
225 225
226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
227 WRITE32(OP_CB_RECALL); 227 WRITE32(OP_CB_RECALL);
228 WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); 228 WRITE32(cb_rec->cbr_stateid.si_generation);
229 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
229 WRITE32(cb_rec->cbr_trunc); 230 WRITE32(cb_rec->cbr_trunc);
230 WRITE32(len); 231 WRITE32(len);
231 WRITEMEM(cb_rec->cbr_fhval, len); 232 WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
379 .addrsize = sizeof(addr), 380 .addrsize = sizeof(addr),
380 .timeout = &timeparms, 381 .timeout = &timeparms,
381 .program = &cb_program, 382 .program = &cb_program,
383 .prognumber = cb->cb_prog,
382 .version = nfs_cb_version[1]->number, 384 .version = nfs_cb_version[1]->number,
383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
396 addr.sin_port = htons(cb->cb_port); 398 addr.sin_port = htons(cb->cb_port);
397 addr.sin_addr.s_addr = htonl(cb->cb_addr); 399 addr.sin_addr.s_addr = htonl(cb->cb_addr);
398 400
399 /* Initialize rpc_stat */
400 memset(args.program->stats, 0, sizeof(struct rpc_stat));
401
402 /* Create RPC client */ 401 /* Create RPC client */
403 client = rpc_create(&args); 402 client = rpc_create(&args);
404 if (IS_ERR(client)) { 403 if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2e51adac65de..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
201 /* Openowner is now set, so sequence id will get bumped. Now we need 201 /* Openowner is now set, so sequence id will get bumped. Now we need
202 * these checks before we do any creates: */ 202 * these checks before we do any creates: */
203 status = nfserr_grace; 203 status = nfserr_grace;
204 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 204 if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
205 goto out; 205 goto out;
206 status = nfserr_no_grace; 206 status = nfserr_no_grace;
207 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 207 if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
208 goto out; 208 goto out;
209 209
210 switch (open->op_claim_type) { 210 switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
575{ 575{
576 __be32 status; 576 __be32 status;
577 577
578 if (nfs4_in_grace()) 578 if (locks_in_grace())
579 return nfserr_grace; 579 return nfserr_grace;
580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0, 580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
581 remove->rm_name, remove->rm_namelen); 581 remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
596 596
597 if (!cstate->save_fh.fh_dentry) 597 if (!cstate->save_fh.fh_dentry)
598 return status; 598 return status;
599 if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags 599 if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
600 & NFSEXP_NOSUBTREECHECK)) 600 & NFSEXP_NOSUBTREECHECK))
601 return nfserr_grace; 601 return nfserr_grace;
602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, 602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
867 int slack_bytes; 867 int slack_bytes;
868 __be32 status; 868 __be32 status;
869 869
870 status = nfserr_resource;
871 cstate = cstate_alloc();
872 if (cstate == NULL)
873 goto out;
874
875 resp->xbuf = &rqstp->rq_res; 870 resp->xbuf = &rqstp->rq_res;
876 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
877 resp->tagp = resp->p; 872 resp->tagp = resp->p;
@@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
890 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
891 goto out; 886 goto out;
892 887
888 status = nfserr_resource;
889 cstate = cstate_alloc();
890 if (cstate == NULL)
891 goto out;
892
893 status = nfs_ok; 893 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 894 while (!status && resp->opcnt < args->opcnt) {
895 op = &args->ops[resp->opcnt++]; 895 op = &args->ops[resp->opcnt++];
@@ -957,9 +957,9 @@ encode_op:
957 nfsd4_increment_op_stats(op->opnum); 957 nfsd4_increment_op_stats(op->opnum);
958 } 958 }
959 959
960 cstate_free(cstate);
960out: 961out:
961 nfsd4_release_compoundargs(args); 962 nfsd4_release_compoundargs(args);
962 cstate_free(cstate);
963 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 963 dprintk("nfsv4 compound returned %d\n", ntohl(status));
964 return status; 964 return status;
965} 965}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667e..0cc7ff5d5ab5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
61static time_t lease_time = 90; /* default lease time */ 61static time_t lease_time = 90; /* default lease time */
62static time_t user_lease_time = 90; 62static time_t user_lease_time = 90;
63static time_t boot_time; 63static time_t boot_time;
64static int in_grace = 1;
65static u32 current_ownerid = 1; 64static u32 current_ownerid = 1;
66static u32 current_fileid = 1; 65static u32 current_fileid = 1;
67static u32 current_delegid = 1; 66static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
1640 case NFS4_OPEN_CLAIM_NULL: 1639 case NFS4_OPEN_CLAIM_NULL:
1641 /* Let's not give out any delegations till everyone's 1640 /* Let's not give out any delegations till everyone's
1642 * had the chance to reclaim theirs.... */ 1641 * had the chance to reclaim theirs.... */
1643 if (nfs4_in_grace()) 1642 if (locks_in_grace())
1644 goto out; 1643 goto out;
1645 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) 1644 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
1646 goto out; 1645 goto out;
@@ -1816,12 +1815,15 @@ out:
1816 return status; 1815 return status;
1817} 1816}
1818 1817
1818struct lock_manager nfsd4_manager = {
1819};
1820
1819static void 1821static void
1820end_grace(void) 1822nfsd4_end_grace(void)
1821{ 1823{
1822 dprintk("NFSD: end of grace period\n"); 1824 dprintk("NFSD: end of grace period\n");
1823 nfsd4_recdir_purge_old(); 1825 nfsd4_recdir_purge_old();
1824 in_grace = 0; 1826 locks_end_grace(&nfsd4_manager);
1825} 1827}
1826 1828
1827static time_t 1829static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
1838 nfs4_lock_state(); 1840 nfs4_lock_state();
1839 1841
1840 dprintk("NFSD: laundromat service - starting\n"); 1842 dprintk("NFSD: laundromat service - starting\n");
1841 if (in_grace) 1843 if (locks_in_grace())
1842 end_grace(); 1844 nfsd4_end_grace();
1843 list_for_each_safe(pos, next, &client_lru) { 1845 list_for_each_safe(pos, next, &client_lru) {
1844 clp = list_entry(pos, struct nfs4_client, cl_lru); 1846 clp = list_entry(pos, struct nfs4_client, cl_lru);
1845 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 1847 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1974 return nfserr_bad_stateid; 1976 return nfserr_bad_stateid;
1975 else if (ONE_STATEID(stateid) && (flags & RD_STATE)) 1977 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1976 return nfs_ok; 1978 return nfs_ok;
1977 else if (nfs4_in_grace()) { 1979 else if (locks_in_grace()) {
1978 /* Answer in remaining cases depends on existance of 1980 /* Answer in remaining cases depends on existance of
1979 * conflicting state; so we must wait out the grace period. */ 1981 * conflicting state; so we must wait out the grace period. */
1980 return nfserr_grace; 1982 return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1993static inline int 1995static inline int
1994io_during_grace_disallowed(struct inode *inode, int flags) 1996io_during_grace_disallowed(struct inode *inode, int flags)
1995{ 1997{
1996 return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) 1998 return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
1997 && mandatory_lock(inode); 1999 && mandatory_lock(inode);
1998} 2000}
1999 2001
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2693 filp = lock_stp->st_vfs_file; 2695 filp = lock_stp->st_vfs_file;
2694 2696
2695 status = nfserr_grace; 2697 status = nfserr_grace;
2696 if (nfs4_in_grace() && !lock->lk_reclaim) 2698 if (locks_in_grace() && !lock->lk_reclaim)
2697 goto out; 2699 goto out;
2698 status = nfserr_no_grace; 2700 status = nfserr_no_grace;
2699 if (!nfs4_in_grace() && lock->lk_reclaim) 2701 if (!locks_in_grace() && lock->lk_reclaim)
2700 goto out; 2702 goto out;
2701 2703
2702 locks_init_lock(&file_lock); 2704 locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2779 int error; 2781 int error;
2780 __be32 status; 2782 __be32 status;
2781 2783
2782 if (nfs4_in_grace()) 2784 if (locks_in_grace())
2783 return nfserr_grace; 2785 return nfserr_grace;
2784 2786
2785 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 2787 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
3192 unsigned long grace_time; 3194 unsigned long grace_time;
3193 3195
3194 boot_time = get_seconds(); 3196 boot_time = get_seconds();
3195 grace_time = get_nfs_grace_period(); 3197 grace_time = get_nfs4_grace_period();
3196 lease_time = user_lease_time; 3198 lease_time = user_lease_time;
3197 in_grace = 1; 3199 locks_start_grace(&nfsd4_manager);
3198 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 3200 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
3199 grace_time/HZ); 3201 grace_time/HZ);
3200 laundry_wq = create_singlethread_workqueue("nfsd4"); 3202 laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
3213 return; 3215 return;
3214} 3216}
3215 3217
3216int
3217nfs4_in_grace(void)
3218{
3219 return in_grace;
3220}
3221
3222time_t 3218time_t
3223nfs4_lease_time(void) 3219nfs4_lease_time(void)
3224{ 3220{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b2859..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
413} 413}
414 414
415static __be32 415static __be32
416nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
417{
418 DECODE_HEAD;
419
420 READ_BUF(sizeof(stateid_t));
421 READ32(sid->si_generation);
422 COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
423
424 DECODE_TAIL;
425}
426
427static __be32
416nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) 428nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
417{ 429{
418 DECODE_HEAD; 430 DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
429 DECODE_HEAD; 441 DECODE_HEAD;
430 442
431 close->cl_stateowner = NULL; 443 close->cl_stateowner = NULL;
432 READ_BUF(4 + sizeof(stateid_t)); 444 READ_BUF(4);
433 READ32(close->cl_seqid); 445 READ32(close->cl_seqid);
434 READ32(close->cl_stateid.si_generation); 446 return nfsd4_decode_stateid(argp, &close->cl_stateid);
435 COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
436 447
437 DECODE_TAIL; 448 DECODE_TAIL;
438} 449}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493static inline __be32 504static inline __be32
494nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) 505nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
495{ 506{
496 DECODE_HEAD; 507 return nfsd4_decode_stateid(argp, &dr->dr_stateid);
497
498 READ_BUF(sizeof(stateid_t));
499 READ32(dr->dr_stateid.si_generation);
500 COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
501
502 DECODE_TAIL;
503} 508}
504 509
505static inline __be32 510static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
542 READ32(lock->lk_is_new); 547 READ32(lock->lk_is_new);
543 548
544 if (lock->lk_is_new) { 549 if (lock->lk_is_new) {
545 READ_BUF(36); 550 READ_BUF(4);
546 READ32(lock->lk_new_open_seqid); 551 READ32(lock->lk_new_open_seqid);
547 READ32(lock->lk_new_open_stateid.si_generation); 552 status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
548 553 if (status)
549 COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); 554 return status;
555 READ_BUF(8 + sizeof(clientid_t));
550 READ32(lock->lk_new_lock_seqid); 556 READ32(lock->lk_new_lock_seqid);
551 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); 557 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
552 READ32(lock->lk_new_owner.len); 558 READ32(lock->lk_new_owner.len);
553 READ_BUF(lock->lk_new_owner.len); 559 READ_BUF(lock->lk_new_owner.len);
554 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); 560 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
555 } else { 561 } else {
556 READ_BUF(20); 562 status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
557 READ32(lock->lk_old_lock_stateid.si_generation); 563 if (status)
558 COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); 564 return status;
565 READ_BUF(4);
559 READ32(lock->lk_old_lock_seqid); 566 READ32(lock->lk_old_lock_seqid);
560 } 567 }
561 568
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
587 DECODE_HEAD; 594 DECODE_HEAD;
588 595
589 locku->lu_stateowner = NULL; 596 locku->lu_stateowner = NULL;
590 READ_BUF(24 + sizeof(stateid_t)); 597 READ_BUF(8);
591 READ32(locku->lu_type); 598 READ32(locku->lu_type);
592 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) 599 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
593 goto xdr_error; 600 goto xdr_error;
594 READ32(locku->lu_seqid); 601 READ32(locku->lu_seqid);
595 READ32(locku->lu_stateid.si_generation); 602 status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
596 COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); 603 if (status)
604 return status;
605 READ_BUF(16);
597 READ64(locku->lu_offset); 606 READ64(locku->lu_offset);
598 READ64(locku->lu_length); 607 READ64(locku->lu_length);
599 608
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
678 READ32(open->op_delegate_type); 687 READ32(open->op_delegate_type);
679 break; 688 break;
680 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 689 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
681 READ_BUF(sizeof(stateid_t) + 4); 690 status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
682 COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 691 if (status)
692 return status;
693 READ_BUF(4);
683 READ32(open->op_fname.len); 694 READ32(open->op_fname.len);
684 READ_BUF(open->op_fname.len); 695 READ_BUF(open->op_fname.len);
685 SAVEMEM(open->op_fname.data, open->op_fname.len); 696 SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
699 DECODE_HEAD; 710 DECODE_HEAD;
700 711
701 open_conf->oc_stateowner = NULL; 712 open_conf->oc_stateowner = NULL;
702 READ_BUF(4 + sizeof(stateid_t)); 713 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
703 READ32(open_conf->oc_req_stateid.si_generation); 714 if (status)
704 COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); 715 return status;
716 READ_BUF(4);
705 READ32(open_conf->oc_seqid); 717 READ32(open_conf->oc_seqid);
706 718
707 DECODE_TAIL; 719 DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
713 DECODE_HEAD; 725 DECODE_HEAD;
714 726
715 open_down->od_stateowner = NULL; 727 open_down->od_stateowner = NULL;
716 READ_BUF(12 + sizeof(stateid_t)); 728 status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
717 READ32(open_down->od_stateid.si_generation); 729 if (status)
718 COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); 730 return status;
731 READ_BUF(12);
719 READ32(open_down->od_seqid); 732 READ32(open_down->od_seqid);
720 READ32(open_down->od_share_access); 733 READ32(open_down->od_share_access);
721 READ32(open_down->od_share_deny); 734 READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
743{ 756{
744 DECODE_HEAD; 757 DECODE_HEAD;
745 758
746 READ_BUF(sizeof(stateid_t) + 12); 759 status = nfsd4_decode_stateid(argp, &read->rd_stateid);
747 READ32(read->rd_stateid.si_generation); 760 if (status)
748 COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); 761 return status;
762 READ_BUF(12);
749 READ64(read->rd_offset); 763 READ64(read->rd_offset);
750 READ32(read->rd_length); 764 READ32(read->rd_length);
751 765
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
834static __be32 848static __be32
835nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 849nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
836{ 850{
837 DECODE_HEAD; 851 __be32 status;
838
839 READ_BUF(sizeof(stateid_t));
840 READ32(setattr->sa_stateid.si_generation);
841 COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
842 if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
843 goto out;
844 852
845 DECODE_TAIL; 853 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
854 if (status)
855 return status;
856 return nfsd4_decode_fattr(argp, setattr->sa_bmval,
857 &setattr->sa_iattr, &setattr->sa_acl);
846} 858}
847 859
848static __be32 860static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
927 int len; 939 int len;
928 DECODE_HEAD; 940 DECODE_HEAD;
929 941
930 READ_BUF(sizeof(stateid_opaque_t) + 20); 942 status = nfsd4_decode_stateid(argp, &write->wr_stateid);
931 READ32(write->wr_stateid.si_generation); 943 if (status)
932 COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); 944 return status;
945 READ_BUF(16);
933 READ64(write->wr_offset); 946 READ64(write->wr_offset);
934 READ32(write->wr_stable_how); 947 READ32(write->wr_stable_how);
935 if (write->wr_stable_how > 2) 948 if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1183 * Header routine to setup seqid operation replay cache 1196 * Header routine to setup seqid operation replay cache
1184 */ 1197 */
1185#define ENCODE_SEQID_OP_HEAD \ 1198#define ENCODE_SEQID_OP_HEAD \
1186 __be32 *p; \
1187 __be32 *save; \ 1199 __be32 *save; \
1188 \ 1200 \
1189 save = resp->p; 1201 save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
1950 return -EINVAL; 1962 return -EINVAL;
1951} 1963}
1952 1964
1965static void
1966nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
1967{
1968 ENCODE_HEAD;
1969
1970 RESERVE_SPACE(sizeof(stateid_t));
1971 WRITE32(sid->si_generation);
1972 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
1973 ADJUST_ARGS();
1974}
1975
1953static __be32 1976static __be32
1954nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) 1977nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
1955{ 1978{
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
1969{ 1992{
1970 ENCODE_SEQID_OP_HEAD; 1993 ENCODE_SEQID_OP_HEAD;
1971 1994
1972 if (!nfserr) { 1995 if (!nfserr)
1973 RESERVE_SPACE(sizeof(stateid_t)); 1996 nfsd4_encode_stateid(resp, &close->cl_stateid);
1974 WRITE32(close->cl_stateid.si_generation); 1997
1975 WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
1976 ADJUST_ARGS();
1977 }
1978 ENCODE_SEQID_OP_TAIL(close->cl_stateowner); 1998 ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
1979 return nfserr; 1999 return nfserr;
1980} 2000}
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
2074{ 2094{
2075 ENCODE_SEQID_OP_HEAD; 2095 ENCODE_SEQID_OP_HEAD;
2076 2096
2077 if (!nfserr) { 2097 if (!nfserr)
2078 RESERVE_SPACE(4 + sizeof(stateid_t)); 2098 nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
2079 WRITE32(lock->lk_resp_stateid.si_generation); 2099 else if (nfserr == nfserr_denied)
2080 WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2081 ADJUST_ARGS();
2082 } else if (nfserr == nfserr_denied)
2083 nfsd4_encode_lock_denied(resp, &lock->lk_denied); 2100 nfsd4_encode_lock_denied(resp, &lock->lk_denied);
2084 2101
2085 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); 2102 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2099{ 2116{
2100 ENCODE_SEQID_OP_HEAD; 2117 ENCODE_SEQID_OP_HEAD;
2101 2118
2102 if (!nfserr) { 2119 if (!nfserr)
2103 RESERVE_SPACE(sizeof(stateid_t)); 2120 nfsd4_encode_stateid(resp, &locku->lu_stateid);
2104 WRITE32(locku->lu_stateid.si_generation); 2121
2105 WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
2106 ADJUST_ARGS();
2107 }
2108
2109 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); 2122 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
2110 return nfserr; 2123 return nfserr;
2111} 2124}
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
2128static __be32 2141static __be32
2129nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 2142nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
2130{ 2143{
2144 ENCODE_HEAD;
2131 ENCODE_SEQID_OP_HEAD; 2145 ENCODE_SEQID_OP_HEAD;
2132 2146
2133 if (nfserr) 2147 if (nfserr)
2134 goto out; 2148 goto out;
2135 2149
2136 RESERVE_SPACE(36 + sizeof(stateid_t)); 2150 nfsd4_encode_stateid(resp, &open->op_stateid);
2137 WRITE32(open->op_stateid.si_generation); 2151 RESERVE_SPACE(40);
2138 WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
2139 WRITECINFO(open->op_cinfo); 2152 WRITECINFO(open->op_cinfo);
2140 WRITE32(open->op_rflags); 2153 WRITE32(open->op_rflags);
2141 WRITE32(2); 2154 WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2148 case NFS4_OPEN_DELEGATE_NONE: 2161 case NFS4_OPEN_DELEGATE_NONE:
2149 break; 2162 break;
2150 case NFS4_OPEN_DELEGATE_READ: 2163 case NFS4_OPEN_DELEGATE_READ:
2151 RESERVE_SPACE(20 + sizeof(stateid_t)); 2164 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2152 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2165 RESERVE_SPACE(20);
2153 WRITE32(open->op_recall); 2166 WRITE32(open->op_recall);
2154 2167
2155 /* 2168 /*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2162 ADJUST_ARGS(); 2175 ADJUST_ARGS();
2163 break; 2176 break;
2164 case NFS4_OPEN_DELEGATE_WRITE: 2177 case NFS4_OPEN_DELEGATE_WRITE:
2165 RESERVE_SPACE(32 + sizeof(stateid_t)); 2178 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2166 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2179 RESERVE_SPACE(32);
2167 WRITE32(0); 2180 WRITE32(0);
2168 2181
2169 /* 2182 /*
@@ -2195,13 +2208,9 @@ static __be32
2195nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) 2208nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
2196{ 2209{
2197 ENCODE_SEQID_OP_HEAD; 2210 ENCODE_SEQID_OP_HEAD;
2198 2211
2199 if (!nfserr) { 2212 if (!nfserr)
2200 RESERVE_SPACE(sizeof(stateid_t)); 2213 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
2201 WRITE32(oc->oc_resp_stateid.si_generation);
2202 WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2203 ADJUST_ARGS();
2204 }
2205 2214
2206 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); 2215 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
2207 return nfserr; 2216 return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
2211nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) 2220nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
2212{ 2221{
2213 ENCODE_SEQID_OP_HEAD; 2222 ENCODE_SEQID_OP_HEAD;
2214 2223
2215 if (!nfserr) { 2224 if (!nfserr)
2216 RESERVE_SPACE(sizeof(stateid_t)); 2225 nfsd4_encode_stateid(resp, &od->od_stateid);
2217 WRITE32(od->od_stateid.si_generation);
2218 WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
2219 ADJUST_ARGS();
2220 }
2221 2226
2222 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2227 ENCODE_SEQID_OP_TAIL(od->od_stateowner);
2223 return nfserr; 2228 return nfserr;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a2..97543df58242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
614 return -EINVAL; 614 return -EINVAL;
615 err = nfsd_create_serv(); 615 err = nfsd_create_serv();
616 if (!err) { 616 if (!err) {
617 int proto = 0; 617 err = svc_addsock(nfsd_serv, fd, buf);
618 err = svc_addsock(nfsd_serv, fd, buf, &proto);
619 if (err >= 0) { 618 if (err >= 0) {
620 err = lockd_up(proto); 619 err = lockd_up();
621 if (err < 0) 620 if (err < 0)
622 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); 621 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
623 } 622 }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f0445..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
302 if (error) 302 if (error)
303 goto out; 303 goto out;
304 304
305 if (!(access & NFSD_MAY_LOCK)) { 305 /*
306 /* 306 * pseudoflavor restrictions are not enforced on NLM,
307 * pseudoflavor restrictions are not enforced on NLM, 307 * which clients virtually always use auth_sys for,
308 * which clients virtually always use auth_sys for, 308 * even while using RPCSEC_GSS for NFS.
309 * even while using RPCSEC_GSS for NFS. 309 */
310 */ 310 if (access & NFSD_MAY_LOCK)
311 error = check_nfsd_access(exp, rqstp); 311 goto skip_pseudoflavor_check;
312 if (error) 312 /*
313 goto out; 313 * Clients may expect to be able to use auth_sys during mount,
314 } 314 * even if they use gss for everything else; see section 2.3.2
315 * of rfc 2623.
316 */
317 if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
318 && exp->ex_path.dentry == dentry)
319 goto skip_pseudoflavor_check;
320
321 error = check_nfsd_access(exp, rqstp);
322 if (error)
323 goto out;
315 324
325skip_pseudoflavor_check:
316 /* Finally, check access permissions. */ 326 /* Finally, check access permissions. */
317 error = nfsd_permission(rqstp, exp, dentry, access); 327 error = nfsd_permission(rqstp, exp, dentry, access);
318 328
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236a..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); 65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
66 66
67 fh_copy(&resp->fh, &argp->fh); 67 fh_copy(&resp->fh, &argp->fh);
68 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 68 nfserr = fh_verify(rqstp, &resp->fh, 0,
69 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
69 return nfsd_return_attrs(nfserr, resp); 70 return nfsd_return_attrs(nfserr, resp);
70} 71}
71 72
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
521 522
522 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); 523 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
523 524
524 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 525 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
526 NFSD_MAY_BYPASS_GSS_ON_ROOT);
525 fh_put(&argp->fh); 527 fh_put(&argp->fh);
526 return nfserr; 528 return nfserr;
527} 529}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e924..59eeb46f82c5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
232 nfsd_last_thread, nfsd, THIS_MODULE); 233 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 234 if (nfsd_serv == NULL)
234 err = -ENOMEM; 235 err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
243 if (!list_empty(&nfsd_serv->sv_permsocks)) 244 if (!list_empty(&nfsd_serv->sv_permsocks))
244 return 0; 245 return 0;
245 246
246 error = lockd_up(IPPROTO_UDP); 247 error = svc_create_xprt(nfsd_serv, "udp", port,
247 if (error >= 0) {
248 error = svc_create_xprt(nfsd_serv, "udp", port,
249 SVC_SOCK_DEFAULTS); 248 SVC_SOCK_DEFAULTS);
250 if (error < 0)
251 lockd_down();
252 }
253 if (error < 0) 249 if (error < 0)
254 return error; 250 return error;
255 251
256 error = lockd_up(IPPROTO_TCP); 252 error = svc_create_xprt(nfsd_serv, "tcp", port,
257 if (error >= 0) {
258 error = svc_create_xprt(nfsd_serv, "tcp", port,
259 SVC_SOCK_DEFAULTS); 253 SVC_SOCK_DEFAULTS);
260 if (error < 0)
261 lockd_down();
262 }
263 if (error < 0) 254 if (error < 0)
264 return error; 255 return error;
256
257 error = lockd_up();
258 if (error < 0)
259 return error;
260
265 return 0; 261 return 0;
266} 262}
267 263
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed5267..aa1d0d6489a1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
83 spinlock_t pb_lock; 83 spinlock_t pb_lock;
84} ____cacheline_aligned_in_smp; 84} ____cacheline_aligned_in_smp;
85 85
86static struct raparms * raparml;
87#define RAPARM_HASH_BITS 4 86#define RAPARM_HASH_BITS 4
88#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 87#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 88#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
1866 * N.B. After this call fhp needs an fh_put 1865 * N.B. After this call fhp needs an fh_put
1867 */ 1866 */
1868__be32 1867__be32
1869nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1868nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
1870{ 1869{
1871 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); 1870 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
1872 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1871 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1873 err = nfserr_io; 1872 err = nfserr_io;
1874 return err; 1873 return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1966void 1965void
1967nfsd_racache_shutdown(void) 1966nfsd_racache_shutdown(void)
1968{ 1967{
1969 if (!raparml) 1968 struct raparms *raparm, *last_raparm;
1970 return; 1969 unsigned int i;
1970
1971 dprintk("nfsd: freeing readahead buffers.\n"); 1971 dprintk("nfsd: freeing readahead buffers.\n");
1972 kfree(raparml); 1972
1973 raparml = NULL; 1973 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
1974 raparm = raparm_hash[i].pb_head;
1975 while(raparm) {
1976 last_raparm = raparm;
1977 raparm = raparm->p_next;
1978 kfree(last_raparm);
1979 }
1980 raparm_hash[i].pb_head = NULL;
1981 }
1974} 1982}
1975/* 1983/*
1976 * Initialize readahead param cache 1984 * Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
1981 int i; 1989 int i;
1982 int j = 0; 1990 int j = 0;
1983 int nperbucket; 1991 int nperbucket;
1992 struct raparms **raparm = NULL;
1984 1993
1985 1994
1986 if (raparml) 1995 if (raparm_hash[0].pb_head)
1987 return 0; 1996 return 0;
1988 if (cache_size < 2*RAPARM_HASH_SIZE) 1997 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
1989 cache_size = 2*RAPARM_HASH_SIZE; 1998 if (nperbucket < 2)
1990 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); 1999 nperbucket = 2;
1991 2000 cache_size = nperbucket * RAPARM_HASH_SIZE;
1992 if (!raparml) {
1993 printk(KERN_WARNING
1994 "nfsd: Could not allocate memory read-ahead cache.\n");
1995 return -ENOMEM;
1996 }
1997 2001
1998 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 2002 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1999 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { 2003
2000 raparm_hash[i].pb_head = NULL; 2004 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
2001 spin_lock_init(&raparm_hash[i].pb_lock); 2005 spin_lock_init(&raparm_hash[i].pb_lock);
2002 } 2006
2003 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); 2007 raparm = &raparm_hash[i].pb_head;
2004 for (i = 0; i < cache_size - 1; i++) { 2008 for (j = 0; j < nperbucket; j++) {
2005 if (i % nperbucket == 0) 2009 *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
2006 raparm_hash[j++].pb_head = raparml + i; 2010 if (!*raparm)
2007 if (i % nperbucket < nperbucket-1) 2011 goto out_nomem;
2008 raparml[i].p_next = raparml + i + 1; 2012 raparm = &(*raparm)->p_next;
2013 }
2014 *raparm = NULL;
2009 } 2015 }
2010 2016
2011 nfsdstats.ra_size = cache_size; 2017 nfsdstats.ra_size = cache_size;
2012 return 0; 2018 return 0;
2019
2020out_nomem:
2021 dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
2022 nfsd_racache_shutdown();
2023 return -ENOMEM;
2013} 2024}
2014 2025
2015#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 2026#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 64965e1c21c4..9b0efdad8910 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -13,9 +13,7 @@
13#include <linux/nls.h> 13#include <linux/nls.h>
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#ifdef CONFIG_KMOD
17#include <linux/kmod.h> 16#include <linux/kmod.h>
18#endif
19#include <linux/spinlock.h> 17#include <linux/spinlock.h>
20 18
21static struct nls_table default_table; 19static struct nls_table default_table;
@@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset)
215 213
216struct nls_table *load_nls(char *charset) 214struct nls_table *load_nls(char *charset)
217{ 215{
218 struct nls_table *nls; 216 return try_then_request_module(find_nls(charset), "nls_%s", charset);
219#ifdef CONFIG_KMOD
220 int ret;
221#endif
222
223 nls = find_nls(charset);
224 if (nls)
225 return nls;
226
227#ifdef CONFIG_KMOD
228 ret = request_module("nls_%s", charset);
229 if (ret != 0) {
230 printk("Unable to load NLS charset %s\n", charset);
231 return NULL;
232 }
233 nls = find_nls(charset);
234#endif
235 return nls;
236} 217}
237 218
238void unload_nls(struct nls_table *nls) 219void unload_nls(struct nls_table *nls)
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e1781c8b1650..9e8a95be7a1e 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
174 // TODO: Consider moving this lot to a separate function! (AIA) 174 // TODO: Consider moving this lot to a separate function! (AIA)
175handle_name: 175handle_name:
176 { 176 {
177 struct dentry *real_dent, *new_dent;
178 MFT_RECORD *m; 177 MFT_RECORD *m;
179 ntfs_attr_search_ctx *ctx; 178 ntfs_attr_search_ctx *ctx;
180 ntfs_inode *ni = NTFS_I(dent_inode); 179 ntfs_inode *ni = NTFS_I(dent_inode);
@@ -255,93 +254,9 @@ handle_name:
255 } 254 }
256 nls_name.hash = full_name_hash(nls_name.name, nls_name.len); 255 nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
257 256
258 /* 257 dent = d_add_ci(dent, dent_inode, &nls_name);
259 * Note: No need for dent->d_lock lock as i_mutex is held on the
260 * parent inode.
261 */
262
263 /* Does a dentry matching the nls_name exist already? */
264 real_dent = d_lookup(dent->d_parent, &nls_name);
265 /* If not, create it now. */
266 if (!real_dent) {
267 real_dent = d_alloc(dent->d_parent, &nls_name);
268 kfree(nls_name.name);
269 if (!real_dent) {
270 err = -ENOMEM;
271 goto err_out;
272 }
273 new_dent = d_splice_alias(dent_inode, real_dent);
274 if (new_dent)
275 dput(real_dent);
276 else
277 new_dent = real_dent;
278 ntfs_debug("Done. (Created new dentry.)");
279 return new_dent;
280 }
281 kfree(nls_name.name); 258 kfree(nls_name.name);
282 /* Matching dentry exists, check if it is negative. */ 259 return dent;
283 if (real_dent->d_inode) {
284 if (unlikely(real_dent->d_inode != dent_inode)) {
285 /* This can happen because bad inodes are unhashed. */
286 BUG_ON(!is_bad_inode(dent_inode));
287 BUG_ON(!is_bad_inode(real_dent->d_inode));
288 }
289 /*
290 * Already have the inode and the dentry attached, decrement
291 * the reference count to balance the ntfs_iget() we did
292 * earlier on. We found the dentry using d_lookup() so it
293 * cannot be disconnected and thus we do not need to worry
294 * about any NFS/disconnectedness issues here.
295 */
296 iput(dent_inode);
297 ntfs_debug("Done. (Already had inode and dentry.)");
298 return real_dent;
299 }
300 /*
301 * Negative dentry: instantiate it unless the inode is a directory and
302 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
303 * in which case d_move() that in place of the found dentry.
304 */
305 if (!S_ISDIR(dent_inode->i_mode)) {
306 /* Not a directory; everything is easy. */
307 d_instantiate(real_dent, dent_inode);
308 ntfs_debug("Done. (Already had negative file dentry.)");
309 return real_dent;
310 }
311 spin_lock(&dcache_lock);
312 if (list_empty(&dent_inode->i_dentry)) {
313 /*
314 * Directory without a 'disconnected' dentry; we need to do
315 * d_instantiate() by hand because it takes dcache_lock which
316 * we already hold.
317 */
318 list_add(&real_dent->d_alias, &dent_inode->i_dentry);
319 real_dent->d_inode = dent_inode;
320 spin_unlock(&dcache_lock);
321 security_d_instantiate(real_dent, dent_inode);
322 ntfs_debug("Done. (Already had negative directory dentry.)");
323 return real_dent;
324 }
325 /*
326 * Directory with a 'disconnected' dentry; get a reference to the
327 * 'disconnected' dentry.
328 */
329 new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
330 d_alias);
331 dget_locked(new_dent);
332 spin_unlock(&dcache_lock);
333 /* Do security vodoo. */
334 security_d_instantiate(real_dent, dent_inode);
335 /* Move new_dent in place of real_dent. */
336 d_move(new_dent, real_dent);
337 /* Balance the ntfs_iget() we did above. */
338 iput(dent_inode);
339 /* Throw away real_dent. */
340 dput(real_dent);
341 /* Use new_dent as the actual dentry. */
342 ntfs_debug("Done. (Already had negative, disconnected directory "
343 "dentry.)");
344 return new_dent;
345 260
346eio_err_out: 261eio_err_out:
347 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); 262 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the 113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the
114 * file since it was last opened. I think the names speak for themselves but 114 * file since it was last opened. I think the names speak for themselves but
115 * if you disagree check out the descriptions in the Linux NTFS project NTFS 115 * if you disagree check out the descriptions in the Linux NTFS project NTFS
116 * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
145 * Source info flags (32-bit). Information about the source of the change(s) 145 * Source info flags (32-bit). Information about the source of the change(s)
146 * to the file. For detailed descriptions of what these mean, see the Linux 146 * to the file. For detailed descriptions of what these mean, see the Linux
147 * NTFS project NTFS documentation: 147 * NTFS project NTFS documentation:
148 * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fdb..589dcdfdfe3c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
34 symlink.o \ 34 symlink.o \
35 sysfile.o \ 35 sysfile.o \
36 uptodate.o \ 36 uptodate.o \
37 ver.o 37 ver.o \
38 xattr.o
38 39
39ocfs2_stackglue-objs := stackglue.o 40ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o 41ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..0cc2deb9394c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
49 49
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52
53/*
54 * Operations for a specific extent tree type.
55 *
56 * To implement an on-disk btree (extent tree) type in ocfs2, add
57 * an ocfs2_extent_tree_operations structure and the matching
58 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
59 * for the allocation portion of the extent tree.
60 */
61struct ocfs2_extent_tree_operations {
62 /*
63 * last_eb_blk is the block number of the right most leaf extent
64 * block. Most on-disk structures containing an extent tree store
65 * this value for fast access. The ->eo_set_last_eb_blk() and
66 * ->eo_get_last_eb_blk() operations access this value. They are
67 * both required.
68 */
69 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
70 u64 blkno);
71 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
72
73 /*
74 * The on-disk structure usually keeps track of how many total
75 * clusters are stored in this extent tree. This function updates
76 * that value. new_clusters is the delta, and must be
77 * added to the total. Required.
78 */
79 void (*eo_update_clusters)(struct inode *inode,
80 struct ocfs2_extent_tree *et,
81 u32 new_clusters);
82
83 /*
84 * If ->eo_insert_check() exists, it is called before rec is
85 * inserted into the extent tree. It is optional.
86 */
87 int (*eo_insert_check)(struct inode *inode,
88 struct ocfs2_extent_tree *et,
89 struct ocfs2_extent_rec *rec);
90 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
91
92 /*
93 * --------------------------------------------------------------
94 * The remaining are internal to ocfs2_extent_tree and don't have
95 * accessor functions
96 */
97
98 /*
99 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
100 * It is required.
101 */
102 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
103
104 /*
105 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
106 * it exists. If it does not, et->et_max_leaf_clusters is set
107 * to 0 (unlimited). Optional.
108 */
109 void (*eo_fill_max_leaf_clusters)(struct inode *inode,
110 struct ocfs2_extent_tree *et);
111};
112
113
114/*
115 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
116 * in the methods.
117 */
118static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
119static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
120 u64 blkno);
121static void ocfs2_dinode_update_clusters(struct inode *inode,
122 struct ocfs2_extent_tree *et,
123 u32 clusters);
124static int ocfs2_dinode_insert_check(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 struct ocfs2_extent_rec *rec);
127static int ocfs2_dinode_sanity_check(struct inode *inode,
128 struct ocfs2_extent_tree *et);
129static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
130static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
131 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
132 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
133 .eo_update_clusters = ocfs2_dinode_update_clusters,
134 .eo_insert_check = ocfs2_dinode_insert_check,
135 .eo_sanity_check = ocfs2_dinode_sanity_check,
136 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
137};
138
139static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
140 u64 blkno)
141{
142 struct ocfs2_dinode *di = et->et_object;
143
144 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
145 di->i_last_eb_blk = cpu_to_le64(blkno);
146}
147
148static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
149{
150 struct ocfs2_dinode *di = et->et_object;
151
152 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153 return le64_to_cpu(di->i_last_eb_blk);
154}
155
156static void ocfs2_dinode_update_clusters(struct inode *inode,
157 struct ocfs2_extent_tree *et,
158 u32 clusters)
159{
160 struct ocfs2_dinode *di = et->et_object;
161
162 le32_add_cpu(&di->i_clusters, clusters);
163 spin_lock(&OCFS2_I(inode)->ip_lock);
164 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
165 spin_unlock(&OCFS2_I(inode)->ip_lock);
166}
167
168static int ocfs2_dinode_insert_check(struct inode *inode,
169 struct ocfs2_extent_tree *et,
170 struct ocfs2_extent_rec *rec)
171{
172 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
173
174 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
175 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
176 (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
177 "Device %s, asking for sparse allocation: inode %llu, "
178 "cpos %u, clusters %u\n",
179 osb->dev_str,
180 (unsigned long long)OCFS2_I(inode)->ip_blkno,
181 rec->e_cpos,
182 OCFS2_I(inode)->ip_clusters);
183
184 return 0;
185}
186
187static int ocfs2_dinode_sanity_check(struct inode *inode,
188 struct ocfs2_extent_tree *et)
189{
190 int ret = 0;
191 struct ocfs2_dinode *di;
192
193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
194
195 di = et->et_object;
196 if (!OCFS2_IS_VALID_DINODE(di)) {
197 ret = -EIO;
198 ocfs2_error(inode->i_sb,
199 "Inode %llu has invalid path root",
200 (unsigned long long)OCFS2_I(inode)->ip_blkno);
201 }
202
203 return ret;
204}
205
206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
207{
208 struct ocfs2_dinode *di = et->et_object;
209
210 et->et_root_el = &di->id2.i_list;
211}
212
213
214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
215{
216 struct ocfs2_xattr_value_root *xv = et->et_object;
217
218 et->et_root_el = &xv->xr_list;
219}
220
221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
222 u64 blkno)
223{
224 struct ocfs2_xattr_value_root *xv =
225 (struct ocfs2_xattr_value_root *)et->et_object;
226
227 xv->xr_last_eb_blk = cpu_to_le64(blkno);
228}
229
230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
231{
232 struct ocfs2_xattr_value_root *xv =
233 (struct ocfs2_xattr_value_root *) et->et_object;
234
235 return le64_to_cpu(xv->xr_last_eb_blk);
236}
237
238static void ocfs2_xattr_value_update_clusters(struct inode *inode,
239 struct ocfs2_extent_tree *et,
240 u32 clusters)
241{
242 struct ocfs2_xattr_value_root *xv =
243 (struct ocfs2_xattr_value_root *)et->et_object;
244
245 le32_add_cpu(&xv->xr_clusters, clusters);
246}
247
248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
249 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
250 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
251 .eo_update_clusters = ocfs2_xattr_value_update_clusters,
252 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
253};
254
255static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
256{
257 struct ocfs2_xattr_block *xb = et->et_object;
258
259 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
260}
261
262static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
263 struct ocfs2_extent_tree *et)
264{
265 et->et_max_leaf_clusters =
266 ocfs2_clusters_for_bytes(inode->i_sb,
267 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
268}
269
270static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
271 u64 blkno)
272{
273 struct ocfs2_xattr_block *xb = et->et_object;
274 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
275
276 xt->xt_last_eb_blk = cpu_to_le64(blkno);
277}
278
279static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
280{
281 struct ocfs2_xattr_block *xb = et->et_object;
282 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
283
284 return le64_to_cpu(xt->xt_last_eb_blk);
285}
286
287static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
288 struct ocfs2_extent_tree *et,
289 u32 clusters)
290{
291 struct ocfs2_xattr_block *xb = et->et_object;
292
293 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
294}
295
296static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
297 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
298 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
299 .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
300 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
301 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
302};
303
304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
305 struct inode *inode,
306 struct buffer_head *bh,
307 void *obj,
308 struct ocfs2_extent_tree_operations *ops)
309{
310 et->et_ops = ops;
311 et->et_root_bh = bh;
312 if (!obj)
313 obj = (void *)bh->b_data;
314 et->et_object = obj;
315
316 et->et_ops->eo_fill_root_el(et);
317 if (!et->et_ops->eo_fill_max_leaf_clusters)
318 et->et_max_leaf_clusters = 0;
319 else
320 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
321}
322
323void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
324 struct inode *inode,
325 struct buffer_head *bh)
326{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
328}
329
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode,
332 struct buffer_head *bh)
333{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL,
335 &ocfs2_xattr_tree_et_ops);
336}
337
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode,
340 struct buffer_head *bh,
341 struct ocfs2_xattr_value_root *xv)
342{
343 __ocfs2_init_extent_tree(et, inode, bh, xv,
344 &ocfs2_xattr_value_et_ops);
345}
346
347static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
348 u64 new_last_eb_blk)
349{
350 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
351}
352
353static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
354{
355 return et->et_ops->eo_get_last_eb_blk(et);
356}
357
358static inline void ocfs2_et_update_clusters(struct inode *inode,
359 struct ocfs2_extent_tree *et,
360 u32 clusters)
361{
362 et->et_ops->eo_update_clusters(inode, et, clusters);
363}
364
365static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec)
368{
369 int ret = 0;
370
371 if (et->et_ops->eo_insert_check)
372 ret = et->et_ops->eo_insert_check(inode, et, rec);
373 return ret;
374}
375
376static inline int ocfs2_et_sanity_check(struct inode *inode,
377 struct ocfs2_extent_tree *et)
378{
379 int ret = 0;
380
381 if (et->et_ops->eo_sanity_check)
382 ret = et->et_ops->eo_sanity_check(inode, et);
383 return ret;
384}
385
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 386static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 387static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb); 388 struct ocfs2_extent_block *eb);
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
205} 539}
206 540
207/* 541/*
208 * Allocate and initialize a new path based on a disk inode tree.
209 */
210static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211{
212 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213 struct ocfs2_extent_list *el = &di->id2.i_list;
214
215 return ocfs2_new_path(di_bh, el);
216}
217
218/*
219 * Convenience function to journal all components in a path. 542 * Convenience function to journal all components in a path.
220 */ 543 */
221static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 544static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
368 */ 691 */
369int ocfs2_num_free_extents(struct ocfs2_super *osb, 692int ocfs2_num_free_extents(struct ocfs2_super *osb,
370 struct inode *inode, 693 struct inode *inode,
371 struct ocfs2_dinode *fe) 694 struct ocfs2_extent_tree *et)
372{ 695{
373 int retval; 696 int retval;
374 struct ocfs2_extent_list *el; 697 struct ocfs2_extent_list *el = NULL;
375 struct ocfs2_extent_block *eb; 698 struct ocfs2_extent_block *eb;
376 struct buffer_head *eb_bh = NULL; 699 struct buffer_head *eb_bh = NULL;
700 u64 last_eb_blk = 0;
377 701
378 mlog_entry_void(); 702 mlog_entry_void();
379 703
380 if (!OCFS2_IS_VALID_DINODE(fe)) { 704 el = et->et_root_el;
381 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 705 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
382 retval = -EIO;
383 goto bail;
384 }
385 706
386 if (fe->i_last_eb_blk) { 707 if (last_eb_blk) {
387 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 708 retval = ocfs2_read_block(inode, last_eb_blk,
388 &eb_bh, OCFS2_BH_CACHED, inode); 709 &eb_bh);
389 if (retval < 0) { 710 if (retval < 0) {
390 mlog_errno(retval); 711 mlog_errno(retval);
391 goto bail; 712 goto bail;
392 } 713 }
393 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 714 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
394 el = &eb->h_list; 715 el = &eb->h_list;
395 } else 716 }
396 el = &fe->id2.i_list;
397 717
398 BUG_ON(el->l_tree_depth != 0); 718 BUG_ON(el->l_tree_depth != 0);
399 719
400 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 720 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
401bail: 721bail:
402 if (eb_bh) 722 brelse(eb_bh);
403 brelse(eb_bh);
404 723
405 mlog_exit(retval); 724 mlog_exit(retval);
406 return retval; 725 return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
486bail: 805bail:
487 if (status < 0) { 806 if (status < 0) {
488 for(i = 0; i < wanted; i++) { 807 for(i = 0; i < wanted; i++) {
489 if (bhs[i]) 808 brelse(bhs[i]);
490 brelse(bhs[i]);
491 bhs[i] = NULL; 809 bhs[i] = NULL;
492 } 810 }
493 } 811 }
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
531static int ocfs2_add_branch(struct ocfs2_super *osb, 849static int ocfs2_add_branch(struct ocfs2_super *osb,
532 handle_t *handle, 850 handle_t *handle,
533 struct inode *inode, 851 struct inode *inode,
534 struct buffer_head *fe_bh, 852 struct ocfs2_extent_tree *et,
535 struct buffer_head *eb_bh, 853 struct buffer_head *eb_bh,
536 struct buffer_head **last_eb_bh, 854 struct buffer_head **last_eb_bh,
537 struct ocfs2_alloc_context *meta_ac) 855 struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
540 u64 next_blkno, new_last_eb_blk; 858 u64 next_blkno, new_last_eb_blk;
541 struct buffer_head *bh; 859 struct buffer_head *bh;
542 struct buffer_head **new_eb_bhs = NULL; 860 struct buffer_head **new_eb_bhs = NULL;
543 struct ocfs2_dinode *fe;
544 struct ocfs2_extent_block *eb; 861 struct ocfs2_extent_block *eb;
545 struct ocfs2_extent_list *eb_el; 862 struct ocfs2_extent_list *eb_el;
546 struct ocfs2_extent_list *el; 863 struct ocfs2_extent_list *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
550 867
551 BUG_ON(!last_eb_bh || !*last_eb_bh); 868 BUG_ON(!last_eb_bh || !*last_eb_bh);
552 869
553 fe = (struct ocfs2_dinode *) fe_bh->b_data;
554
555 if (eb_bh) { 870 if (eb_bh) {
556 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 871 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
557 el = &eb->h_list; 872 el = &eb->h_list;
558 } else 873 } else
559 el = &fe->id2.i_list; 874 el = et->et_root_el;
560 875
561 /* we never add a branch to a leaf. */ 876 /* we never add a branch to a leaf. */
562 BUG_ON(!el->l_tree_depth); 877 BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
646 mlog_errno(status); 961 mlog_errno(status);
647 goto bail; 962 goto bail;
648 } 963 }
649 status = ocfs2_journal_access(handle, inode, fe_bh, 964 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
650 OCFS2_JOURNAL_ACCESS_WRITE); 965 OCFS2_JOURNAL_ACCESS_WRITE);
651 if (status < 0) { 966 if (status < 0) {
652 mlog_errno(status); 967 mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
662 } 977 }
663 978
664 /* Link the new branch into the rest of the tree (el will 979 /* Link the new branch into the rest of the tree (el will
665 * either be on the fe, or the extent block passed in. */ 980 * either be on the root_bh, or the extent block passed in. */
666 i = le16_to_cpu(el->l_next_free_rec); 981 i = le16_to_cpu(el->l_next_free_rec);
667 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 982 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
668 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 983 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
671 986
672 /* fe needs a new last extent block pointer, as does the 987 /* fe needs a new last extent block pointer, as does the
673 * next_leaf on the previously last-extent-block. */ 988 * next_leaf on the previously last-extent-block. */
674 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 989 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
675 990
676 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 991 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
677 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 992 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
679 status = ocfs2_journal_dirty(handle, *last_eb_bh); 994 status = ocfs2_journal_dirty(handle, *last_eb_bh);
680 if (status < 0) 995 if (status < 0)
681 mlog_errno(status); 996 mlog_errno(status);
682 status = ocfs2_journal_dirty(handle, fe_bh); 997 status = ocfs2_journal_dirty(handle, et->et_root_bh);
683 if (status < 0) 998 if (status < 0)
684 mlog_errno(status); 999 mlog_errno(status);
685 if (eb_bh) { 1000 if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
700bail: 1015bail:
701 if (new_eb_bhs) { 1016 if (new_eb_bhs) {
702 for (i = 0; i < new_blocks; i++) 1017 for (i = 0; i < new_blocks; i++)
703 if (new_eb_bhs[i]) 1018 brelse(new_eb_bhs[i]);
704 brelse(new_eb_bhs[i]);
705 kfree(new_eb_bhs); 1019 kfree(new_eb_bhs);
706 } 1020 }
707 1021
@@ -717,16 +1031,15 @@ bail:
717static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1031static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
718 handle_t *handle, 1032 handle_t *handle,
719 struct inode *inode, 1033 struct inode *inode,
720 struct buffer_head *fe_bh, 1034 struct ocfs2_extent_tree *et,
721 struct ocfs2_alloc_context *meta_ac, 1035 struct ocfs2_alloc_context *meta_ac,
722 struct buffer_head **ret_new_eb_bh) 1036 struct buffer_head **ret_new_eb_bh)
723{ 1037{
724 int status, i; 1038 int status, i;
725 u32 new_clusters; 1039 u32 new_clusters;
726 struct buffer_head *new_eb_bh = NULL; 1040 struct buffer_head *new_eb_bh = NULL;
727 struct ocfs2_dinode *fe;
728 struct ocfs2_extent_block *eb; 1041 struct ocfs2_extent_block *eb;
729 struct ocfs2_extent_list *fe_el; 1042 struct ocfs2_extent_list *root_el;
730 struct ocfs2_extent_list *eb_el; 1043 struct ocfs2_extent_list *eb_el;
731 1044
732 mlog_entry_void(); 1045 mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
746 } 1059 }
747 1060
748 eb_el = &eb->h_list; 1061 eb_el = &eb->h_list;
749 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1062 root_el = et->et_root_el;
750 fe_el = &fe->id2.i_list;
751 1063
752 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1064 status = ocfs2_journal_access(handle, inode, new_eb_bh,
753 OCFS2_JOURNAL_ACCESS_CREATE); 1065 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
756 goto bail; 1068 goto bail;
757 } 1069 }
758 1070
759 /* copy the fe data into the new extent block */ 1071 /* copy the root extent list data into the new extent block */
760 eb_el->l_tree_depth = fe_el->l_tree_depth; 1072 eb_el->l_tree_depth = root_el->l_tree_depth;
761 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 1073 eb_el->l_next_free_rec = root_el->l_next_free_rec;
762 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1074 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
763 eb_el->l_recs[i] = fe_el->l_recs[i]; 1075 eb_el->l_recs[i] = root_el->l_recs[i];
764 1076
765 status = ocfs2_journal_dirty(handle, new_eb_bh); 1077 status = ocfs2_journal_dirty(handle, new_eb_bh);
766 if (status < 0) { 1078 if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
768 goto bail; 1080 goto bail;
769 } 1081 }
770 1082
771 status = ocfs2_journal_access(handle, inode, fe_bh, 1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
772 OCFS2_JOURNAL_ACCESS_WRITE); 1084 OCFS2_JOURNAL_ACCESS_WRITE);
773 if (status < 0) { 1085 if (status < 0) {
774 mlog_errno(status); 1086 mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
777 1089
778 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 1090 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
779 1091
780 /* update fe now */ 1092 /* update root_bh now */
781 le16_add_cpu(&fe_el->l_tree_depth, 1); 1093 le16_add_cpu(&root_el->l_tree_depth, 1);
782 fe_el->l_recs[0].e_cpos = 0; 1094 root_el->l_recs[0].e_cpos = 0;
783 fe_el->l_recs[0].e_blkno = eb->h_blkno; 1095 root_el->l_recs[0].e_blkno = eb->h_blkno;
784 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 1096 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
785 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1097 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
786 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 1098 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
787 fe_el->l_next_free_rec = cpu_to_le16(1); 1099 root_el->l_next_free_rec = cpu_to_le16(1);
788 1100
789 /* If this is our 1st tree depth shift, then last_eb_blk 1101 /* If this is our 1st tree depth shift, then last_eb_blk
790 * becomes the allocated extent block */ 1102 * becomes the allocated extent block */
791 if (fe_el->l_tree_depth == cpu_to_le16(1)) 1103 if (root_el->l_tree_depth == cpu_to_le16(1))
792 fe->i_last_eb_blk = eb->h_blkno; 1104 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
793 1105
794 status = ocfs2_journal_dirty(handle, fe_bh); 1106 status = ocfs2_journal_dirty(handle, et->et_root_bh);
795 if (status < 0) { 1107 if (status < 0) {
796 mlog_errno(status); 1108 mlog_errno(status);
797 goto bail; 1109 goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
801 new_eb_bh = NULL; 1113 new_eb_bh = NULL;
802 status = 0; 1114 status = 0;
803bail: 1115bail:
804 if (new_eb_bh) 1116 brelse(new_eb_bh);
805 brelse(new_eb_bh);
806 1117
807 mlog_exit(status); 1118 mlog_exit(status);
808 return status; 1119 return status;
@@ -817,22 +1128,21 @@ bail:
817 * 1) a lowest extent block is found, then we pass it back in 1128 * 1) a lowest extent block is found, then we pass it back in
818 * *lowest_eb_bh and return '0' 1129 * *lowest_eb_bh and return '0'
819 * 1130 *
820 * 2) the search fails to find anything, but the dinode has room. We 1131 * 2) the search fails to find anything, but the root_el has room. We
821 * pass NULL back in *lowest_eb_bh, but still return '0' 1132 * pass NULL back in *lowest_eb_bh, but still return '0'
822 * 1133 *
823 * 3) the search fails to find anything AND the dinode is full, in 1134 * 3) the search fails to find anything AND the root_el is full, in
824 * which case we return > 0 1135 * which case we return > 0
825 * 1136 *
826 * return status < 0 indicates an error. 1137 * return status < 0 indicates an error.
827 */ 1138 */
828static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1139static int ocfs2_find_branch_target(struct ocfs2_super *osb,
829 struct inode *inode, 1140 struct inode *inode,
830 struct buffer_head *fe_bh, 1141 struct ocfs2_extent_tree *et,
831 struct buffer_head **target_bh) 1142 struct buffer_head **target_bh)
832{ 1143{
833 int status = 0, i; 1144 int status = 0, i;
834 u64 blkno; 1145 u64 blkno;
835 struct ocfs2_dinode *fe;
836 struct ocfs2_extent_block *eb; 1146 struct ocfs2_extent_block *eb;
837 struct ocfs2_extent_list *el; 1147 struct ocfs2_extent_list *el;
838 struct buffer_head *bh = NULL; 1148 struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
842 1152
843 *target_bh = NULL; 1153 *target_bh = NULL;
844 1154
845 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1155 el = et->et_root_el;
846 el = &fe->id2.i_list;
847 1156
848 while(le16_to_cpu(el->l_tree_depth) > 1) { 1157 while(le16_to_cpu(el->l_tree_depth) > 1) {
849 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1158 if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
864 goto bail; 1173 goto bail;
865 } 1174 }
866 1175
867 if (bh) { 1176 brelse(bh);
868 brelse(bh); 1177 bh = NULL;
869 bh = NULL;
870 }
871 1178
872 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 1179 status = ocfs2_read_block(inode, blkno, &bh);
873 inode);
874 if (status < 0) { 1180 if (status < 0) {
875 mlog_errno(status); 1181 mlog_errno(status);
876 goto bail; 1182 goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
886 1192
887 if (le16_to_cpu(el->l_next_free_rec) < 1193 if (le16_to_cpu(el->l_next_free_rec) <
888 le16_to_cpu(el->l_count)) { 1194 le16_to_cpu(el->l_count)) {
889 if (lowest_bh) 1195 brelse(lowest_bh);
890 brelse(lowest_bh);
891 lowest_bh = bh; 1196 lowest_bh = bh;
892 get_bh(lowest_bh); 1197 get_bh(lowest_bh);
893 } 1198 }
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
895 1200
896 /* If we didn't find one and the fe doesn't have any room, 1201 /* If we didn't find one and the fe doesn't have any room,
897 * then return '1' */ 1202 * then return '1' */
898 if (!lowest_bh 1203 el = et->et_root_el;
899 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 1204 if (!lowest_bh && (el->l_next_free_rec == el->l_count))
900 status = 1; 1205 status = 1;
901 1206
902 *target_bh = lowest_bh; 1207 *target_bh = lowest_bh;
903bail: 1208bail:
904 if (bh) 1209 brelse(bh);
905 brelse(bh);
906 1210
907 mlog_exit(status); 1211 mlog_exit(status);
908 return status; 1212 return status;
@@ -919,19 +1223,19 @@ bail:
919 * *last_eb_bh will be updated by ocfs2_add_branch(). 1223 * *last_eb_bh will be updated by ocfs2_add_branch().
920 */ 1224 */
921static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1225static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
922 struct buffer_head *di_bh, int *final_depth, 1226 struct ocfs2_extent_tree *et, int *final_depth,
923 struct buffer_head **last_eb_bh, 1227 struct buffer_head **last_eb_bh,
924 struct ocfs2_alloc_context *meta_ac) 1228 struct ocfs2_alloc_context *meta_ac)
925{ 1229{
926 int ret, shift; 1230 int ret, shift;
927 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1231 struct ocfs2_extent_list *el = et->et_root_el;
928 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); 1232 int depth = le16_to_cpu(el->l_tree_depth);
929 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
930 struct buffer_head *bh = NULL; 1234 struct buffer_head *bh = NULL;
931 1235
932 BUG_ON(meta_ac == NULL); 1236 BUG_ON(meta_ac == NULL);
933 1237
934 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); 1238 shift = ocfs2_find_branch_target(osb, inode, et, &bh);
935 if (shift < 0) { 1239 if (shift < 0) {
936 ret = shift; 1240 ret = shift;
937 mlog_errno(ret); 1241 mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
948 /* ocfs2_shift_tree_depth will return us a buffer with 1252 /* ocfs2_shift_tree_depth will return us a buffer with
949 * the new extent block (so we can pass that to 1253 * the new extent block (so we can pass that to
950 * ocfs2_add_branch). */ 1254 * ocfs2_add_branch). */
951 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, 1255 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
952 meta_ac, &bh); 1256 meta_ac, &bh);
953 if (ret < 0) { 1257 if (ret < 0) {
954 mlog_errno(ret); 1258 mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
975 /* call ocfs2_add_branch to add the final part of the tree with 1279 /* call ocfs2_add_branch to add the final part of the tree with
976 * the new data. */ 1280 * the new data. */
977 mlog(0, "add branch. bh = %p\n", bh); 1281 mlog(0, "add branch. bh = %p\n", bh);
978 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, 1282 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
979 meta_ac); 1283 meta_ac);
980 if (ret < 0) { 1284 if (ret < 0) {
981 mlog_errno(ret); 1285 mlog_errno(ret);
@@ -990,15 +1294,6 @@ out:
990} 1294}
991 1295
992/* 1296/*
993 * This is only valid for leaf nodes, which are the only ones that can
994 * have empty extents anyway.
995 */
996static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
997{
998 return !rec->e_leaf_clusters;
999}
1000
1001/*
1002 * This function will discard the rightmost extent record. 1297 * This function will discard the rightmost extent record.
1003 */ 1298 */
1004static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1299static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
@@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
1245 1540
1246 brelse(bh); 1541 brelse(bh);
1247 bh = NULL; 1542 bh = NULL;
1248 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1543 ret = ocfs2_read_block(inode, blkno, &bh);
1249 &bh, OCFS2_BH_CACHED, inode);
1250 if (ret) { 1544 if (ret) {
1251 mlog_errno(ret); 1545 mlog_errno(ret);
1252 goto out; 1546 goto out;
@@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *right_path, 2361 struct ocfs2_path *right_path,
2068 int subtree_index, 2362 int subtree_index,
2069 struct ocfs2_cached_dealloc_ctxt *dealloc, 2363 struct ocfs2_cached_dealloc_ctxt *dealloc,
2070 int *deleted) 2364 int *deleted,
2365 struct ocfs2_extent_tree *et)
2071{ 2366{
2072 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2367 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2073 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); 2368 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2074 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2075 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2369 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2076 struct ocfs2_extent_block *eb; 2370 struct ocfs2_extent_block *eb;
2077 2371
@@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2123 * We have to update i_last_eb_blk during the meta 2417 * We have to update i_last_eb_blk during the meta
2124 * data delete. 2418 * data delete.
2125 */ 2419 */
2126 ret = ocfs2_journal_access(handle, inode, di_bh, 2420 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2127 OCFS2_JOURNAL_ACCESS_WRITE); 2421 OCFS2_JOURNAL_ACCESS_WRITE);
2128 if (ret) { 2422 if (ret) {
2129 mlog_errno(ret); 2423 mlog_errno(ret);
@@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2198 ocfs2_update_edge_lengths(inode, handle, left_path); 2492 ocfs2_update_edge_lengths(inode, handle, left_path);
2199 2493
2200 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2494 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2201 di->i_last_eb_blk = eb->h_blkno; 2495 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2202 2496
2203 /* 2497 /*
2204 * Removal of the extent in the left leaf was skipped 2498 * Removal of the extent in the left leaf was skipped
@@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2208 if (right_has_empty) 2502 if (right_has_empty)
2209 ocfs2_remove_empty_extent(left_leaf_el); 2503 ocfs2_remove_empty_extent(left_leaf_el);
2210 2504
2211 ret = ocfs2_journal_dirty(handle, di_bh); 2505 ret = ocfs2_journal_dirty(handle, et_root_bh);
2212 if (ret) 2506 if (ret)
2213 mlog_errno(ret); 2507 mlog_errno(ret);
2214 2508
@@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2331 handle_t *handle, int orig_credits, 2625 handle_t *handle, int orig_credits,
2332 struct ocfs2_path *path, 2626 struct ocfs2_path *path,
2333 struct ocfs2_cached_dealloc_ctxt *dealloc, 2627 struct ocfs2_cached_dealloc_ctxt *dealloc,
2334 struct ocfs2_path **empty_extent_path) 2628 struct ocfs2_path **empty_extent_path,
2629 struct ocfs2_extent_tree *et)
2335{ 2630{
2336 int ret, subtree_root, deleted; 2631 int ret, subtree_root, deleted;
2337 u32 right_cpos; 2632 u32 right_cpos;
@@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2404 2699
2405 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2700 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2406 right_path, subtree_root, 2701 right_path, subtree_root,
2407 dealloc, &deleted); 2702 dealloc, &deleted, et);
2408 if (ret == -EAGAIN) { 2703 if (ret == -EAGAIN) {
2409 /* 2704 /*
2410 * The rotation has to temporarily stop due to 2705 * The rotation has to temporarily stop due to
@@ -2447,29 +2742,20 @@ out:
2447} 2742}
2448 2743
2449static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2744static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2450 struct ocfs2_path *path, 2745 struct ocfs2_path *path,
2451 struct ocfs2_cached_dealloc_ctxt *dealloc) 2746 struct ocfs2_cached_dealloc_ctxt *dealloc,
2747 struct ocfs2_extent_tree *et)
2452{ 2748{
2453 int ret, subtree_index; 2749 int ret, subtree_index;
2454 u32 cpos; 2750 u32 cpos;
2455 struct ocfs2_path *left_path = NULL; 2751 struct ocfs2_path *left_path = NULL;
2456 struct ocfs2_dinode *di;
2457 struct ocfs2_extent_block *eb; 2752 struct ocfs2_extent_block *eb;
2458 struct ocfs2_extent_list *el; 2753 struct ocfs2_extent_list *el;
2459 2754
2460 /*
2461 * XXX: This code assumes that the root is an inode, which is
2462 * true for now but may change as tree code gets generic.
2463 */
2464 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2465 if (!OCFS2_IS_VALID_DINODE(di)) {
2466 ret = -EIO;
2467 ocfs2_error(inode->i_sb,
2468 "Inode %llu has invalid path root",
2469 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2470 goto out;
2471 }
2472 2755
2756 ret = ocfs2_et_sanity_check(inode, et);
2757 if (ret)
2758 goto out;
2473 /* 2759 /*
2474 * There's two ways we handle this depending on 2760 * There's two ways we handle this depending on
2475 * whether path is the only existing one. 2761 * whether path is the only existing one.
@@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2526 ocfs2_update_edge_lengths(inode, handle, left_path); 2812 ocfs2_update_edge_lengths(inode, handle, left_path);
2527 2813
2528 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2814 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2529 di->i_last_eb_blk = eb->h_blkno; 2815 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2530 } else { 2816 } else {
2531 /* 2817 /*
2532 * 'path' is also the leftmost path which 2818 * 'path' is also the leftmost path which
@@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2537 */ 2823 */
2538 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2824 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2539 2825
2540 el = &di->id2.i_list; 2826 el = et->et_root_el;
2541 el->l_tree_depth = 0; 2827 el->l_tree_depth = 0;
2542 el->l_next_free_rec = 0; 2828 el->l_next_free_rec = 0;
2543 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2829 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2544 2830
2545 di->i_last_eb_blk = 0; 2831 ocfs2_et_set_last_eb_blk(et, 0);
2546 } 2832 }
2547 2833
2548 ocfs2_journal_dirty(handle, path_root_bh(path)); 2834 ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2570,7 +2856,8 @@ out:
2570 */ 2856 */
2571static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2857static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2572 struct ocfs2_path *path, 2858 struct ocfs2_path *path,
2573 struct ocfs2_cached_dealloc_ctxt *dealloc) 2859 struct ocfs2_cached_dealloc_ctxt *dealloc,
2860 struct ocfs2_extent_tree *et)
2574{ 2861{
2575 int ret, orig_credits = handle->h_buffer_credits; 2862 int ret, orig_credits = handle->h_buffer_credits;
2576 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2863 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2584 if (path->p_tree_depth == 0) { 2871 if (path->p_tree_depth == 0) {
2585rightmost_no_delete: 2872rightmost_no_delete:
2586 /* 2873 /*
2587 * In-inode extents. This is trivially handled, so do 2874 * Inline extents. This is trivially handled, so do
2588 * it up front. 2875 * it up front.
2589 */ 2876 */
2590 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2638,7 +2925,7 @@ rightmost_no_delete:
2638 */ 2925 */
2639 2926
2640 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2927 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2641 dealloc); 2928 dealloc, et);
2642 if (ret) 2929 if (ret)
2643 mlog_errno(ret); 2930 mlog_errno(ret);
2644 goto out; 2931 goto out;
@@ -2650,7 +2937,7 @@ rightmost_no_delete:
2650 */ 2937 */
2651try_rotate: 2938try_rotate:
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2939 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2653 dealloc, &restart_path); 2940 dealloc, &restart_path, et);
2654 if (ret && ret != -EAGAIN) { 2941 if (ret && ret != -EAGAIN) {
2655 mlog_errno(ret); 2942 mlog_errno(ret);
2656 goto out; 2943 goto out;
@@ -2662,7 +2949,7 @@ try_rotate:
2662 2949
2663 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 2950 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2664 tmp_path, dealloc, 2951 tmp_path, dealloc,
2665 &restart_path); 2952 &restart_path, et);
2666 if (ret && ret != -EAGAIN) { 2953 if (ret && ret != -EAGAIN) {
2667 mlog_errno(ret); 2954 mlog_errno(ret);
2668 goto out; 2955 goto out;
@@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
2948 handle_t *handle, 3235 handle_t *handle,
2949 struct ocfs2_extent_rec *split_rec, 3236 struct ocfs2_extent_rec *split_rec,
2950 struct ocfs2_cached_dealloc_ctxt *dealloc, 3237 struct ocfs2_cached_dealloc_ctxt *dealloc,
3238 struct ocfs2_extent_tree *et,
2951 int index) 3239 int index)
2952{ 3240{
2953 int ret, i, subtree_index = 0, has_empty_extent = 0; 3241 int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3068 le16_to_cpu(el->l_next_free_rec) == 1) { 3356 le16_to_cpu(el->l_next_free_rec) == 1) {
3069 3357
3070 ret = ocfs2_remove_rightmost_path(inode, handle, 3358 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc); 3359 right_path,
3360 dealloc, et);
3072 if (ret) { 3361 if (ret) {
3073 mlog_errno(ret); 3362 mlog_errno(ret);
3074 goto out; 3363 goto out;
@@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3095 int split_index, 3384 int split_index,
3096 struct ocfs2_extent_rec *split_rec, 3385 struct ocfs2_extent_rec *split_rec,
3097 struct ocfs2_cached_dealloc_ctxt *dealloc, 3386 struct ocfs2_cached_dealloc_ctxt *dealloc,
3098 struct ocfs2_merge_ctxt *ctxt) 3387 struct ocfs2_merge_ctxt *ctxt,
3388 struct ocfs2_extent_tree *et)
3099 3389
3100{ 3390{
3101 int ret = 0; 3391 int ret = 0;
@@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3113 * illegal. 3403 * illegal.
3114 */ 3404 */
3115 ret = ocfs2_rotate_tree_left(inode, handle, path, 3405 ret = ocfs2_rotate_tree_left(inode, handle, path,
3116 dealloc); 3406 dealloc, et);
3117 if (ret) { 3407 if (ret) {
3118 mlog_errno(ret); 3408 mlog_errno(ret);
3119 goto out; 3409 goto out;
@@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3446 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3157 3447
3158 /* The merge left us with an empty extent, remove it. */ 3448 /* The merge left us with an empty extent, remove it. */
3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 3449 ret = ocfs2_rotate_tree_left(inode, handle, path,
3450 dealloc, et);
3160 if (ret) { 3451 if (ret) {
3161 mlog_errno(ret); 3452 mlog_errno(ret);
3162 goto out; 3453 goto out;
@@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3170 */ 3461 */
3171 ret = ocfs2_merge_rec_left(inode, path, 3462 ret = ocfs2_merge_rec_left(inode, path,
3172 handle, rec, 3463 handle, rec,
3173 dealloc, 3464 dealloc, et,
3174 split_index); 3465 split_index);
3175 3466
3176 if (ret) { 3467 if (ret) {
@@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3179 } 3470 }
3180 3471
3181 ret = ocfs2_rotate_tree_left(inode, handle, path, 3472 ret = ocfs2_rotate_tree_left(inode, handle, path,
3182 dealloc); 3473 dealloc, et);
3183 /* 3474 /*
3184 * Error from this last rotate is not critical, so 3475 * Error from this last rotate is not critical, so
3185 * print but don't bubble it up. 3476 * print but don't bubble it up.
@@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3199 ret = ocfs2_merge_rec_left(inode, 3490 ret = ocfs2_merge_rec_left(inode,
3200 path, 3491 path,
3201 handle, split_rec, 3492 handle, split_rec,
3202 dealloc, 3493 dealloc, et,
3203 split_index); 3494 split_index);
3204 if (ret) { 3495 if (ret) {
3205 mlog_errno(ret); 3496 mlog_errno(ret);
@@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3222 * our leaf. Try to rotate it away. 3513 * our leaf. Try to rotate it away.
3223 */ 3514 */
3224 ret = ocfs2_rotate_tree_left(inode, handle, path, 3515 ret = ocfs2_rotate_tree_left(inode, handle, path,
3225 dealloc); 3516 dealloc, et);
3226 if (ret) 3517 if (ret)
3227 mlog_errno(ret); 3518 mlog_errno(ret);
3228 ret = 0; 3519 ret = 0;
@@ -3356,16 +3647,6 @@ rotate:
3356 ocfs2_rotate_leaf(el, insert_rec); 3647 ocfs2_rotate_leaf(el, insert_rec);
3357} 3648}
3358 3649
3359static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3360 struct ocfs2_dinode *di,
3361 u32 clusters)
3362{
3363 le32_add_cpu(&di->i_clusters, clusters);
3364 spin_lock(&OCFS2_I(inode)->ip_lock);
3365 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3366 spin_unlock(&OCFS2_I(inode)->ip_lock);
3367}
3368
3369static void ocfs2_adjust_rightmost_records(struct inode *inode, 3650static void ocfs2_adjust_rightmost_records(struct inode *inode,
3370 handle_t *handle, 3651 handle_t *handle,
3371 struct ocfs2_path *path, 3652 struct ocfs2_path *path,
@@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
3567} 3848}
3568 3849
3569/* 3850/*
3570 * This function only does inserts on an allocation b-tree. For dinode 3851 * This function only does inserts on an allocation b-tree. For tree
3571 * lists, ocfs2_insert_at_leaf() is called directly. 3852 * depth = 0, ocfs2_insert_at_leaf() is called directly.
3572 * 3853 *
3573 * right_path is the path we want to do the actual insert 3854 * right_path is the path we want to do the actual insert
3574 * in. left_path should only be passed in if we need to update that 3855 * in. left_path should only be passed in if we need to update that
@@ -3665,7 +3946,7 @@ out:
3665 3946
3666static int ocfs2_do_insert_extent(struct inode *inode, 3947static int ocfs2_do_insert_extent(struct inode *inode,
3667 handle_t *handle, 3948 handle_t *handle,
3668 struct buffer_head *di_bh, 3949 struct ocfs2_extent_tree *et,
3669 struct ocfs2_extent_rec *insert_rec, 3950 struct ocfs2_extent_rec *insert_rec,
3670 struct ocfs2_insert_type *type) 3951 struct ocfs2_insert_type *type)
3671{ 3952{
@@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3673 u32 cpos; 3954 u32 cpos;
3674 struct ocfs2_path *right_path = NULL; 3955 struct ocfs2_path *right_path = NULL;
3675 struct ocfs2_path *left_path = NULL; 3956 struct ocfs2_path *left_path = NULL;
3676 struct ocfs2_dinode *di;
3677 struct ocfs2_extent_list *el; 3957 struct ocfs2_extent_list *el;
3678 3958
3679 di = (struct ocfs2_dinode *) di_bh->b_data; 3959 el = et->et_root_el;
3680 el = &di->id2.i_list;
3681 3960
3682 ret = ocfs2_journal_access(handle, inode, di_bh, 3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3683 OCFS2_JOURNAL_ACCESS_WRITE); 3962 OCFS2_JOURNAL_ACCESS_WRITE);
3684 if (ret) { 3963 if (ret) {
3685 mlog_errno(ret); 3964 mlog_errno(ret);
@@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3691 goto out_update_clusters; 3970 goto out_update_clusters;
3692 } 3971 }
3693 3972
3694 right_path = ocfs2_new_inode_path(di_bh); 3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3695 if (!right_path) { 3974 if (!right_path) {
3696 ret = -ENOMEM; 3975 ret = -ENOMEM;
3697 mlog_errno(ret); 3976 mlog_errno(ret);
@@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3741 * ocfs2_rotate_tree_right() might have extended the 4020 * ocfs2_rotate_tree_right() might have extended the
3742 * transaction without re-journaling our tree root. 4021 * transaction without re-journaling our tree root.
3743 */ 4022 */
3744 ret = ocfs2_journal_access(handle, inode, di_bh, 4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3745 OCFS2_JOURNAL_ACCESS_WRITE); 4024 OCFS2_JOURNAL_ACCESS_WRITE);
3746 if (ret) { 4025 if (ret) {
3747 mlog_errno(ret); 4026 mlog_errno(ret);
@@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3766 4045
3767out_update_clusters: 4046out_update_clusters:
3768 if (type->ins_split == SPLIT_NONE) 4047 if (type->ins_split == SPLIT_NONE)
3769 ocfs2_update_dinode_clusters(inode, di, 4048 ocfs2_et_update_clusters(inode, et,
3770 le16_to_cpu(insert_rec->e_leaf_clusters)); 4049 le16_to_cpu(insert_rec->e_leaf_clusters));
3771 4050
3772 ret = ocfs2_journal_dirty(handle, di_bh); 4051 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
3773 if (ret) 4052 if (ret)
3774 mlog_errno(ret); 4053 mlog_errno(ret);
3775 4054
@@ -3899,7 +4178,8 @@ out:
3899static void ocfs2_figure_contig_type(struct inode *inode, 4178static void ocfs2_figure_contig_type(struct inode *inode,
3900 struct ocfs2_insert_type *insert, 4179 struct ocfs2_insert_type *insert,
3901 struct ocfs2_extent_list *el, 4180 struct ocfs2_extent_list *el,
3902 struct ocfs2_extent_rec *insert_rec) 4181 struct ocfs2_extent_rec *insert_rec,
4182 struct ocfs2_extent_tree *et)
3903{ 4183{
3904 int i; 4184 int i;
3905 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4185 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3915 } 4195 }
3916 } 4196 }
3917 insert->ins_contig = contig_type; 4197 insert->ins_contig = contig_type;
4198
4199 if (insert->ins_contig != CONTIG_NONE) {
4200 struct ocfs2_extent_rec *rec =
4201 &el->l_recs[insert->ins_contig_index];
4202 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4203 le16_to_cpu(insert_rec->e_leaf_clusters);
4204
4205 /*
4206 * Caller might want us to limit the size of extents, don't
4207 * calculate contiguousness if we might exceed that limit.
4208 */
4209 if (et->et_max_leaf_clusters &&
4210 (len > et->et_max_leaf_clusters))
4211 insert->ins_contig = CONTIG_NONE;
4212 }
3918} 4213}
3919 4214
3920/* 4215/*
@@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3923 * ocfs2_figure_appending_type() will figure out whether we'll have to 4218 * ocfs2_figure_appending_type() will figure out whether we'll have to
3924 * insert at the tail of the rightmost leaf. 4219 * insert at the tail of the rightmost leaf.
3925 * 4220 *
3926 * This should also work against the dinode list for tree's with 0 4221 * This should also work against the root extent list for tree's with 0
3927 * depth. If we consider the dinode list to be the rightmost leaf node 4222 * depth. If we consider the root extent list to be the rightmost leaf node
3928 * then the logic here makes sense. 4223 * then the logic here makes sense.
3929 */ 4224 */
3930static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4225static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3975,14 +4270,13 @@ set_tail_append:
3975 * structure. 4270 * structure.
3976 */ 4271 */
3977static int ocfs2_figure_insert_type(struct inode *inode, 4272static int ocfs2_figure_insert_type(struct inode *inode,
3978 struct buffer_head *di_bh, 4273 struct ocfs2_extent_tree *et,
3979 struct buffer_head **last_eb_bh, 4274 struct buffer_head **last_eb_bh,
3980 struct ocfs2_extent_rec *insert_rec, 4275 struct ocfs2_extent_rec *insert_rec,
3981 int *free_records, 4276 int *free_records,
3982 struct ocfs2_insert_type *insert) 4277 struct ocfs2_insert_type *insert)
3983{ 4278{
3984 int ret; 4279 int ret;
3985 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3986 struct ocfs2_extent_block *eb; 4280 struct ocfs2_extent_block *eb;
3987 struct ocfs2_extent_list *el; 4281 struct ocfs2_extent_list *el;
3988 struct ocfs2_path *path = NULL; 4282 struct ocfs2_path *path = NULL;
@@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3990 4284
3991 insert->ins_split = SPLIT_NONE; 4285 insert->ins_split = SPLIT_NONE;
3992 4286
3993 el = &di->id2.i_list; 4287 el = et->et_root_el;
3994 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4288 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3995 4289
3996 if (el->l_tree_depth) { 4290 if (el->l_tree_depth) {
@@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4000 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4294 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4001 * may want it later. 4295 * may want it later.
4002 */ 4296 */
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
4004 le64_to_cpu(di->i_last_eb_blk), &bh,
4005 OCFS2_BH_CACHED, inode);
4006 if (ret) { 4298 if (ret) {
4007 mlog_exit(ret); 4299 mlog_exit(ret);
4008 goto out; 4300 goto out;
@@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4023 le16_to_cpu(el->l_next_free_rec); 4315 le16_to_cpu(el->l_next_free_rec);
4024 4316
4025 if (!insert->ins_tree_depth) { 4317 if (!insert->ins_tree_depth) {
4026 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4318 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4027 ocfs2_figure_appending_type(insert, el, insert_rec); 4319 ocfs2_figure_appending_type(insert, el, insert_rec);
4028 return 0; 4320 return 0;
4029 } 4321 }
4030 4322
4031 path = ocfs2_new_inode_path(di_bh); 4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4032 if (!path) { 4324 if (!path) {
4033 ret = -ENOMEM; 4325 ret = -ENOMEM;
4034 mlog_errno(ret); 4326 mlog_errno(ret);
@@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4057 * into two types of appends: simple record append, or a 4349 * into two types of appends: simple record append, or a
4058 * rotate inside the tail leaf. 4350 * rotate inside the tail leaf.
4059 */ 4351 */
4060 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4352 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4061 4353
4062 /* 4354 /*
4063 * The insert code isn't quite ready to deal with all cases of 4355 * The insert code isn't quite ready to deal with all cases of
@@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4078 * the case that we're doing a tail append, so maybe we can 4370 * the case that we're doing a tail append, so maybe we can
4079 * take advantage of that information somehow. 4371 * take advantage of that information somehow.
4080 */ 4372 */
4081 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 4373 if (ocfs2_et_get_last_eb_blk(et) ==
4374 path_leaf_bh(path)->b_blocknr) {
4082 /* 4375 /*
4083 * Ok, ocfs2_find_path() returned us the rightmost 4376 * Ok, ocfs2_find_path() returned us the rightmost
4084 * tree path. This might be an appending insert. There are 4377 * tree path. This might be an appending insert. There are
@@ -4108,7 +4401,7 @@ out:
4108int ocfs2_insert_extent(struct ocfs2_super *osb, 4401int ocfs2_insert_extent(struct ocfs2_super *osb,
4109 handle_t *handle, 4402 handle_t *handle,
4110 struct inode *inode, 4403 struct inode *inode,
4111 struct buffer_head *fe_bh, 4404 struct ocfs2_extent_tree *et,
4112 u32 cpos, 4405 u32 cpos,
4113 u64 start_blk, 4406 u64 start_blk,
4114 u32 new_clusters, 4407 u32 new_clusters,
@@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4121 struct ocfs2_insert_type insert = {0, }; 4414 struct ocfs2_insert_type insert = {0, };
4122 struct ocfs2_extent_rec rec; 4415 struct ocfs2_extent_rec rec;
4123 4416
4124 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4125
4126 mlog(0, "add %u clusters at position %u to inode %llu\n", 4417 mlog(0, "add %u clusters at position %u to inode %llu\n",
4127 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4418 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4128 4419
4129 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4130 (OCFS2_I(inode)->ip_clusters != cpos),
4131 "Device %s, asking for sparse allocation: inode %llu, "
4132 "cpos %u, clusters %u\n",
4133 osb->dev_str,
4134 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4135 OCFS2_I(inode)->ip_clusters);
4136
4137 memset(&rec, 0, sizeof(rec)); 4420 memset(&rec, 0, sizeof(rec));
4138 rec.e_cpos = cpu_to_le32(cpos); 4421 rec.e_cpos = cpu_to_le32(cpos);
4139 rec.e_blkno = cpu_to_le64(start_blk); 4422 rec.e_blkno = cpu_to_le64(start_blk);
4140 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4423 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4141 rec.e_flags = flags; 4424 rec.e_flags = flags;
4425 status = ocfs2_et_insert_check(inode, et, &rec);
4426 if (status) {
4427 mlog_errno(status);
4428 goto bail;
4429 }
4142 4430
4143 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 4431 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4144 &free_records, &insert); 4432 &free_records, &insert);
4145 if (status < 0) { 4433 if (status < 0) {
4146 mlog_errno(status); 4434 mlog_errno(status);
@@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4154 free_records, insert.ins_tree_depth); 4442 free_records, insert.ins_tree_depth);
4155 4443
4156 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4444 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4157 status = ocfs2_grow_tree(inode, handle, fe_bh, 4445 status = ocfs2_grow_tree(inode, handle, et,
4158 &insert.ins_tree_depth, &last_eb_bh, 4446 &insert.ins_tree_depth, &last_eb_bh,
4159 meta_ac); 4447 meta_ac);
4160 if (status) { 4448 if (status) {
@@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4164 } 4452 }
4165 4453
4166 /* Finally, we can add clusters. This might rotate the tree for us. */ 4454 /* Finally, we can add clusters. This might rotate the tree for us. */
4167 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 4455 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4168 if (status < 0) 4456 if (status < 0)
4169 mlog_errno(status); 4457 mlog_errno(status);
4170 else 4458 else if (et->et_ops == &ocfs2_dinode_et_ops)
4171 ocfs2_extent_map_insert_rec(inode, &rec); 4459 ocfs2_extent_map_insert_rec(inode, &rec);
4172 4460
4173bail: 4461bail:
4174 if (last_eb_bh) 4462 brelse(last_eb_bh);
4175 brelse(last_eb_bh); 4463
4464 mlog_exit(status);
4465 return status;
4466}
4467
4468/*
4469 * Allocate and add clusters into the extent b-tree.
4470 * The new clusters(clusters_to_add) will be inserted at logical_offset.
4471 * The extent b-tree's root is specified by et, and
4472 * it is not limited to the file storage. Any extent tree can use this
4473 * function if it implements the proper ocfs2_extent_tree.
4474 */
4475int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4476 struct inode *inode,
4477 u32 *logical_offset,
4478 u32 clusters_to_add,
4479 int mark_unwritten,
4480 struct ocfs2_extent_tree *et,
4481 handle_t *handle,
4482 struct ocfs2_alloc_context *data_ac,
4483 struct ocfs2_alloc_context *meta_ac,
4484 enum ocfs2_alloc_restarted *reason_ret)
4485{
4486 int status = 0;
4487 int free_extents;
4488 enum ocfs2_alloc_restarted reason = RESTART_NONE;
4489 u32 bit_off, num_bits;
4490 u64 block;
4491 u8 flags = 0;
4492
4493 BUG_ON(!clusters_to_add);
4494
4495 if (mark_unwritten)
4496 flags = OCFS2_EXT_UNWRITTEN;
4497
4498 free_extents = ocfs2_num_free_extents(osb, inode, et);
4499 if (free_extents < 0) {
4500 status = free_extents;
4501 mlog_errno(status);
4502 goto leave;
4503 }
4504
4505 /* there are two cases which could cause us to EAGAIN in the
4506 * we-need-more-metadata case:
4507 * 1) we haven't reserved *any*
4508 * 2) we are so fragmented, we've needed to add metadata too
4509 * many times. */
4510 if (!free_extents && !meta_ac) {
4511 mlog(0, "we haven't reserved any metadata!\n");
4512 status = -EAGAIN;
4513 reason = RESTART_META;
4514 goto leave;
4515 } else if ((!free_extents)
4516 && (ocfs2_alloc_context_bits_left(meta_ac)
4517 < ocfs2_extend_meta_needed(et->et_root_el))) {
4518 mlog(0, "filesystem is really fragmented...\n");
4519 status = -EAGAIN;
4520 reason = RESTART_META;
4521 goto leave;
4522 }
4523
4524 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4525 clusters_to_add, &bit_off, &num_bits);
4526 if (status < 0) {
4527 if (status != -ENOSPC)
4528 mlog_errno(status);
4529 goto leave;
4530 }
4176 4531
4532 BUG_ON(num_bits > clusters_to_add);
4533
4534 /* reserve our write early -- insert_extent may update the inode */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4536 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) {
4538 mlog_errno(status);
4539 goto leave;
4540 }
4541
4542 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4543 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4544 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4545 status = ocfs2_insert_extent(osb, handle, inode, et,
4546 *logical_offset, block,
4547 num_bits, flags, meta_ac);
4548 if (status < 0) {
4549 mlog_errno(status);
4550 goto leave;
4551 }
4552
4553 status = ocfs2_journal_dirty(handle, et->et_root_bh);
4554 if (status < 0) {
4555 mlog_errno(status);
4556 goto leave;
4557 }
4558
4559 clusters_to_add -= num_bits;
4560 *logical_offset += num_bits;
4561
4562 if (clusters_to_add) {
4563 mlog(0, "need to alloc once more, wanted = %u\n",
4564 clusters_to_add);
4565 status = -EAGAIN;
4566 reason = RESTART_TRANS;
4567 }
4568
4569leave:
4177 mlog_exit(status); 4570 mlog_exit(status);
4571 if (reason_ret)
4572 *reason_ret = reason;
4178 return status; 4573 return status;
4179} 4574}
4180 4575
@@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4201static int ocfs2_split_and_insert(struct inode *inode, 4596static int ocfs2_split_and_insert(struct inode *inode,
4202 handle_t *handle, 4597 handle_t *handle,
4203 struct ocfs2_path *path, 4598 struct ocfs2_path *path,
4204 struct buffer_head *di_bh, 4599 struct ocfs2_extent_tree *et,
4205 struct buffer_head **last_eb_bh, 4600 struct buffer_head **last_eb_bh,
4206 int split_index, 4601 int split_index,
4207 struct ocfs2_extent_rec *orig_split_rec, 4602 struct ocfs2_extent_rec *orig_split_rec,
@@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
4215 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4610 struct ocfs2_extent_rec split_rec = *orig_split_rec;
4216 struct ocfs2_insert_type insert; 4611 struct ocfs2_insert_type insert;
4217 struct ocfs2_extent_block *eb; 4612 struct ocfs2_extent_block *eb;
4218 struct ocfs2_dinode *di;
4219 4613
4220leftright: 4614leftright:
4221 /* 4615 /*
@@ -4224,8 +4618,7 @@ leftright:
4224 */ 4618 */
4225 rec = path_leaf_el(path)->l_recs[split_index]; 4619 rec = path_leaf_el(path)->l_recs[split_index];
4226 4620
4227 di = (struct ocfs2_dinode *)di_bh->b_data; 4621 rightmost_el = et->et_root_el;
4228 rightmost_el = &di->id2.i_list;
4229 4622
4230 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4623 depth = le16_to_cpu(rightmost_el->l_tree_depth);
4231 if (depth) { 4624 if (depth) {
@@ -4236,8 +4629,8 @@ leftright:
4236 4629
4237 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4630 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4238 le16_to_cpu(rightmost_el->l_count)) { 4631 le16_to_cpu(rightmost_el->l_count)) {
4239 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, 4632 ret = ocfs2_grow_tree(inode, handle, et,
4240 meta_ac); 4633 &depth, last_eb_bh, meta_ac);
4241 if (ret) { 4634 if (ret) {
4242 mlog_errno(ret); 4635 mlog_errno(ret);
4243 goto out; 4636 goto out;
@@ -4274,8 +4667,7 @@ leftright:
4274 do_leftright = 1; 4667 do_leftright = 1;
4275 } 4668 }
4276 4669
4277 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, 4670 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4278 &insert);
4279 if (ret) { 4671 if (ret) {
4280 mlog_errno(ret); 4672 mlog_errno(ret);
4281 goto out; 4673 goto out;
@@ -4317,8 +4709,9 @@ out:
4317 * of the tree is required. All other cases will degrade into a less 4709 * of the tree is required. All other cases will degrade into a less
4318 * optimal tree layout. 4710 * optimal tree layout.
4319 * 4711 *
4320 * last_eb_bh should be the rightmost leaf block for any inode with a 4712 * last_eb_bh should be the rightmost leaf block for any extent
4321 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. 4713 * btree. Since a split may grow the tree or a merge might shrink it,
4714 * the caller cannot trust the contents of that buffer after this call.
4322 * 4715 *
4323 * This code is optimized for readability - several passes might be 4716 * This code is optimized for readability - several passes might be
4324 * made over certain portions of the tree. All of those blocks will 4717 * made over certain portions of the tree. All of those blocks will
@@ -4326,7 +4719,7 @@ out:
4326 * extra overhead is not expressed in terms of disk reads. 4719 * extra overhead is not expressed in terms of disk reads.
4327 */ 4720 */
4328static int __ocfs2_mark_extent_written(struct inode *inode, 4721static int __ocfs2_mark_extent_written(struct inode *inode,
4329 struct buffer_head *di_bh, 4722 struct ocfs2_extent_tree *et,
4330 handle_t *handle, 4723 handle_t *handle,
4331 struct ocfs2_path *path, 4724 struct ocfs2_path *path,
4332 int split_index, 4725 int split_index,
@@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4366 */ 4759 */
4367 if (path->p_tree_depth) { 4760 if (path->p_tree_depth) {
4368 struct ocfs2_extent_block *eb; 4761 struct ocfs2_extent_block *eb;
4369 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4370 4762
4371 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4372 le64_to_cpu(di->i_last_eb_blk), 4764 &last_eb_bh);
4373 &last_eb_bh, OCFS2_BH_CACHED, inode);
4374 if (ret) { 4765 if (ret) {
4375 mlog_exit(ret); 4766 mlog_exit(ret);
4376 goto out; 4767 goto out;
@@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4403 if (ctxt.c_split_covers_rec) 4794 if (ctxt.c_split_covers_rec)
4404 el->l_recs[split_index] = *split_rec; 4795 el->l_recs[split_index] = *split_rec;
4405 else 4796 else
4406 ret = ocfs2_split_and_insert(inode, handle, path, di_bh, 4797 ret = ocfs2_split_and_insert(inode, handle, path, et,
4407 &last_eb_bh, split_index, 4798 &last_eb_bh, split_index,
4408 split_rec, meta_ac); 4799 split_rec, meta_ac);
4409 if (ret) 4800 if (ret)
@@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4411 } else { 4802 } else {
4412 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4803 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4413 split_index, split_rec, 4804 split_index, split_rec,
4414 dealloc, &ctxt); 4805 dealloc, &ctxt, et);
4415 if (ret) 4806 if (ret)
4416 mlog_errno(ret); 4807 mlog_errno(ret);
4417 } 4808 }
@@ -4429,7 +4820,8 @@ out:
4429 * 4820 *
4430 * The caller is responsible for passing down meta_ac if we'll need it. 4821 * The caller is responsible for passing down meta_ac if we'll need it.
4431 */ 4822 */
4432int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 4823int ocfs2_mark_extent_written(struct inode *inode,
4824 struct ocfs2_extent_tree *et,
4433 handle_t *handle, u32 cpos, u32 len, u32 phys, 4825 handle_t *handle, u32 cpos, u32 len, u32 phys,
4434 struct ocfs2_alloc_context *meta_ac, 4826 struct ocfs2_alloc_context *meta_ac,
4435 struct ocfs2_cached_dealloc_ctxt *dealloc) 4827 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4455 /* 4847 /*
4456 * XXX: This should be fixed up so that we just re-insert the 4848 * XXX: This should be fixed up so that we just re-insert the
4457 * next extent records. 4849 * next extent records.
4850 *
4851 * XXX: This is a hack on the extent tree, maybe it should be
4852 * an op?
4458 */ 4853 */
4459 ocfs2_extent_map_trunc(inode, 0); 4854 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0);
4460 4856
4461 left_path = ocfs2_new_inode_path(di_bh); 4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4462 if (!left_path) { 4858 if (!left_path) {
4463 ret = -ENOMEM; 4859 ret = -ENOMEM;
4464 mlog_errno(ret); 4860 mlog_errno(ret);
@@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4489 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4885 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4490 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4886 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4491 4887
4492 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, 4888 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4493 index, &split_rec, meta_ac, dealloc); 4889 index, &split_rec, meta_ac,
4890 dealloc);
4494 if (ret) 4891 if (ret)
4495 mlog_errno(ret); 4892 mlog_errno(ret);
4496 4893
@@ -4499,13 +4896,12 @@ out:
4499 return ret; 4896 return ret;
4500} 4897}
4501 4898
4502static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, 4899static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4503 handle_t *handle, struct ocfs2_path *path, 4900 handle_t *handle, struct ocfs2_path *path,
4504 int index, u32 new_range, 4901 int index, u32 new_range,
4505 struct ocfs2_alloc_context *meta_ac) 4902 struct ocfs2_alloc_context *meta_ac)
4506{ 4903{
4507 int ret, depth, credits = handle->h_buffer_credits; 4904 int ret, depth, credits = handle->h_buffer_credits;
4508 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4509 struct buffer_head *last_eb_bh = NULL; 4905 struct buffer_head *last_eb_bh = NULL;
4510 struct ocfs2_extent_block *eb; 4906 struct ocfs2_extent_block *eb;
4511 struct ocfs2_extent_list *rightmost_el, *el; 4907 struct ocfs2_extent_list *rightmost_el, *el;
@@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4522 4918
4523 depth = path->p_tree_depth; 4919 depth = path->p_tree_depth;
4524 if (depth > 0) { 4920 if (depth > 0) {
4525 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4526 le64_to_cpu(di->i_last_eb_blk), 4922 &last_eb_bh);
4527 &last_eb_bh, OCFS2_BH_CACHED, inode);
4528 if (ret < 0) { 4923 if (ret < 0) {
4529 mlog_errno(ret); 4924 mlog_errno(ret);
4530 goto out; 4925 goto out;
@@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4535 } else 4930 } else
4536 rightmost_el = path_leaf_el(path); 4931 rightmost_el = path_leaf_el(path);
4537 4932
4538 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); 4933 credits += path->p_tree_depth +
4934 ocfs2_extend_meta_needed(et->et_root_el);
4539 ret = ocfs2_extend_trans(handle, credits); 4935 ret = ocfs2_extend_trans(handle, credits);
4540 if (ret) { 4936 if (ret) {
4541 mlog_errno(ret); 4937 mlog_errno(ret);
@@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4544 4940
4545 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4941 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4546 le16_to_cpu(rightmost_el->l_count)) { 4942 le16_to_cpu(rightmost_el->l_count)) {
4547 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, 4943 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4548 meta_ac); 4944 meta_ac);
4549 if (ret) { 4945 if (ret) {
4550 mlog_errno(ret); 4946 mlog_errno(ret);
@@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4558 insert.ins_split = SPLIT_RIGHT; 4954 insert.ins_split = SPLIT_RIGHT;
4559 insert.ins_tree_depth = depth; 4955 insert.ins_tree_depth = depth;
4560 4956
4561 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); 4957 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4562 if (ret) 4958 if (ret)
4563 mlog_errno(ret); 4959 mlog_errno(ret);
4564 4960
@@ -4570,7 +4966,8 @@ out:
4570static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 4966static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4571 struct ocfs2_path *path, int index, 4967 struct ocfs2_path *path, int index,
4572 struct ocfs2_cached_dealloc_ctxt *dealloc, 4968 struct ocfs2_cached_dealloc_ctxt *dealloc,
4573 u32 cpos, u32 len) 4969 u32 cpos, u32 len,
4970 struct ocfs2_extent_tree *et)
4574{ 4971{
4575 int ret; 4972 int ret;
4576 u32 left_cpos, rec_range, trunc_range; 4973 u32 left_cpos, rec_range, trunc_range;
@@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4582 struct ocfs2_extent_block *eb; 4979 struct ocfs2_extent_block *eb;
4583 4980
4584 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 4981 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4585 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 4982 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4586 if (ret) { 4983 if (ret) {
4587 mlog_errno(ret); 4984 mlog_errno(ret);
4588 goto out; 4985 goto out;
@@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4713 5110
4714 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5111 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4715 5112
4716 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 5113 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4717 if (ret) { 5114 if (ret) {
4718 mlog_errno(ret); 5115 mlog_errno(ret);
4719 goto out; 5116 goto out;
@@ -4724,7 +5121,8 @@ out:
4724 return ret; 5121 return ret;
4725} 5122}
4726 5123
4727int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 5124int ocfs2_remove_extent(struct inode *inode,
5125 struct ocfs2_extent_tree *et,
4728 u32 cpos, u32 len, handle_t *handle, 5126 u32 cpos, u32 len, handle_t *handle,
4729 struct ocfs2_alloc_context *meta_ac, 5127 struct ocfs2_alloc_context *meta_ac,
4730 struct ocfs2_cached_dealloc_ctxt *dealloc) 5128 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4733 u32 rec_range, trunc_range; 5131 u32 rec_range, trunc_range;
4734 struct ocfs2_extent_rec *rec; 5132 struct ocfs2_extent_rec *rec;
4735 struct ocfs2_extent_list *el; 5133 struct ocfs2_extent_list *el;
4736 struct ocfs2_path *path; 5134 struct ocfs2_path *path = NULL;
4737 5135
4738 ocfs2_extent_map_trunc(inode, 0); 5136 ocfs2_extent_map_trunc(inode, 0);
4739 5137
4740 path = ocfs2_new_inode_path(di_bh); 5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4741 if (!path) { 5139 if (!path) {
4742 ret = -ENOMEM; 5140 ret = -ENOMEM;
4743 mlog_errno(ret); 5141 mlog_errno(ret);
@@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4790 5188
4791 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5189 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4792 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5190 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4793 cpos, len); 5191 cpos, len, et);
4794 if (ret) { 5192 if (ret) {
4795 mlog_errno(ret); 5193 mlog_errno(ret);
4796 goto out; 5194 goto out;
4797 } 5195 }
4798 } else { 5196 } else {
4799 ret = ocfs2_split_tree(inode, di_bh, handle, path, index, 5197 ret = ocfs2_split_tree(inode, et, handle, path, index,
4800 trunc_range, meta_ac); 5198 trunc_range, meta_ac);
4801 if (ret) { 5199 if (ret) {
4802 mlog_errno(ret); 5200 mlog_errno(ret);
@@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4845 } 5243 }
4846 5244
4847 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5245 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4848 cpos, len); 5246 cpos, len, et);
4849 if (ret) { 5247 if (ret) {
4850 mlog_errno(ret); 5248 mlog_errno(ret);
4851 goto out; 5249 goto out;
@@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5188 goto bail; 5586 goto bail;
5189 } 5587 }
5190 5588
5191 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
5192 OCFS2_BH_CACHED, inode);
5193 if (status < 0) { 5590 if (status < 0) {
5194 iput(inode); 5591 iput(inode);
5195 mlog_errno(status); 5592 mlog_errno(status);
@@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5264bail: 5661bail:
5265 if (tl_inode) 5662 if (tl_inode)
5266 iput(tl_inode); 5663 iput(tl_inode);
5267 if (tl_bh) 5664 brelse(tl_bh);
5268 brelse(tl_bh);
5269 5665
5270 if (status < 0 && (*tl_copy)) { 5666 if (status < 0 && (*tl_copy)) {
5271 kfree(*tl_copy); 5667 kfree(*tl_copy);
@@ -6008,20 +6404,13 @@ bail:
6008 return status; 6404 return status;
6009} 6405}
6010 6406
6011static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 6407static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6012{ 6408{
6013 set_buffer_uptodate(bh); 6409 set_buffer_uptodate(bh);
6014 mark_buffer_dirty(bh); 6410 mark_buffer_dirty(bh);
6015 return 0; 6411 return 0;
6016} 6412}
6017 6413
6018static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6019{
6020 set_buffer_uptodate(bh);
6021 mark_buffer_dirty(bh);
6022 return ocfs2_journal_dirty_data(handle, bh);
6023}
6024
6025static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6414static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6026 unsigned int from, unsigned int to, 6415 unsigned int from, unsigned int to,
6027 struct page *page, int zero, u64 *phys) 6416 struct page *page, int zero, u64 *phys)
@@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6040 * here if they aren't - ocfs2_map_page_blocks() 6429 * here if they aren't - ocfs2_map_page_blocks()
6041 * might've skipped some 6430 * might've skipped some
6042 */ 6431 */
6043 if (ocfs2_should_order_data(inode)) { 6432 ret = walk_page_buffers(handle, page_buffers(page),
6044 ret = walk_page_buffers(handle, 6433 from, to, &partial,
6045 page_buffers(page), 6434 ocfs2_zero_func);
6046 from, to, &partial, 6435 if (ret < 0)
6047 ocfs2_ordered_zero_func); 6436 mlog_errno(ret);
6048 if (ret < 0) 6437 else if (ocfs2_should_order_data(inode)) {
6049 mlog_errno(ret); 6438 ret = ocfs2_jbd2_file_inode(handle, inode);
6050 } else { 6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6051 ret = walk_page_buffers(handle, page_buffers(page), 6440 ret = walk_page_buffers(handle, page_buffers(page),
6052 from, to, &partial, 6441 from, to, &partial,
6053 ocfs2_writeback_zero_func); 6442 ocfs2_journal_dirty_data);
6443#endif
6054 if (ret < 0) 6444 if (ret < 0)
6055 mlog_errno(ret); 6445 mlog_errno(ret);
6056 } 6446 }
@@ -6215,20 +6605,29 @@ out:
6215 return ret; 6605 return ret;
6216} 6606}
6217 6607
6218static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) 6608static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6609 struct ocfs2_dinode *di)
6219{ 6610{
6220 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6611 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6612 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6221 6613
6222 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); 6614 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6615 memset(&di->id2, 0, blocksize -
6616 offsetof(struct ocfs2_dinode, id2) -
6617 xattrsize);
6618 else
6619 memset(&di->id2, 0, blocksize -
6620 offsetof(struct ocfs2_dinode, id2));
6223} 6621}
6224 6622
6225void ocfs2_dinode_new_extent_list(struct inode *inode, 6623void ocfs2_dinode_new_extent_list(struct inode *inode,
6226 struct ocfs2_dinode *di) 6624 struct ocfs2_dinode *di)
6227{ 6625{
6228 ocfs2_zero_dinode_id2(inode, di); 6626 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6229 di->id2.i_list.l_tree_depth = 0; 6627 di->id2.i_list.l_tree_depth = 0;
6230 di->id2.i_list.l_next_free_rec = 0; 6628 di->id2.i_list.l_next_free_rec = 0;
6231 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); 6629 di->id2.i_list.l_count = cpu_to_le16(
6630 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6232} 6631}
6233 6632
6234void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6633void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6245 * We clear the entire i_data structure here so that all 6644 * We clear the entire i_data structure here so that all
6246 * fields can be properly initialized. 6645 * fields can be properly initialized.
6247 */ 6646 */
6248 ocfs2_zero_dinode_id2(inode, di); 6647 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6249 6648
6250 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); 6649 idata->id_count = cpu_to_le16(
6650 ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6251} 6651}
6252 6652
6253int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6653int ocfs2_convert_inline_data_to_extents(struct inode *inode,
@@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6262 struct ocfs2_alloc_context *data_ac = NULL; 6662 struct ocfs2_alloc_context *data_ac = NULL;
6263 struct page **pages = NULL; 6663 struct page **pages = NULL;
6264 loff_t end = osb->s_clustersize; 6664 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et;
6265 6666
6266 has_data = i_size_read(inode) ? 1 : 0; 6667 has_data = i_size_read(inode) ? 1 : 0;
6267 6668
@@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6361 * this proves to be false, we could always re-build 6762 * this proves to be false, we could always re-build
6362 * the in-inode data from our pages. 6763 * the in-inode data from our pages.
6363 */ 6764 */
6364 ret = ocfs2_insert_extent(osb, handle, inode, di_bh, 6765 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6766 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6365 0, block, 1, 0, NULL); 6767 0, block, 1, 0, NULL);
6366 if (ret) { 6768 if (ret) {
6367 mlog_errno(ret); 6769 mlog_errno(ret);
@@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6404 handle_t *handle = NULL; 6806 handle_t *handle = NULL;
6405 struct inode *tl_inode = osb->osb_tl_inode; 6807 struct inode *tl_inode = osb->osb_tl_inode;
6406 struct ocfs2_path *path = NULL; 6808 struct ocfs2_path *path = NULL;
6809 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6407 6810
6408 mlog_entry_void(); 6811 mlog_entry_void();
6409 6812
6410 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6411 i_size_read(inode)); 6814 i_size_read(inode));
6412 6815
6413 path = ocfs2_new_inode_path(fe_bh); 6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6414 if (!path) { 6817 if (!path) {
6415 status = -ENOMEM; 6818 status = -ENOMEM;
6416 mlog_errno(status); 6819 mlog_errno(status);
@@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6581 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6582 6985
6583 if (fe->id2.i_list.l_tree_depth) { 6986 if (fe->id2.i_list.l_tree_depth) {
6584 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
6585 &last_eb_bh, OCFS2_BH_CACHED, inode); 6988 &last_eb_bh);
6586 if (status < 0) { 6989 if (status < 0) {
6587 mlog_errno(status); 6990 mlog_errno(status);
6588 goto bail; 6991 goto bail;
@@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6695 mlog(ML_NOTICE, 7098 mlog(ML_NOTICE,
6696 "Truncate completion has non-empty dealloc context\n"); 7099 "Truncate completion has non-empty dealloc context\n");
6697 7100
6698 if (tc->tc_last_eb_bh) 7101 brelse(tc->tc_last_eb_bh);
6699 brelse(tc->tc_last_eb_bh);
6700 7102
6701 kfree(tc); 7103 kfree(tc);
6702} 7104}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..70257c84cfbe 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
26#ifndef OCFS2_ALLOC_H 26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H 27#define OCFS2_ALLOC_H
28 28
29
30/*
31 * For xattr tree leaf, we limit the leaf byte size to be 64K.
32 */
33#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
34
35/*
36 * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
37 * the b-tree operations in ocfs2. The b-tree operations are no longer
38 * limited to ocfs2_dinode. Any data that needs to allocate clusters for
39 * storage can use a b-tree; it only needs to implement its own
40 * ocfs2_extent_tree and operations.
41 *
42 * ocfs2_extent_tree becomes the first-class object for extent tree
43 * manipulation. Callers of the alloc.c code need to fill it via one of
44 * the ocfs2_init_*_extent_tree() operations below.
45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions.
49 * ocfs2_extent_tree_operations abstract the normal operations we do for
50 * the root of extent b-tree.
51 */
52struct ocfs2_extent_tree_operations;
53struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el;
57 void *et_object;
58 unsigned int et_max_leaf_clusters;
59};
60
61/*
62 * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
63 * specified object buffer.
64 */
65void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
66 struct inode *inode,
67 struct buffer_head *bh);
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode,
70 struct buffer_head *bh);
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode,
73 struct buffer_head *bh,
74 struct ocfs2_xattr_value_root *xv);
75
29struct ocfs2_alloc_context; 76struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb, 77int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 78 handle_t *handle,
32 struct inode *inode, 79 struct inode *inode,
33 struct buffer_head *fe_bh, 80 struct ocfs2_extent_tree *et,
34 u32 cpos, 81 u32 cpos,
35 u64 start_blk, 82 u64 start_blk,
36 u32 new_clusters, 83 u32 new_clusters,
37 u8 flags, 84 u8 flags,
38 struct ocfs2_alloc_context *meta_ac); 85 struct ocfs2_alloc_context *meta_ac);
86
87enum ocfs2_alloc_restarted {
88 RESTART_NONE = 0,
89 RESTART_TRANS,
90 RESTART_META
91};
92int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
93 struct inode *inode,
94 u32 *logical_offset,
95 u32 clusters_to_add,
96 int mark_unwritten,
97 struct ocfs2_extent_tree *et,
98 handle_t *handle,
99 struct ocfs2_alloc_context *data_ac,
100 struct ocfs2_alloc_context *meta_ac,
101 enum ocfs2_alloc_restarted *reason_ret);
39struct ocfs2_cached_dealloc_ctxt; 102struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 103int ocfs2_mark_extent_written(struct inode *inode,
104 struct ocfs2_extent_tree *et,
41 handle_t *handle, u32 cpos, u32 len, u32 phys, 105 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac, 106 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc); 107 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 108int ocfs2_remove_extent(struct inode *inode,
109 struct ocfs2_extent_tree *et,
45 u32 cpos, u32 len, handle_t *handle, 110 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac, 111 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc); 112 struct ocfs2_cached_dealloc_ctxt *dealloc);
48int ocfs2_num_free_extents(struct ocfs2_super *osb, 113int ocfs2_num_free_extents(struct ocfs2_super *osb,
49 struct inode *inode, 114 struct inode *inode,
50 struct ocfs2_dinode *fe); 115 struct ocfs2_extent_tree *et);
51/* how many new metadata chunks would an allocation need at maximum? */ 116
52static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) 117/*
118 * how many new metadata chunks would an allocation need at maximum?
119 *
120 * Please note that the caller must make sure that root_el is the root
121 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
122 * the result may be wrong.
123 */
124static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
53{ 125{
54 /* 126 /*
55 * Rather than do all the work of determining how much we need 127 * Rather than do all the work of determining how much we need
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
59 * new tree_depth==0 extent_block, and one block at the new 131 * new tree_depth==0 extent_block, and one block at the new
60 * top-of-the tree. 132 * top-of-the tree.
61 */ 133 */
62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; 134 return le16_to_cpu(root_el->l_tree_depth) + 2;
63} 135}
64 136
65void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); 137void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
@@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
146 return le16_to_cpu(rec->e_leaf_clusters); 218 return le16_to_cpu(rec->e_leaf_clusters);
147} 219}
148 220
221/*
222 * This is only valid for leaf nodes, which are the only ones that can
223 * have empty extents anyway.
224 */
225static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
226{
227 return !rec->e_leaf_clusters;
228}
229
149#endif /* OCFS2_ALLOC_H */ 230#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb5078..c22543b33420 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 71 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
72 OCFS2_I(inode)->ip_blkno,
73 &bh, OCFS2_BH_CACHED, inode);
74 if (status < 0) { 72 if (status < 0) {
75 mlog_errno(status); 73 mlog_errno(status);
76 goto bail; 74 goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
128 err = 0; 126 err = 0;
129 127
130bail: 128bail:
131 if (bh) 129 brelse(bh);
132 brelse(bh);
133 130
134 mlog_exit(err); 131 mlog_exit(err);
135 return err; 132 return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
261{ 258{
262 int ret; 259 int ret;
263 struct buffer_head *di_bh = NULL; 260 struct buffer_head *di_bh = NULL;
264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
265 261
266 BUG_ON(!PageLocked(page)); 262 BUG_ON(!PageLocked(page));
267 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 263 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
268 264
269 ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, 265 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
270 OCFS2_BH_CACHED, inode);
271 if (ret) { 266 if (ret) {
272 mlog_errno(ret); 267 mlog_errno(ret);
273 goto out; 268 goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
485 } 480 }
486 481
487 if (ocfs2_should_order_data(inode)) { 482 if (ocfs2_should_order_data(inode)) {
483 ret = ocfs2_jbd2_file_inode(handle, inode);
484#ifdef CONFIG_OCFS2_COMPAT_JBD
488 ret = walk_page_buffers(handle, 485 ret = walk_page_buffers(handle,
489 page_buffers(page), 486 page_buffers(page),
490 from, to, NULL, 487 from, to, NULL,
491 ocfs2_journal_dirty_data); 488 ocfs2_journal_dirty_data);
492 if (ret < 0) 489#endif
490 if (ret < 0)
493 mlog_errno(ret); 491 mlog_errno(ret);
494 } 492 }
495out: 493out:
@@ -594,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
594 goto bail; 592 goto bail;
595 } 593 }
596 594
597 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 595 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
598 ocfs2_error(inode->i_sb, 596 ocfs2_error(inode->i_sb,
599 "Inode %llu has a hole at block %llu\n", 597 "Inode %llu has a hole at block %llu\n",
600 (unsigned long long)OCFS2_I(inode)->ip_blkno, 598 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
669{ 667{
670 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 668 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
671 669
672 journal_invalidatepage(journal, page, offset); 670 jbd2_journal_invalidatepage(journal, page, offset);
673} 671}
674 672
675static int ocfs2_releasepage(struct page *page, gfp_t wait) 673static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
678 676
679 if (!page_has_buffers(page)) 677 if (!page_has_buffers(page))
680 return 0; 678 return 0;
681 return journal_try_to_free_buffers(journal, page, wait); 679 return jbd2_journal_try_to_free_buffers(journal, page, wait);
682} 680}
683 681
684static ssize_t ocfs2_direct_IO(int rw, 682static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
1074 tmppage = wc->w_pages[i]; 1072 tmppage = wc->w_pages[i];
1075 1073
1076 if (page_has_buffers(tmppage)) { 1074 if (page_has_buffers(tmppage)) {
1077 if (ocfs2_should_order_data(inode)) 1075 if (ocfs2_should_order_data(inode)) {
1076 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1077#ifdef CONFIG_OCFS2_COMPAT_JBD
1078 walk_page_buffers(wc->w_handle, 1078 walk_page_buffers(wc->w_handle,
1079 page_buffers(tmppage), 1079 page_buffers(tmppage),
1080 from, to, NULL, 1080 from, to, NULL,
1081 ocfs2_journal_dirty_data); 1081 ocfs2_journal_dirty_data);
1082#endif
1083 }
1082 1084
1083 block_commit_write(tmppage, from, to); 1085 block_commit_write(tmppage, from, to);
1084 } 1086 }
@@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1242 int ret, i, new, should_zero = 0; 1244 int ret, i, new, should_zero = 0;
1243 u64 v_blkno, p_blkno; 1245 u64 v_blkno, p_blkno;
1244 struct inode *inode = mapping->host; 1246 struct inode *inode = mapping->host;
1247 struct ocfs2_extent_tree et;
1245 1248
1246 new = phys == 0 ? 1 : 0; 1249 new = phys == 0 ? 1 : 0;
1247 if (new || unwritten) 1250 if (new || unwritten)
@@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1255 * any additional semaphores or cluster locks. 1258 * any additional semaphores or cluster locks.
1256 */ 1259 */
1257 tmp_pos = cpos; 1260 tmp_pos = cpos;
1258 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1261 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1259 &tmp_pos, 1, 0, wc->w_di_bh, 1262 &tmp_pos, 1, 0, wc->w_di_bh,
1260 wc->w_handle, data_ac, 1263 wc->w_handle, data_ac,
1261 meta_ac, NULL); 1264 meta_ac, NULL);
1262 /* 1265 /*
1263 * This shouldn't happen because we must have already 1266 * This shouldn't happen because we must have already
1264 * calculated the correct meta data allocation required. The 1267 * calculated the correct meta data allocation required. The
@@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1276 goto out; 1279 goto out;
1277 } 1280 }
1278 } else if (unwritten) { 1281 } else if (unwritten) {
1279 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, 1282 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1283 ret = ocfs2_mark_extent_written(inode, &et,
1280 wc->w_handle, cpos, 1, phys, 1284 wc->w_handle, cpos, 1, phys,
1281 meta_ac, &wc->w_dealloc); 1285 meta_ac, &wc->w_dealloc);
1282 if (ret < 0) { 1286 if (ret < 0) {
@@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1665 struct ocfs2_alloc_context *data_ac = NULL; 1669 struct ocfs2_alloc_context *data_ac = NULL;
1666 struct ocfs2_alloc_context *meta_ac = NULL; 1670 struct ocfs2_alloc_context *meta_ac = NULL;
1667 handle_t *handle; 1671 handle_t *handle;
1672 struct ocfs2_extent_tree et;
1668 1673
1669 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1674 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1670 if (ret) { 1675 if (ret) {
@@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1712 * ocfs2_lock_allocators(). It greatly over-estimates 1717 * ocfs2_lock_allocators(). It greatly over-estimates
1713 * the work to be done. 1718 * the work to be done.
1714 */ 1719 */
1715 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, 1720 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1716 extents_to_split, &data_ac, &meta_ac); 1721 " clusters_to_add = %u, extents_to_split = %u\n",
1722 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1723 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1724 clusters_to_alloc, extents_to_split);
1725
1726 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1727 ret = ocfs2_lock_allocators(inode, &et,
1728 clusters_to_alloc, extents_to_split,
1729 &data_ac, &meta_ac);
1717 if (ret) { 1730 if (ret) {
1718 mlog_errno(ret); 1731 mlog_errno(ret);
1719 goto out; 1732 goto out;
1720 } 1733 }
1721 1734
1722 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1735 credits = ocfs2_calc_extend_credits(inode->i_sb,
1736 &di->id2.i_list,
1723 clusters_to_alloc); 1737 clusters_to_alloc);
1724 1738
1725 } 1739 }
@@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1905 } 1919 }
1906 1920
1907 if (page_has_buffers(tmppage)) { 1921 if (page_has_buffers(tmppage)) {
1908 if (ocfs2_should_order_data(inode)) 1922 if (ocfs2_should_order_data(inode)) {
1923 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1924#ifdef CONFIG_OCFS2_COMPAT_JBD
1909 walk_page_buffers(wc->w_handle, 1925 walk_page_buffers(wc->w_handle,
1910 page_buffers(tmppage), 1926 page_buffers(tmppage),
1911 from, to, NULL, 1927 from, to, NULL,
1912 ocfs2_journal_dirty_data); 1928 ocfs2_journal_dirty_data);
1929#endif
1930 }
1913 block_commit_write(tmppage, from, to); 1931 block_commit_write(tmppage, from, to);
1914 } 1932 }
1915 } 1933 }
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b41..7e947c672469 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
66 /* remove from dirty list before I/O. */ 66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh); 67 clear_buffer_dirty(bh);
68 68
69 get_bh(bh); /* for end_buffer_write_sync() */ 69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync; 70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh); 71 submit_bh(WRITE, bh);
72 72
@@ -88,22 +88,103 @@ out:
88 return ret; 88 return ret;
89} 89}
90 90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, 91int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
92 struct buffer_head *bhs[], int flags, 92 unsigned int nr, struct buffer_head *bhs[])
93 struct inode *inode) 93{
94 int status = 0;
95 unsigned int i;
96 struct buffer_head *bh;
97
98 if (!nr) {
99 mlog(ML_BH_IO, "No buffers will be read!\n");
100 goto bail;
101 }
102
103 for (i = 0 ; i < nr ; i++) {
104 if (bhs[i] == NULL) {
105 bhs[i] = sb_getblk(osb->sb, block++);
106 if (bhs[i] == NULL) {
107 status = -EIO;
108 mlog_errno(status);
109 goto bail;
110 }
111 }
112 bh = bhs[i];
113
114 if (buffer_jbd(bh)) {
115 mlog(ML_ERROR,
116 "trying to sync read a jbd "
117 "managed bh (blocknr = %llu), skipping\n",
118 (unsigned long long)bh->b_blocknr);
119 continue;
120 }
121
122 if (buffer_dirty(bh)) {
123 /* This should probably be a BUG, or
124 * at least return an error. */
125 mlog(ML_ERROR,
126 "trying to sync read a dirty "
127 "buffer! (blocknr = %llu), skipping\n",
128 (unsigned long long)bh->b_blocknr);
129 continue;
130 }
131
132 lock_buffer(bh);
133 if (buffer_jbd(bh)) {
134 mlog(ML_ERROR,
135 "block %llu had the JBD bit set "
136 "while I was in lock_buffer!",
137 (unsigned long long)bh->b_blocknr);
138 BUG();
139 }
140
141 clear_buffer_uptodate(bh);
142 get_bh(bh); /* for end_buffer_read_sync() */
143 bh->b_end_io = end_buffer_read_sync;
144 submit_bh(READ, bh);
145 }
146
147 for (i = nr; i > 0; i--) {
148 bh = bhs[i - 1];
149
150 if (buffer_jbd(bh)) {
151 mlog(ML_ERROR,
152 "the journal got the buffer while it was "
153 "locked for io! (blocknr = %llu)\n",
154 (unsigned long long)bh->b_blocknr);
155 BUG();
156 }
157
158 wait_on_buffer(bh);
159 if (!buffer_uptodate(bh)) {
160 /* Status won't be cleared from here on out,
161 * so we can safely record this and loop back
162 * to cleanup the other buffers. */
163 status = -EIO;
164 put_bh(bh);
165 bhs[i - 1] = NULL;
166 }
167 }
168
169bail:
170 return status;
171}
172
173int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
174 struct buffer_head *bhs[], int flags)
94{ 175{
95 int status = 0; 176 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0; 177 int i, ignore_cache = 0;
98 struct buffer_head *bh; 178 struct buffer_head *bh;
99 179
100 mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", 180 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
101 (unsigned long long)block, nr, flags, inode); 181 inode, (unsigned long long)block, nr, flags);
102 182
183 BUG_ON(!inode);
103 BUG_ON((flags & OCFS2_BH_READAHEAD) && 184 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
104 (!inode || !(flags & OCFS2_BH_CACHED))); 185 (flags & OCFS2_BH_IGNORE_CACHE));
105 186
106 if (osb == NULL || osb->sb == NULL || bhs == NULL) { 187 if (bhs == NULL) {
107 status = -EINVAL; 188 status = -EINVAL;
108 mlog_errno(status); 189 mlog_errno(status);
109 goto bail; 190 goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
122 goto bail; 203 goto bail;
123 } 204 }
124 205
125 sb = osb->sb; 206 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
126
127 if (flags & OCFS2_BH_CACHED && !inode)
128 flags &= ~OCFS2_BH_CACHED;
129
130 if (inode)
131 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
132 for (i = 0 ; i < nr ; i++) { 207 for (i = 0 ; i < nr ; i++) {
133 if (bhs[i] == NULL) { 208 if (bhs[i] == NULL) {
134 bhs[i] = sb_getblk(sb, block++); 209 bhs[i] = sb_getblk(inode->i_sb, block++);
135 if (bhs[i] == NULL) { 210 if (bhs[i] == NULL) {
136 if (inode) 211 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
137 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
138 status = -EIO; 212 status = -EIO;
139 mlog_errno(status); 213 mlog_errno(status);
140 goto bail; 214 goto bail;
141 } 215 }
142 } 216 }
143 bh = bhs[i]; 217 bh = bhs[i];
144 ignore_cache = 0; 218 ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
145 219
146 /* There are three read-ahead cases here which we need to 220 /* There are three read-ahead cases here which we need to
147 * be concerned with. All three assume a buffer has 221 * be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
167 * before our is-it-in-flight check. 241 * before our is-it-in-flight check.
168 */ 242 */
169 243
170 if (flags & OCFS2_BH_CACHED && 244 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
171 !ocfs2_buffer_uptodate(inode, bh)) {
172 mlog(ML_UPTODATE, 245 mlog(ML_UPTODATE,
173 "bh (%llu), inode %llu not uptodate\n", 246 "bh (%llu), inode %llu not uptodate\n",
174 (unsigned long long)bh->b_blocknr, 247 (unsigned long long)bh->b_blocknr,
175 (unsigned long long)OCFS2_I(inode)->ip_blkno); 248 (unsigned long long)OCFS2_I(inode)->ip_blkno);
249 /* We're using ignore_cache here to say
250 * "go to disk" */
176 ignore_cache = 1; 251 ignore_cache = 1;
177 } 252 }
178 253
179 /* XXX: Can we ever get this and *not* have the cached 254 /* XXX: Can we ever get this and *not* have the cached
180 * flag set? */ 255 * flag set? */
181 if (buffer_jbd(bh)) { 256 if (buffer_jbd(bh)) {
182 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) 257 if (ignore_cache)
183 mlog(ML_BH_IO, "trying to sync read a jbd " 258 mlog(ML_BH_IO, "trying to sync read a jbd "
184 "managed bh (blocknr = %llu)\n", 259 "managed bh (blocknr = %llu)\n",
185 (unsigned long long)bh->b_blocknr); 260 (unsigned long long)bh->b_blocknr);
186 continue; 261 continue;
187 } 262 }
188 263
189 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { 264 if (ignore_cache) {
190 if (buffer_dirty(bh)) { 265 if (buffer_dirty(bh)) {
191 /* This should probably be a BUG, or 266 /* This should probably be a BUG, or
192 * at least return an error. */ 267 * at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
221 * previously read-ahead buffer may have 296 * previously read-ahead buffer may have
222 * completed I/O while we were waiting for the 297 * completed I/O while we were waiting for the
223 * buffer lock. */ 298 * buffer lock. */
224 if ((flags & OCFS2_BH_CACHED) 299 if (!(flags & OCFS2_BH_IGNORE_CACHE)
225 && !(flags & OCFS2_BH_READAHEAD) 300 && !(flags & OCFS2_BH_READAHEAD)
226 && ocfs2_buffer_uptodate(inode, bh)) { 301 && ocfs2_buffer_uptodate(inode, bh)) {
227 unlock_buffer(bh); 302 unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
265 /* Always set the buffer in the cache, even if it was 340 /* Always set the buffer in the cache, even if it was
266 * a forced read, or read-ahead which hasn't yet 341 * a forced read, or read-ahead which hasn't yet
267 * completed. */ 342 * completed. */
268 if (inode) 343 ocfs2_set_buffer_uptodate(inode, bh);
269 ocfs2_set_buffer_uptodate(inode, bh);
270 } 344 }
271 if (inode) 345 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
272 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
273 346
274 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 347 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
275 (unsigned long long)block, nr, 348 (unsigned long long)block, nr,
276 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); 349 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
350 flags);
277 351
278bail: 352bail:
279 353
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e5..75e1dcb1ade7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh, 31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate); 32 int uptodate);
33 33
34static inline int ocfs2_read_block(struct ocfs2_super *osb, 34static inline int ocfs2_read_block(struct inode *inode,
35 u64 off, 35 u64 off,
36 struct buffer_head **bh, 36 struct buffer_head **bh);
37 int flags,
38 struct inode *inode);
39 37
40int ocfs2_write_block(struct ocfs2_super *osb, 38int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh, 39 struct buffer_head *bh,
42 struct inode *inode); 40 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb, 41int ocfs2_read_blocks(struct inode *inode,
44 u64 block, 42 u64 block,
45 int nr, 43 int nr,
46 struct buffer_head *bhs[], 44 struct buffer_head *bhs[],
47 int flags, 45 int flags);
48 struct inode *inode); 46int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
47 unsigned int nr, struct buffer_head *bhs[]);
49 48
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 49int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh); 50 struct buffer_head *bh);
52 51
53#define OCFS2_BH_CACHED 1 52#define OCFS2_BH_IGNORE_CACHE 1
54#define OCFS2_BH_READAHEAD 8 53#define OCFS2_BH_READAHEAD 8
55 54
56static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, 55static inline int ocfs2_read_block(struct inode *inode, u64 off,
57 struct buffer_head **bh, int flags, 56 struct buffer_head **bh)
58 struct inode *inode)
59{ 57{
60 int status = 0; 58 int status = 0;
61 59
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
65 goto bail; 63 goto bail;
66 } 64 }
67 65
68 status = ocfs2_read_blocks(osb, off, 1, bh, 66 status = ocfs2_read_blocks(inode, off, 1, bh, 0);
69 flags, inode);
70 67
71bail: 68bail:
72 return status; 69 return status;
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f27529..d8a0cb92cef6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
109 define_mask(CONN), 109 define_mask(CONN),
110 define_mask(QUORUM), 110 define_mask(QUORUM),
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR),
112 define_mask(ERROR), 113 define_mask(ERROR),
113 define_mask(NOTICE), 114 define_mask(NOTICE),
114 define_mask(KTHREAD), 115 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94f..57670c680471 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
112#define ML_CONN 0x0000000004000000ULL /* net connection management */ 112#define ML_CONN 0x0000000004000000ULL /* net connection management */
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ 113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115/* bits that are infrequently given and frequently matched in the high word */ 116/* bits that are infrequently given and frequently matched in the high word */
116#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
117#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 118#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index d8bfa0eb41b2..52276c02f710 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
138 " message id: %d\n" 138 " message id: %d\n"
139 " message type: %u\n" 139 " message type: %u\n"
140 " message key: 0x%08x\n" 140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n" 141 " sock acquiry: %lu.%ld\n"
142 " send start: %lu.%lu\n" 142 " send start: %lu.%ld\n"
143 " wait start: %lu.%lu\n", 143 " wait start: %lu.%ld\n",
144 nst, (unsigned long)nst->st_task->pid, 144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid, 145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node, 146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type, 147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key, 148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, 149 nst->st_sock_time.tv_sec,
150 (unsigned long)nst->st_sock_time.tv_usec, 150 (long)nst->st_sock_time.tv_usec,
151 nst->st_send_time.tv_sec, 151 nst->st_send_time.tv_sec,
152 (unsigned long)nst->st_send_time.tv_usec, 152 (long)nst->st_send_time.tv_usec,
153 nst->st_status_time.tv_sec, 153 nst->st_status_time.tv_sec,
154 nst->st_status_time.tv_usec); 154 (long)nst->st_status_time.tv_usec);
155 } 155 }
156 156
157 spin_unlock(&o2net_debug_lock); 157 spin_unlock(&o2net_debug_lock);
@@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 276 return sc; /* unused, just needs to be null when done */
277} 277}
278 278
279#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec 279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
280 280
281static int sc_seq_show(struct seq_file *seq, void *v) 281static int sc_seq_show(struct seq_file *seq, void *v)
282{ 282{
@@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
309 " remote node: %s\n" 309 " remote node: %s\n"
310 " page off: %zu\n" 310 " page off: %zu\n"
311 " handshake ok: %u\n" 311 " handshake ok: %u\n"
312 " timer: %lu.%lu\n" 312 " timer: %lu.%ld\n"
313 " data ready: %lu.%lu\n" 313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%lu\n" 314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%lu\n" 315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%lu\n" 316 " func start: %lu.%ld\n"
317 " func stop: %lu.%lu\n" 317 " func stop: %lu.%ld\n"
318 " func key: %u\n" 318 " func key: %u\n"
319 " func type: %u\n", 319 " func type: %u\n",
320 sc, 320 sc,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd6..2bcf706d9dd3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
146void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, 146static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
147 u32 msgkey, struct task_struct *task, u8 node) 147 u32 msgkey, struct task_struct *task, u8 node)
148{ 148{
149 INIT_LIST_HEAD(&nst->st_net_debug_item); 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 nst->st_task = task; 150 nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 do_gettimeofday(&nst->st_sock_time);
159} 159}
160 160
161void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 do_gettimeofday(&nst->st_send_time);
164} 164}
165 165
166void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 do_gettimeofday(&nst->st_status_time);
169} 169}
170 170
171void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
178{ 178{
179 nst->st_id = msg_id; 179 nst->st_id = msg_id;
180} 180}
181
182#else /* CONFIG_DEBUG_FS */
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{
187}
188
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
190{
191}
192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
194{
195}
196
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
198{
199}
200
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
202 struct o2net_sock_container *sc)
203{
204}
205
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
207 u32 msg_id)
208{
209}
210
181#endif /* CONFIG_DEBUG_FS */ 211#endif /* CONFIG_DEBUG_FS */
182 212
183static inline int o2net_reconnect_delay(void) 213static inline int o2net_reconnect_delay(void)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b77..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
224 struct timeval st_send_time; 224 struct timeval st_send_time;
225 struct timeval st_status_time; 225 struct timeval st_status_time;
226}; 226};
227
228void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
229 u32 msgkey, struct task_struct *task, u8 node);
230void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
231void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
232void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
233void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
234 struct o2net_sock_container *sc);
235void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
236
237#else 227#else
238struct o2net_send_tracking { 228struct o2net_send_tracking {
239 u32 dummy; 229 u32 dummy;
240}; 230};
241
242static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
243 u32 msgkey, struct task_struct *task, u8 node)
244{
245}
246static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
247{
248}
249static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
250{
251}
252static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
253{
254}
255static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
256 struct o2net_sock_container *sc)
257{
258}
259static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
260 u32 msg_id)
261{
262}
263#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
264 232
265#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a1875848080..026e6eb85187 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
82 struct ocfs2_alloc_context *meta_ac, 82 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh); 83 struct buffer_head **new_bh);
84 84
85static struct buffer_head *ocfs2_bread(struct inode *inode,
86 int block, int *err, int reada)
87{
88 struct buffer_head *bh = NULL;
89 int tmperr;
90 u64 p_blkno;
91 int readflags = 0;
92
93 if (reada)
94 readflags |= OCFS2_BH_READAHEAD;
95
96 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
97 i_size_read(inode)) {
98 BUG_ON(!reada);
99 return NULL;
100 }
101
102 down_read(&OCFS2_I(inode)->ip_alloc_sem);
103 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
104 NULL);
105 up_read(&OCFS2_I(inode)->ip_alloc_sem);
106 if (tmperr < 0) {
107 mlog_errno(tmperr);
108 goto fail;
109 }
110
111 tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
112 if (tmperr < 0)
113 goto fail;
114
115 tmperr = 0;
116
117 *err = 0;
118 return bh;
119
120fail:
121 brelse(bh);
122 bh = NULL;
123
124 *err = -EIO;
125 return NULL;
126}
127
85/* 128/*
86 * bh passed here can be an inode block or a dir data block, depending 129 * bh passed here can be an inode block or a dir data block, depending
87 * on the inode inline data flag. 130 * on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
188 struct ocfs2_dinode *di; 231 struct ocfs2_dinode *di;
189 struct ocfs2_inline_data *data; 232 struct ocfs2_inline_data *data;
190 233
191 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 234 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
192 &di_bh, OCFS2_BH_CACHED, dir);
193 if (ret) { 235 if (ret) {
194 mlog_errno(ret); 236 mlog_errno(ret);
195 goto out; 237 goto out;
@@ -260,14 +302,13 @@ restart:
260 } 302 }
261 if ((bh = bh_use[ra_ptr++]) == NULL) 303 if ((bh = bh_use[ra_ptr++]) == NULL)
262 goto next; 304 goto next;
263 wait_on_buffer(bh); 305 if (ocfs2_read_block(dir, block, &bh)) {
264 if (!buffer_uptodate(bh)) { 306 /* read error, skip block & hope for the best.
265 /* read error, skip block & hope for the best */ 307 * ocfs2_read_block() has released the bh. */
266 ocfs2_error(dir->i_sb, "reading directory %llu, " 308 ocfs2_error(dir->i_sb, "reading directory %llu, "
267 "offset %lu\n", 309 "offset %lu\n",
268 (unsigned long long)OCFS2_I(dir)->ip_blkno, 310 (unsigned long long)OCFS2_I(dir)->ip_blkno,
269 block); 311 block);
270 brelse(bh);
271 goto next; 312 goto next;
272 } 313 }
273 i = ocfs2_search_dirblock(bh, dir, name, namelen, 314 i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
417 struct ocfs2_dinode *di; 458 struct ocfs2_dinode *di;
418 struct ocfs2_inline_data *data; 459 struct ocfs2_inline_data *data;
419 460
420 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 461 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
421 &di_bh, OCFS2_BH_CACHED, dir);
422 if (ret) { 462 if (ret) {
423 mlog_errno(ret); 463 mlog_errno(ret);
424 goto out; 464 goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
596 struct ocfs2_inline_data *data; 636 struct ocfs2_inline_data *data;
597 struct ocfs2_dir_entry *de; 637 struct ocfs2_dir_entry *de;
598 638
599 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 639 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
600 &di_bh, OCFS2_BH_CACHED, inode);
601 if (ret) { 640 if (ret) {
602 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 641 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
603 (unsigned long long)OCFS2_I(inode)->ip_blkno); 642 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
716 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 755 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
717 i > 0; i--) { 756 i > 0; i--) {
718 tmp = ocfs2_bread(inode, ++blk, &err, 1); 757 tmp = ocfs2_bread(inode, ++blk, &err, 1);
719 if (tmp) 758 brelse(tmp);
720 brelse(tmp);
721 } 759 }
722 last_ra_blk = blk; 760 last_ra_blk = blk;
723 ra_sectors = 8; 761 ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
899leave: 937leave:
900 if (status < 0) { 938 if (status < 0) {
901 *dirent = NULL; 939 *dirent = NULL;
902 if (*dirent_bh) { 940 brelse(*dirent_bh);
903 brelse(*dirent_bh); 941 *dirent_bh = NULL;
904 *dirent_bh = NULL;
905 }
906 } 942 }
907 943
908 mlog_exit(status); 944 mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
951 987
952 ret = 0; 988 ret = 0;
953bail: 989bail:
954 if (dirent_bh) 990 brelse(dirent_bh);
955 brelse(dirent_bh);
956 991
957 mlog_exit(ret); 992 mlog_exit(ret);
958 return ret; 993 return ret;
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1127 1162
1128 status = 0; 1163 status = 0;
1129bail: 1164bail:
1130 if (new_bh) 1165 brelse(new_bh);
1131 brelse(new_bh);
1132 1166
1133 mlog_exit(status); 1167 mlog_exit(status);
1134 return status; 1168 return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1192 struct buffer_head *dirdata_bh = NULL; 1226 struct buffer_head *dirdata_bh = NULL;
1193 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1194 handle_t *handle; 1228 handle_t *handle;
1229 struct ocfs2_extent_tree et;
1230
1231 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1195 1232
1196 alloc = ocfs2_clusters_for_bytes(sb, bytes); 1233 alloc = ocfs2_clusters_for_bytes(sb, bytes);
1197 1234
@@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1300 di->i_size = cpu_to_le64(sb->s_blocksize); 1337 di->i_size = cpu_to_le64(sb->s_blocksize);
1301 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); 1338 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
1302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); 1339 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
1303 dir->i_blocks = ocfs2_inode_sector_count(dir);
1304 1340
1305 /* 1341 /*
1306 * This should never fail as our extent list is empty and all 1342 * This should never fail as our extent list is empty and all
1307 * related blocks have been journaled already. 1343 * related blocks have been journaled already.
1308 */ 1344 */
1309 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, 1345 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
1310 NULL); 1346 0, NULL);
1311 if (ret) { 1347 if (ret) {
1312 mlog_errno(ret); 1348 mlog_errno(ret);
1313 goto out; 1349 goto out_commit;
1314 } 1350 }
1315 1351
1352 /*
1353 * Set i_blocks after the extent insert for the most up to
1354 * date ip_clusters value.
1355 */
1356 dir->i_blocks = ocfs2_inode_sector_count(dir);
1357
1316 ret = ocfs2_journal_dirty(handle, di_bh); 1358 ret = ocfs2_journal_dirty(handle, di_bh);
1317 if (ret) { 1359 if (ret) {
1318 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1332 } 1374 }
1333 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 1375 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
1334 1376
1335 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, 1377 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
1336 len, 0, NULL); 1378 blkno, len, 0, NULL);
1337 if (ret) { 1379 if (ret) {
1338 mlog_errno(ret); 1380 mlog_errno(ret);
1339 goto out; 1381 goto out_commit;
1340 } 1382 }
1341 } 1383 }
1342 1384
@@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1378 if (extend) { 1420 if (extend) {
1379 u32 offset = OCFS2_I(dir)->ip_clusters; 1421 u32 offset = OCFS2_I(dir)->ip_clusters;
1380 1422
1381 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 1423 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1382 1, 0, parent_fe_bh, handle, 1424 1, 0, parent_fe_bh, handle,
1383 data_ac, meta_ac, NULL); 1425 data_ac, meta_ac, NULL);
1384 BUG_ON(status == -EAGAIN); 1426 BUG_ON(status == -EAGAIN);
1385 if (status < 0) { 1427 if (status < 0) {
1386 mlog_errno(status); 1428 mlog_errno(status);
@@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1425 int credits, num_free_extents, drop_alloc_sem = 0; 1467 int credits, num_free_extents, drop_alloc_sem = 0;
1426 loff_t dir_i_size; 1468 loff_t dir_i_size;
1427 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1469 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1470 struct ocfs2_extent_list *el = &fe->id2.i_list;
1428 struct ocfs2_alloc_context *data_ac = NULL; 1471 struct ocfs2_alloc_context *data_ac = NULL;
1429 struct ocfs2_alloc_context *meta_ac = NULL; 1472 struct ocfs2_alloc_context *meta_ac = NULL;
1430 handle_t *handle = NULL; 1473 handle_t *handle = NULL;
1431 struct buffer_head *new_bh = NULL; 1474 struct buffer_head *new_bh = NULL;
1432 struct ocfs2_dir_entry * de; 1475 struct ocfs2_dir_entry * de;
1433 struct super_block *sb = osb->sb; 1476 struct super_block *sb = osb->sb;
1477 struct ocfs2_extent_tree et;
1434 1478
1435 mlog_entry_void(); 1479 mlog_entry_void();
1436 1480
@@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1474 spin_lock(&OCFS2_I(dir)->ip_lock); 1518 spin_lock(&OCFS2_I(dir)->ip_lock);
1475 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 1519 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
1476 spin_unlock(&OCFS2_I(dir)->ip_lock); 1520 spin_unlock(&OCFS2_I(dir)->ip_lock);
1477 num_free_extents = ocfs2_num_free_extents(osb, dir, fe); 1521 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
1522 num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
1478 if (num_free_extents < 0) { 1523 if (num_free_extents < 0) {
1479 status = num_free_extents; 1524 status = num_free_extents;
1480 mlog_errno(status); 1525 mlog_errno(status);
@@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1482 } 1527 }
1483 1528
1484 if (!num_free_extents) { 1529 if (!num_free_extents) {
1485 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); 1530 status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
1486 if (status < 0) { 1531 if (status < 0) {
1487 if (status != -ENOSPC) 1532 if (status != -ENOSPC)
1488 mlog_errno(status); 1533 mlog_errno(status);
@@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1497 goto bail; 1542 goto bail;
1498 } 1543 }
1499 1544
1500 credits = ocfs2_calc_extend_credits(sb, fe, 1); 1545 credits = ocfs2_calc_extend_credits(sb, el, 1);
1501 } else { 1546 } else {
1502 spin_unlock(&OCFS2_I(dir)->ip_lock); 1547 spin_unlock(&OCFS2_I(dir)->ip_lock);
1503 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 1548 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1563,8 +1608,7 @@ bail:
1563 if (meta_ac) 1608 if (meta_ac)
1564 ocfs2_free_alloc_context(meta_ac); 1609 ocfs2_free_alloc_context(meta_ac);
1565 1610
1566 if (new_bh) 1611 brelse(new_bh);
1567 brelse(new_bh);
1568 1612
1569 mlog_exit(status); 1613 mlog_exit(status);
1570 return status; 1614 return status;
@@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1691 1735
1692 status = 0; 1736 status = 0;
1693bail: 1737bail:
1694 if (bh) 1738 brelse(bh);
1695 brelse(bh);
1696 1739
1697 mlog_exit(status); 1740 mlog_exit(status);
1698 return status; 1741 return status;
@@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1751 *ret_de_bh = bh; 1794 *ret_de_bh = bh;
1752 bh = NULL; 1795 bh = NULL;
1753out: 1796out:
1754 if (bh) 1797 brelse(bh);
1755 brelse(bh);
1756 return ret; 1798 return ret;
1757} 1799}
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e4..ec684426034b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2024 } else {
2025 /* Boo, we have to go to disk. */ 2025 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2026 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, 2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh);
2028 bh, OCFS2_BH_CACHED, inode);
2029 if (status < 0) { 2028 if (status < 0) {
2030 mlog_errno(status); 2029 mlog_errno(status);
2031 goto bail_refresh; 2030 goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2086 return 0; 2085 return 0;
2087 } 2086 }
2088 2087
2089 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
2090 OCFS2_I(inode)->ip_blkno,
2091 ret_bh,
2092 OCFS2_BH_CACHED,
2093 inode);
2094 if (status < 0) 2089 if (status < 0)
2095 mlog_errno(status); 2090 mlog_errno(status);
2096 2091
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..2baedac58234 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fiemap.h>
28 29
29#define MLOG_MASK_PREFIX ML_EXTENT_MAP 30#define MLOG_MASK_PREFIX ML_EXTENT_MAP
30#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32#include "ocfs2.h" 33#include "ocfs2.h"
33 34
34#include "alloc.h" 35#include "alloc.h"
36#include "dlmglue.h"
35#include "extent_map.h" 37#include "extent_map.h"
36#include "inode.h" 38#include "inode.h"
37#include "super.h" 39#include "super.h"
@@ -282,6 +284,50 @@ out:
282 kfree(new_emi); 284 kfree(new_emi);
283} 285}
284 286
287static int ocfs2_last_eb_is_empty(struct inode *inode,
288 struct ocfs2_dinode *di)
289{
290 int ret, next_free;
291 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
292 struct buffer_head *eb_bh = NULL;
293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el;
295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
297 if (ret) {
298 mlog_errno(ret);
299 goto out;
300 }
301
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list;
304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in "
314 "leaf block %llu\n", inode->i_ino,
315 (unsigned long long)eb_bh->b_blocknr);
316 ret = -EROFS;
317 goto out;
318 }
319
320 next_free = le16_to_cpu(el->l_next_free_rec);
321
322 if (next_free == 0 ||
323 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
324 ret = 1;
325
326out:
327 brelse(eb_bh);
328 return ret;
329}
330
285/* 331/*
286 * Return the 1st index within el which contains an extent start 332 * Return the 1st index within el which contains an extent start
287 * larger than v_cluster. 333 * larger than v_cluster.
@@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
335 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
336 goto no_more_extents; 382 goto no_more_extents;
337 383
338 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 384 ret = ocfs2_read_block(inode,
339 le64_to_cpu(eb->h_next_leaf_blk), 385 le64_to_cpu(eb->h_next_leaf_blk),
340 &next_eb_bh, OCFS2_BH_CACHED, inode); 386 &next_eb_bh);
341 if (ret) { 387 if (ret) {
342 mlog_errno(ret); 388 mlog_errno(ret);
343 goto out; 389 goto out;
@@ -373,42 +419,28 @@ out:
373 return ret; 419 return ret;
374} 420}
375 421
376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 422static int ocfs2_get_clusters_nocache(struct inode *inode,
377 u32 *p_cluster, u32 *num_clusters, 423 struct buffer_head *di_bh,
378 unsigned int *extent_flags) 424 u32 v_cluster, unsigned int *hole_len,
425 struct ocfs2_extent_rec *ret_rec,
426 unsigned int *is_last)
379{ 427{
380 int ret, i; 428 int i, ret, tree_height, len;
381 unsigned int flags = 0;
382 struct buffer_head *di_bh = NULL;
383 struct buffer_head *eb_bh = NULL;
384 struct ocfs2_dinode *di; 429 struct ocfs2_dinode *di;
385 struct ocfs2_extent_block *eb; 430 struct ocfs2_extent_block *uninitialized_var(eb);
386 struct ocfs2_extent_list *el; 431 struct ocfs2_extent_list *el;
387 struct ocfs2_extent_rec *rec; 432 struct ocfs2_extent_rec *rec;
388 u32 coff; 433 struct buffer_head *eb_bh = NULL;
389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
397 num_clusters, extent_flags);
398 if (ret == 0)
399 goto out;
400 434
401 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 435 memset(ret_rec, 0, sizeof(*ret_rec));
402 &di_bh, OCFS2_BH_CACHED, inode); 436 if (is_last)
403 if (ret) { 437 *is_last = 0;
404 mlog_errno(ret);
405 goto out;
406 }
407 438
408 di = (struct ocfs2_dinode *) di_bh->b_data; 439 di = (struct ocfs2_dinode *) di_bh->b_data;
409 el = &di->id2.i_list; 440 el = &di->id2.i_list;
441 tree_height = le16_to_cpu(el->l_tree_depth);
410 442
411 if (el->l_tree_depth) { 443 if (tree_height > 0) {
412 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 444 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
413 if (ret) { 445 if (ret) {
414 mlog_errno(ret); 446 mlog_errno(ret);
@@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
431 i = ocfs2_search_extent_list(el, v_cluster); 463 i = ocfs2_search_extent_list(el, v_cluster);
432 if (i == -1) { 464 if (i == -1) {
433 /* 465 /*
434 * A hole was found. Return some canned values that 466 * Holes can be larger than the maximum size of an
435 * callers can key on. If asked for, num_clusters will 467 * extent, so we return their lengths in a seperate
436 * be populated with the size of the hole. 468 * field.
437 */ 469 */
438 *p_cluster = 0; 470 if (hole_len) {
439 if (num_clusters) {
440 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 471 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
441 v_cluster, 472 v_cluster, &len);
442 num_clusters);
443 if (ret) { 473 if (ret) {
444 mlog_errno(ret); 474 mlog_errno(ret);
445 goto out; 475 goto out;
446 } 476 }
477
478 *hole_len = len;
479 }
480 goto out_hole;
481 }
482
483 rec = &el->l_recs[i];
484
485 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
486
487 if (!rec->e_blkno) {
488 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
489 "record (%u, %u, 0)", inode->i_ino,
490 le32_to_cpu(rec->e_cpos),
491 ocfs2_rec_clusters(el, rec));
492 ret = -EROFS;
493 goto out;
494 }
495
496 *ret_rec = *rec;
497
498 /*
499 * Checking for last extent is potentially expensive - we
500 * might have to look at the next leaf over to see if it's
501 * empty.
502 *
503 * The first two checks are to see whether the caller even
504 * cares for this information, and if the extent is at least
505 * the last in it's list.
506 *
507 * If those hold true, then the extent is last if any of the
508 * additional conditions hold true:
509 * - Extent list is in-inode
510 * - Extent list is right-most
511 * - Extent list is 2nd to rightmost, with empty right-most
512 */
513 if (is_last) {
514 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
515 if (tree_height == 0)
516 *is_last = 1;
517 else if (eb->h_blkno == di->i_last_eb_blk)
518 *is_last = 1;
519 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
520 ret = ocfs2_last_eb_is_empty(inode, di);
521 if (ret < 0) {
522 mlog_errno(ret);
523 goto out;
524 }
525 if (ret == 1)
526 *is_last = 1;
527 }
528 }
529 }
530
531out_hole:
532 ret = 0;
533out:
534 brelse(eb_bh);
535 return ret;
536}
537
538static void ocfs2_relative_extent_offsets(struct super_block *sb,
539 u32 v_cluster,
540 struct ocfs2_extent_rec *rec,
541 u32 *p_cluster, u32 *num_clusters)
542
543{
544 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
545
546 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
547 *p_cluster = *p_cluster + coff;
548
549 if (num_clusters)
550 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
551}
552
553int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
554 u32 *p_cluster, u32 *num_clusters,
555 struct ocfs2_extent_list *el)
556{
557 int ret = 0, i;
558 struct buffer_head *eb_bh = NULL;
559 struct ocfs2_extent_block *eb;
560 struct ocfs2_extent_rec *rec;
561 u32 coff;
562
563 if (el->l_tree_depth) {
564 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
565 if (ret) {
566 mlog_errno(ret);
567 goto out;
447 } 568 }
569
570 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
571 el = &eb->h_list;
572
573 if (el->l_tree_depth) {
574 ocfs2_error(inode->i_sb,
575 "Inode %lu has non zero tree depth in "
576 "xattr leaf block %llu\n", inode->i_ino,
577 (unsigned long long)eb_bh->b_blocknr);
578 ret = -EROFS;
579 goto out;
580 }
581 }
582
583 i = ocfs2_search_extent_list(el, v_cluster);
584 if (i == -1) {
585 ret = -EROFS;
586 mlog_errno(ret);
587 goto out;
448 } else { 588 } else {
449 rec = &el->l_recs[i]; 589 rec = &el->l_recs[i];
450
451 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 590 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
452 591
453 if (!rec->e_blkno) { 592 if (!rec->e_blkno) {
454 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 593 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
455 "record (%u, %u, 0)", inode->i_ino, 594 "record (%u, %u, 0) in xattr", inode->i_ino,
456 le32_to_cpu(rec->e_cpos), 595 le32_to_cpu(rec->e_cpos),
457 ocfs2_rec_clusters(el, rec)); 596 ocfs2_rec_clusters(el, rec));
458 ret = -EROFS; 597 ret = -EROFS;
459 goto out; 598 goto out;
460 } 599 }
461
462 coff = v_cluster - le32_to_cpu(rec->e_cpos); 600 coff = v_cluster - le32_to_cpu(rec->e_cpos);
463
464 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 601 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
465 le64_to_cpu(rec->e_blkno)); 602 le64_to_cpu(rec->e_blkno));
466 *p_cluster = *p_cluster + coff; 603 *p_cluster = *p_cluster + coff;
467
468 if (num_clusters) 604 if (num_clusters)
469 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 605 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
606 }
607out:
608 if (eb_bh)
609 brelse(eb_bh);
610 return ret;
611}
612
613int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
614 u32 *p_cluster, u32 *num_clusters,
615 unsigned int *extent_flags)
616{
617 int ret;
618 unsigned int uninitialized_var(hole_len), flags = 0;
619 struct buffer_head *di_bh = NULL;
620 struct ocfs2_extent_rec rec;
621
622 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
623 ret = -ERANGE;
624 mlog_errno(ret);
625 goto out;
626 }
627
628 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
629 num_clusters, extent_flags);
630 if (ret == 0)
631 goto out;
632
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
634 if (ret) {
635 mlog_errno(ret);
636 goto out;
637 }
470 638
471 flags = rec->e_flags; 639 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
640 &rec, NULL);
641 if (ret) {
642 mlog_errno(ret);
643 goto out;
644 }
645
646 if (rec.e_blkno == 0ULL) {
647 /*
648 * A hole was found. Return some canned values that
649 * callers can key on. If asked for, num_clusters will
650 * be populated with the size of the hole.
651 */
652 *p_cluster = 0;
653 if (num_clusters) {
654 *num_clusters = hole_len;
655 }
656 } else {
657 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
658 p_cluster, num_clusters);
659 flags = rec.e_flags;
472 660
473 ocfs2_extent_map_insert_rec(inode, rec); 661 ocfs2_extent_map_insert_rec(inode, &rec);
474 } 662 }
475 663
476 if (extent_flags) 664 if (extent_flags)
@@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
478 666
479out: 667out:
480 brelse(di_bh); 668 brelse(di_bh);
481 brelse(eb_bh);
482 return ret; 669 return ret;
483} 670}
484 671
@@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
521out: 708out:
522 return ret; 709 return ret;
523} 710}
711
712static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
713 struct fiemap_extent_info *fieinfo,
714 u64 map_start)
715{
716 int ret;
717 unsigned int id_count;
718 struct ocfs2_dinode *di;
719 u64 phys;
720 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
721 struct ocfs2_inode_info *oi = OCFS2_I(inode);
722
723 di = (struct ocfs2_dinode *)di_bh->b_data;
724 id_count = le16_to_cpu(di->id2.i_data.id_count);
725
726 if (map_start < id_count) {
727 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
728 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729
730 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
731 flags);
732 if (ret < 0)
733 return ret;
734 }
735
736 return 0;
737}
738
739#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
740
741int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
742 u64 map_start, u64 map_len)
743{
744 int ret, is_last;
745 u32 mapping_end, cpos;
746 unsigned int hole_size;
747 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
748 u64 len_bytes, phys_bytes, virt_bytes;
749 struct buffer_head *di_bh = NULL;
750 struct ocfs2_extent_rec rec;
751
752 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
753 if (ret)
754 return ret;
755
756 ret = ocfs2_inode_lock(inode, &di_bh, 0);
757 if (ret) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 down_read(&OCFS2_I(inode)->ip_alloc_sem);
763
764 /*
765 * Handle inline-data separately.
766 */
767 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
768 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
769 goto out_unlock;
770 }
771
772 cpos = map_start >> osb->s_clustersize_bits;
773 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
774 map_start + map_len);
775 mapping_end -= cpos;
776 is_last = 0;
777 while (cpos < mapping_end && !is_last) {
778 u32 fe_flags;
779
780 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
781 &hole_size, &rec, &is_last);
782 if (ret) {
783 mlog_errno(ret);
784 goto out;
785 }
786
787 if (rec.e_blkno == 0ULL) {
788 cpos += hole_size;
789 continue;
790 }
791
792 fe_flags = 0;
793 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
794 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
795 if (is_last)
796 fe_flags |= FIEMAP_EXTENT_LAST;
797 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
798 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
799 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
800
801 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
802 len_bytes, fe_flags);
803 if (ret)
804 break;
805
806 cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
807 }
808
809 if (ret > 0)
810 ret = 0;
811
812out_unlock:
813 brelse(di_bh);
814
815 up_read(&OCFS2_I(inode)->ip_alloc_sem);
816
817 ocfs2_inode_unlock(inode, 0);
818out:
819
820 return ret;
821}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1c4aa8b06f34 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags); 51 u64 *ret_count, unsigned int *extent_flags);
52 52
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len);
55
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el);
59
53#endif /* _EXTENT_MAP_H */ 60#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3daa..8d3225a78073 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
55#include "mmap.h" 55#include "mmap.h"
56#include "suballoc.h" 56#include "suballoc.h"
57#include "super.h" 57#include "super.h"
58#include "xattr.h"
58 59
59#include "buffer_head_io.h" 60#include "buffer_head_io.h"
60 61
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
184 goto bail; 185 goto bail;
185 186
186 journal = osb->journal->j_journal; 187 journal = osb->journal->j_journal;
187 err = journal_force_commit(journal); 188 err = jbd2_journal_force_commit(journal);
188 189
189bail: 190bail:
190 mlog_exit(err); 191 mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
488} 489}
489 490
490/* 491/*
491 * extend allocation only here. 492 * extend file allocation only here.
492 * we'll update all the disk stuff, and oip->alloc_size 493 * we'll update all the disk stuff, and oip->alloc_size
493 * 494 *
494 * expect stuff to be locked, a transaction started and enough data / 495 * expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
497 * Will return -EAGAIN, and a reason if a restart is needed. 498 * Will return -EAGAIN, and a reason if a restart is needed.
498 * If passed in, *reason will always be set, even in error. 499 * If passed in, *reason will always be set, even in error.
499 */ 500 */
500int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 501int ocfs2_add_inode_data(struct ocfs2_super *osb,
501 struct inode *inode, 502 struct inode *inode,
502 u32 *logical_offset, 503 u32 *logical_offset,
503 u32 clusters_to_add, 504 u32 clusters_to_add,
504 int mark_unwritten, 505 int mark_unwritten,
505 struct buffer_head *fe_bh, 506 struct buffer_head *fe_bh,
506 handle_t *handle, 507 handle_t *handle,
507 struct ocfs2_alloc_context *data_ac, 508 struct ocfs2_alloc_context *data_ac,
508 struct ocfs2_alloc_context *meta_ac, 509 struct ocfs2_alloc_context *meta_ac,
509 enum ocfs2_alloc_restarted *reason_ret) 510 enum ocfs2_alloc_restarted *reason_ret)
510{ 511{
511 int status = 0; 512 int ret;
512 int free_extents; 513 struct ocfs2_extent_tree et;
513 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
514 enum ocfs2_alloc_restarted reason = RESTART_NONE;
515 u32 bit_off, num_bits;
516 u64 block;
517 u8 flags = 0;
518
519 BUG_ON(!clusters_to_add);
520
521 if (mark_unwritten)
522 flags = OCFS2_EXT_UNWRITTEN;
523
524 free_extents = ocfs2_num_free_extents(osb, inode, fe);
525 if (free_extents < 0) {
526 status = free_extents;
527 mlog_errno(status);
528 goto leave;
529 }
530
531 /* there are two cases which could cause us to EAGAIN in the
532 * we-need-more-metadata case:
533 * 1) we haven't reserved *any*
534 * 2) we are so fragmented, we've needed to add metadata too
535 * many times. */
536 if (!free_extents && !meta_ac) {
537 mlog(0, "we haven't reserved any metadata!\n");
538 status = -EAGAIN;
539 reason = RESTART_META;
540 goto leave;
541 } else if ((!free_extents)
542 && (ocfs2_alloc_context_bits_left(meta_ac)
543 < ocfs2_extend_meta_needed(fe))) {
544 mlog(0, "filesystem is really fragmented...\n");
545 status = -EAGAIN;
546 reason = RESTART_META;
547 goto leave;
548 }
549
550 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
551 clusters_to_add, &bit_off, &num_bits);
552 if (status < 0) {
553 if (status != -ENOSPC)
554 mlog_errno(status);
555 goto leave;
556 }
557
558 BUG_ON(num_bits > clusters_to_add);
559
560 /* reserve our write early -- insert_extent may update the inode */
561 status = ocfs2_journal_access(handle, inode, fe_bh,
562 OCFS2_JOURNAL_ACCESS_WRITE);
563 if (status < 0) {
564 mlog_errno(status);
565 goto leave;
566 }
567
568 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
569 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
570 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
572 *logical_offset, block, num_bits,
573 flags, meta_ac);
574 if (status < 0) {
575 mlog_errno(status);
576 goto leave;
577 }
578
579 status = ocfs2_journal_dirty(handle, fe_bh);
580 if (status < 0) {
581 mlog_errno(status);
582 goto leave;
583 }
584
585 clusters_to_add -= num_bits;
586 *logical_offset += num_bits;
587
588 if (clusters_to_add) {
589 mlog(0, "need to alloc once more, clusters = %u, wanted = "
590 "%u\n", fe->i_clusters, clusters_to_add);
591 status = -EAGAIN;
592 reason = RESTART_TRANS;
593 }
594
595leave:
596 mlog_exit(status);
597 if (reason_ret)
598 *reason_ret = reason;
599 return status;
600}
601
602/*
603 * For a given allocation, determine which allocators will need to be
604 * accessed, and lock them, reserving the appropriate number of bits.
605 *
606 * Sparse file systems call this from ocfs2_write_begin_nolock()
607 * and ocfs2_allocate_unwritten_extents().
608 *
609 * File systems which don't support holes call this from
610 * ocfs2_extend_allocation().
611 */
612int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
613 u32 clusters_to_add, u32 extents_to_split,
614 struct ocfs2_alloc_context **data_ac,
615 struct ocfs2_alloc_context **meta_ac)
616{
617 int ret = 0, num_free_extents;
618 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
619 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
620
621 *meta_ac = NULL;
622 if (data_ac)
623 *data_ac = NULL;
624
625 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
626
627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
628 "clusters_to_add = %u, extents_to_split = %u\n",
629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
631
632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
633 if (num_free_extents < 0) {
634 ret = num_free_extents;
635 mlog_errno(ret);
636 goto out;
637 }
638
639 /*
640 * Sparse allocation file systems need to be more conservative
641 * with reserving room for expansion - the actual allocation
642 * happens while we've got a journal handle open so re-taking
643 * a cluster lock (because we ran out of room for another
644 * extent) will violate ordering rules.
645 *
646 * Most of the time we'll only be seeing this 1 cluster at a time
647 * anyway.
648 *
649 * Always lock for any unwritten extents - we might want to
650 * add blocks during a split.
651 */
652 if (!num_free_extents ||
653 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
654 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
655 if (ret < 0) {
656 if (ret != -ENOSPC)
657 mlog_errno(ret);
658 goto out;
659 }
660 }
661
662 if (clusters_to_add == 0)
663 goto out;
664
665 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
666 if (ret < 0) {
667 if (ret != -ENOSPC)
668 mlog_errno(ret);
669 goto out;
670 }
671
672out:
673 if (ret) {
674 if (*meta_ac) {
675 ocfs2_free_alloc_context(*meta_ac);
676 *meta_ac = NULL;
677 }
678 514
679 /* 515 ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
680 * We cannot have an error and a non null *data_ac. 516 ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
681 */ 517 clusters_to_add, mark_unwritten,
682 } 518 &et, handle,
519 data_ac, meta_ac, reason_ret);
683 520
684 return ret; 521 return ret;
685} 522}
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
698 struct ocfs2_alloc_context *meta_ac = NULL; 535 struct ocfs2_alloc_context *meta_ac = NULL;
699 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
700 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et;
701 539
702 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
703 541
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
707 */ 545 */
708 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
709 547
710 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
711 OCFS2_BH_CACHED, inode);
712 if (status < 0) { 549 if (status < 0) {
713 mlog_errno(status); 550 mlog_errno(status);
714 goto leave; 551 goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
724restart_all: 561restart_all:
725 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
726 563
727 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, 564 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
728 &meta_ac); 565 "clusters_to_add = %u\n",
566 (unsigned long long)OCFS2_I(inode)->ip_blkno,
567 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
568 clusters_to_add);
569 ocfs2_init_dinode_extent_tree(&et, inode, bh);
570 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
571 &data_ac, &meta_ac);
729 if (status) { 572 if (status) {
730 mlog_errno(status); 573 mlog_errno(status);
731 goto leave; 574 goto leave;
732 } 575 }
733 576
734 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 577 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
578 clusters_to_add);
735 handle = ocfs2_start_trans(osb, credits); 579 handle = ocfs2_start_trans(osb, credits);
736 if (IS_ERR(handle)) { 580 if (IS_ERR(handle)) {
737 status = PTR_ERR(handle); 581 status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
753 597
754 prev_clusters = OCFS2_I(inode)->ip_clusters; 598 prev_clusters = OCFS2_I(inode)->ip_clusters;
755 599
756 status = ocfs2_do_extend_allocation(osb, 600 status = ocfs2_add_inode_data(osb,
757 inode, 601 inode,
758 &logical_start, 602 &logical_start,
759 clusters_to_add, 603 clusters_to_add,
760 mark_unwritten, 604 mark_unwritten,
761 bh, 605 bh,
762 handle, 606 handle,
763 data_ac, 607 data_ac,
764 meta_ac, 608 meta_ac,
765 &why); 609 &why);
766 if ((status < 0) && (status != -EAGAIN)) { 610 if ((status < 0) && (status != -EAGAIN)) {
767 if (status != -ENOSPC) 611 if (status != -ENOSPC)
768 mlog_errno(status); 612 mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
789 mlog(0, "restarting transaction.\n"); 633 mlog(0, "restarting transaction.\n");
790 /* TODO: This can be more intelligent. */ 634 /* TODO: This can be more intelligent. */
791 credits = ocfs2_calc_extend_credits(osb->sb, 635 credits = ocfs2_calc_extend_credits(osb->sb,
792 fe, 636 &fe->id2.i_list,
793 clusters_to_add); 637 clusters_to_add);
794 status = ocfs2_extend_trans(handle, credits); 638 status = ocfs2_extend_trans(handle, credits);
795 if (status < 0) { 639 if (status < 0) {
@@ -826,10 +670,8 @@ leave:
826 restart_func = 0; 670 restart_func = 0;
827 goto restart_all; 671 goto restart_all;
828 } 672 }
829 if (bh) { 673 brelse(bh);
830 brelse(bh); 674 bh = NULL;
831 bh = NULL;
832 }
833 675
834 mlog_exit(status); 676 mlog_exit(status);
835 return status; 677 return status;
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1096 goto bail_unlock; 938 goto bail_unlock;
1097 } 939 }
1098 940
1099 if (i_size_read(inode) > attr->ia_size) 941 if (i_size_read(inode) > attr->ia_size) {
942 if (ocfs2_should_order_data(inode)) {
943 status = ocfs2_begin_ordered_truncate(inode,
944 attr->ia_size);
945 if (status)
946 goto bail_unlock;
947 }
1100 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 948 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1101 else 949 } else
1102 status = ocfs2_extend_file(inode, bh, attr->ia_size); 950 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1103 if (status < 0) { 951 if (status < 0) {
1104 if (status != -ENOSPC) 952 if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
1140 if (size_change) 988 if (size_change)
1141 ocfs2_rw_unlock(inode, 1); 989 ocfs2_rw_unlock(inode, 1);
1142bail: 990bail:
1143 if (bh) 991 brelse(bh);
1144 brelse(bh);
1145 992
1146 mlog_exit(status); 993 mlog_exit(status);
1147 return status; 994 return status;
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1284 struct buffer_head *bh = NULL; 1131 struct buffer_head *bh = NULL;
1285 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1132 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1286 1133
1287 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1134 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
1288 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1289 if (ret < 0) { 1135 if (ret < 0) {
1290 mlog_errno(ret); 1136 mlog_errno(ret);
1291 goto out; 1137 goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1311 struct buffer_head *di_bh = NULL; 1157 struct buffer_head *di_bh = NULL;
1312 1158
1313 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1159 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1314 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1160 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
1315 OCFS2_I(inode)->ip_blkno, &di_bh, 1161 &di_bh);
1316 OCFS2_BH_CACHED, inode);
1317 if (ret) { 1162 if (ret) {
1318 mlog_errno(ret); 1163 mlog_errno(ret);
1319 goto out; 1164 goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1394 handle_t *handle; 1239 handle_t *handle;
1395 struct ocfs2_alloc_context *meta_ac = NULL; 1240 struct ocfs2_alloc_context *meta_ac = NULL;
1396 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1241 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1242 struct ocfs2_extent_tree et;
1397 1243
1398 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); 1244 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1245
1246 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1399 if (ret) { 1247 if (ret) {
1400 mlog_errno(ret); 1248 mlog_errno(ret);
1401 return ret; 1249 return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1425 goto out; 1273 goto out;
1426 } 1274 }
1427 1275
1428 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, 1276 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1429 dealloc); 1277 dealloc);
1430 if (ret) { 1278 if (ret) {
1431 mlog_errno(ret); 1279 mlog_errno(ret);
@@ -2040,7 +1888,7 @@ out_dio:
2040 */ 1888 */
2041 if (old_size != i_size_read(inode) || 1889 if (old_size != i_size_read(inode) ||
2042 old_clusters != OCFS2_I(inode)->ip_clusters) { 1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
2043 ret = journal_force_commit(osb->journal->j_journal); 1891 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2044 if (ret < 0) 1892 if (ret < 0)
2045 written = ret; 1893 written = ret;
2046 } 1894 }
@@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = {
2227 .setattr = ocfs2_setattr, 2075 .setattr = ocfs2_setattr,
2228 .getattr = ocfs2_getattr, 2076 .getattr = ocfs2_getattr,
2229 .permission = ocfs2_permission, 2077 .permission = ocfs2_permission,
2078 .setxattr = generic_setxattr,
2079 .getxattr = generic_getxattr,
2080 .listxattr = ocfs2_listxattr,
2081 .removexattr = generic_removexattr,
2230 .fallocate = ocfs2_fallocate, 2082 .fallocate = ocfs2_fallocate,
2083 .fiemap = ocfs2_fiemap,
2231}; 2084};
2232 2085
2233const struct inode_operations ocfs2_special_file_iops = { 2086const struct inode_operations ocfs2_special_file_iops = {
@@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
2236 .permission = ocfs2_permission, 2089 .permission = ocfs2_permission,
2237}; 2090};
2238 2091
2092/*
2093 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2094 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2095 */
2239const struct file_operations ocfs2_fops = { 2096const struct file_operations ocfs2_fops = {
2240 .llseek = generic_file_llseek, 2097 .llseek = generic_file_llseek,
2241 .read = do_sync_read, 2098 .read = do_sync_read,
@@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
2250#ifdef CONFIG_COMPAT 2107#ifdef CONFIG_COMPAT
2251 .compat_ioctl = ocfs2_compat_ioctl, 2108 .compat_ioctl = ocfs2_compat_ioctl,
2252#endif 2109#endif
2110 .lock = ocfs2_lock,
2253 .flock = ocfs2_flock, 2111 .flock = ocfs2_flock,
2254 .splice_read = ocfs2_file_splice_read, 2112 .splice_read = ocfs2_file_splice_read,
2255 .splice_write = ocfs2_file_splice_write, 2113 .splice_write = ocfs2_file_splice_write,
@@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
2266#ifdef CONFIG_COMPAT 2124#ifdef CONFIG_COMPAT
2267 .compat_ioctl = ocfs2_compat_ioctl, 2125 .compat_ioctl = ocfs2_compat_ioctl,
2268#endif 2126#endif
2127 .lock = ocfs2_lock,
2128 .flock = ocfs2_flock,
2129};
2130
2131/*
2132 * POSIX-lockless variants of our file_operations.
2133 *
2134 * These will be used if the underlying cluster stack does not support
2135 * posix file locking, if the user passes the "localflocks" mount
2136 * option, or if we have a local-only fs.
2137 *
2138 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2139 * so we still want it in the case of no stack support for
2140 * plocks. Internally, it will do the right thing when asked to ignore
2141 * the cluster.
2142 */
2143const struct file_operations ocfs2_fops_no_plocks = {
2144 .llseek = generic_file_llseek,
2145 .read = do_sync_read,
2146 .write = do_sync_write,
2147 .mmap = ocfs2_mmap,
2148 .fsync = ocfs2_sync_file,
2149 .release = ocfs2_file_release,
2150 .open = ocfs2_file_open,
2151 .aio_read = ocfs2_file_aio_read,
2152 .aio_write = ocfs2_file_aio_write,
2153 .unlocked_ioctl = ocfs2_ioctl,
2154#ifdef CONFIG_COMPAT
2155 .compat_ioctl = ocfs2_compat_ioctl,
2156#endif
2157 .flock = ocfs2_flock,
2158 .splice_read = ocfs2_file_splice_read,
2159 .splice_write = ocfs2_file_splice_write,
2160};
2161
2162const struct file_operations ocfs2_dops_no_plocks = {
2163 .llseek = generic_file_llseek,
2164 .read = generic_read_dir,
2165 .readdir = ocfs2_readdir,
2166 .fsync = ocfs2_sync_file,
2167 .release = ocfs2_dir_release,
2168 .open = ocfs2_dir_open,
2169 .unlocked_ioctl = ocfs2_ioctl,
2170#ifdef CONFIG_COMPAT
2171 .compat_ioctl = ocfs2_compat_ioctl,
2172#endif
2269 .flock = ocfs2_flock, 2173 .flock = ocfs2_flock,
2270}; 2174};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017ea..e92382cbca5f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
28 28
29extern const struct file_operations ocfs2_fops; 29extern const struct file_operations ocfs2_fops;
30extern const struct file_operations ocfs2_dops; 30extern const struct file_operations ocfs2_dops;
31extern const struct file_operations ocfs2_fops_no_plocks;
32extern const struct file_operations ocfs2_dops_no_plocks;
31extern const struct inode_operations ocfs2_file_iops; 33extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 34extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 35struct ocfs2_alloc_context;
36enum ocfs2_alloc_restarted;
34 37
35struct ocfs2_file_private { 38struct ocfs2_file_private {
36 struct file *fp_file; 39 struct file *fp_file;
@@ -38,27 +41,18 @@ struct ocfs2_file_private {
38 struct ocfs2_lock_res fp_flock; 41 struct ocfs2_lock_res fp_flock;
39}; 42};
40 43
41enum ocfs2_alloc_restarted { 44int ocfs2_add_inode_data(struct ocfs2_super *osb,
42 RESTART_NONE = 0, 45 struct inode *inode,
43 RESTART_TRANS, 46 u32 *logical_offset,
44 RESTART_META 47 u32 clusters_to_add,
45}; 48 int mark_unwritten,
46int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 49 struct buffer_head *fe_bh,
47 struct inode *inode, 50 handle_t *handle,
48 u32 *logical_offset, 51 struct ocfs2_alloc_context *data_ac,
49 u32 clusters_to_add, 52 struct ocfs2_alloc_context *meta_ac,
50 int mark_unwritten, 53 enum ocfs2_alloc_restarted *reason_ret);
51 struct buffer_head *fe_bh,
52 handle_t *handle,
53 struct ocfs2_alloc_context *data_ac,
54 struct ocfs2_alloc_context *meta_ac,
55 enum ocfs2_alloc_restarted *reason_ret);
56int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
57 u64 zero_to); 55 u64 zero_to);
58int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
59 u32 clusters_to_add, u32 extents_to_split,
60 struct ocfs2_alloc_context **data_ac,
61 struct ocfs2_alloc_context **meta_ac);
62int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
63int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 57int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
64 struct kstat *stat); 58 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..4903688f72a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "xattr.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
219 struct super_block *sb; 220 struct super_block *sb;
220 struct ocfs2_super *osb; 221 struct ocfs2_super *osb;
221 int status = -EINVAL; 222 int status = -EINVAL;
223 int use_plocks = 1;
222 224
223 mlog_entry("(0x%p, size:%llu)\n", inode, 225 mlog_entry("(0x%p, size:%llu)\n", inode,
224 (unsigned long long)le64_to_cpu(fe->i_size)); 226 (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
226 sb = inode->i_sb; 228 sb = inode->i_sb;
227 osb = OCFS2_SB(sb); 229 osb = OCFS2_SB(sb);
228 230
231 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0;
234
229 /* this means that read_inode cannot create a superblock inode 235 /* this means that read_inode cannot create a superblock inode
230 * today. change if needed. */ 236 * today. change if needed. */
231 if (!OCFS2_IS_VALID_DINODE(fe) || 237 if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
295 301
296 switch (inode->i_mode & S_IFMT) { 302 switch (inode->i_mode & S_IFMT) {
297 case S_IFREG: 303 case S_IFREG:
298 inode->i_fop = &ocfs2_fops; 304 if (use_plocks)
305 inode->i_fop = &ocfs2_fops;
306 else
307 inode->i_fop = &ocfs2_fops_no_plocks;
299 inode->i_op = &ocfs2_file_iops; 308 inode->i_op = &ocfs2_file_iops;
300 i_size_write(inode, le64_to_cpu(fe->i_size)); 309 i_size_write(inode, le64_to_cpu(fe->i_size));
301 break; 310 break;
302 case S_IFDIR: 311 case S_IFDIR:
303 inode->i_op = &ocfs2_dir_iops; 312 inode->i_op = &ocfs2_dir_iops;
304 inode->i_fop = &ocfs2_dops; 313 if (use_plocks)
314 inode->i_fop = &ocfs2_dops;
315 else
316 inode->i_fop = &ocfs2_dops_no_plocks;
305 i_size_write(inode, le64_to_cpu(fe->i_size)); 317 i_size_write(inode, le64_to_cpu(fe->i_size));
306 break; 318 break;
307 case S_IFLNK: 319 case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
448 } 460 }
449 } 461 }
450 462
451 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 463 if (can_lock)
452 can_lock ? inode : NULL); 464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
465 OCFS2_BH_IGNORE_CACHE);
466 else
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
453 if (status < 0) { 468 if (status < 0) {
454 mlog_errno(status); 469 mlog_errno(status);
455 goto bail; 470 goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
522 * data and fast symlinks. 537 * data and fast symlinks.
523 */ 538 */
524 if (fe->i_clusters) { 539 if (fe->i_clusters) {
540 if (ocfs2_should_order_data(inode))
541 ocfs2_begin_ordered_truncate(inode, 0);
542
525 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
526 if (IS_ERR(handle)) { 544 if (IS_ERR(handle)) {
527 status = PTR_ERR(handle); 545 status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
730 goto bail_unlock_dir; 748 goto bail_unlock_dir;
731 } 749 }
732 750
751 /*Free extended attribute resources associated with this inode.*/
752 status = ocfs2_xattr_remove(inode, di_bh);
753 if (status < 0) {
754 mlog_errno(status);
755 goto bail_unlock_dir;
756 }
757
733 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 758 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
734 orphan_dir_bh); 759 orphan_dir_bh);
735 if (status < 0) 760 if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
1081 oi->ip_last_trans = 0; 1106 oi->ip_last_trans = 0;
1082 oi->ip_dir_start_lookup = 0; 1107 oi->ip_dir_start_lookup = 0;
1083 oi->ip_blkno = 0ULL; 1108 oi->ip_blkno = 0ULL;
1109 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1110 &oi->ip_jinode);
1084 1111
1085bail: 1112bail:
1086 mlog_exit_void(); 1113 mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
1107} 1134}
1108 1135
1109/* 1136/*
1110 * TODO: this should probably be merged into ocfs2_get_block
1111 *
1112 * However, you now need to pay attention to the cont_prepare_write()
1113 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
1114 * expects never to extend).
1115 */
1116struct buffer_head *ocfs2_bread(struct inode *inode,
1117 int block, int *err, int reada)
1118{
1119 struct buffer_head *bh = NULL;
1120 int tmperr;
1121 u64 p_blkno;
1122 int readflags = OCFS2_BH_CACHED;
1123
1124 if (reada)
1125 readflags |= OCFS2_BH_READAHEAD;
1126
1127 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
1128 i_size_read(inode)) {
1129 BUG_ON(!reada);
1130 return NULL;
1131 }
1132
1133 down_read(&OCFS2_I(inode)->ip_alloc_sem);
1134 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1135 NULL);
1136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
1137 if (tmperr < 0) {
1138 mlog_errno(tmperr);
1139 goto fail;
1140 }
1141
1142 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1143 readflags, inode);
1144 if (tmperr < 0)
1145 goto fail;
1146
1147 tmperr = 0;
1148
1149 *err = 0;
1150 return bh;
1151
1152fail:
1153 if (bh) {
1154 brelse(bh);
1155 bh = NULL;
1156 }
1157 *err = -EIO;
1158 return NULL;
1159}
1160
1161/*
1162 * This is called from our getattr. 1137 * This is called from our getattr.
1163 */ 1138 */
1164int ocfs2_inode_revalidate(struct dentry *dentry) 1139int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa0..2f37af9bcc4a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
40 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
41 struct rw_semaphore ip_alloc_sem; 41 struct rw_semaphore ip_alloc_sem;
42 42
43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem;
45
43 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
44 spinlock_t ip_lock; 47 spinlock_t ip_lock;
45 u32 ip_open_count; 48 u32 ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
68 struct ocfs2_extent_map ip_extent_map; 71 struct ocfs2_extent_map ip_extent_map;
69 72
70 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode;
71}; 75};
72 76
73/* 77/*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
113 117
114extern const struct address_space_operations ocfs2_aops; 118extern const struct address_space_operations ocfs2_aops;
115 119
116struct buffer_head *ocfs2_bread(struct inode *inode, int block,
117 int *err, int reada);
118void ocfs2_clear_inode(struct inode *inode); 120void ocfs2_clear_inode(struct inode *inode);
119void ocfs2_delete_inode(struct inode *inode); 121void ocfs2_delete_inode(struct inode *inode);
120void ocfs2_drop_inode(struct inode *inode); 122void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce995..9fcd36dcc9a0 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
102bail: 102bail:
103 mutex_unlock(&inode->i_mutex); 103 mutex_unlock(&inode->i_mutex);
104 104
105 if (bh) 105 brelse(bh);
106 brelse(bh);
107 106
108 mlog_exit(status); 107 mlog_exit(status);
109 return status; 108 return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7a37240f7a31..81e40677eecb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
215 goto finally; 215 goto finally;
216 } 216 }
217 217
218 journal_lock_updates(journal->j_journal); 218 jbd2_journal_lock_updates(journal->j_journal);
219 status = journal_flush(journal->j_journal); 219 status = jbd2_journal_flush(journal->j_journal);
220 journal_unlock_updates(journal->j_journal); 220 jbd2_journal_unlock_updates(journal->j_journal);
221 if (status < 0) { 221 if (status < 0) {
222 up_write(&journal->j_trans_barrier); 222 up_write(&journal->j_trans_barrier);
223 mlog_errno(status); 223 mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
264 264
265 down_read(&osb->journal->j_trans_barrier); 265 down_read(&osb->journal->j_trans_barrier);
266 266
267 handle = journal_start(journal, max_buffs); 267 handle = jbd2_journal_start(journal, max_buffs);
268 if (IS_ERR(handle)) { 268 if (IS_ERR(handle)) {
269 up_read(&osb->journal->j_trans_barrier); 269 up_read(&osb->journal->j_trans_barrier);
270 270
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
290 290
291 BUG_ON(!handle); 291 BUG_ON(!handle);
292 292
293 ret = journal_stop(handle); 293 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 294 if (ret < 0)
295 mlog_errno(ret); 295 mlog_errno(ret);
296 296
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
304 * transaction. extend_trans will either extend the current handle by 304 * transaction. extend_trans will either extend the current handle by
305 * nblocks, or commit it and start a new one with nblocks credits. 305 * nblocks, or commit it and start a new one with nblocks credits.
306 * 306 *
307 * This might call journal_restart() which will commit dirty buffers 307 * This might call jbd2_journal_restart() which will commit dirty buffers
308 * and then restart the transaction. Before calling 308 * and then restart the transaction. Before calling
309 * ocfs2_extend_trans(), any changed blocks should have been 309 * ocfs2_extend_trans(), any changed blocks should have been
310 * dirtied. After calling it, all blocks which need to be changed must 310 * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
332#ifdef CONFIG_OCFS2_DEBUG_FS 332#ifdef CONFIG_OCFS2_DEBUG_FS
333 status = 1; 333 status = 1;
334#else 334#else
335 status = journal_extend(handle, nblocks); 335 status = jbd2_journal_extend(handle, nblocks);
336 if (status < 0) { 336 if (status < 0) {
337 mlog_errno(status); 337 mlog_errno(status);
338 goto bail; 338 goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
340#endif 340#endif
341 341
342 if (status > 0) { 342 if (status > 0) {
343 mlog(0, "journal_extend failed, trying journal_restart\n"); 343 mlog(0,
344 status = journal_restart(handle, nblocks); 344 "jbd2_journal_extend failed, trying "
345 "jbd2_journal_restart\n");
346 status = jbd2_journal_restart(handle, nblocks);
345 if (status < 0) { 347 if (status < 0) {
346 mlog_errno(status); 348 mlog_errno(status);
347 goto bail; 349 goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
393 switch (type) { 395 switch (type) {
394 case OCFS2_JOURNAL_ACCESS_CREATE: 396 case OCFS2_JOURNAL_ACCESS_CREATE:
395 case OCFS2_JOURNAL_ACCESS_WRITE: 397 case OCFS2_JOURNAL_ACCESS_WRITE:
396 status = journal_get_write_access(handle, bh); 398 status = jbd2_journal_get_write_access(handle, bh);
397 break; 399 break;
398 400
399 case OCFS2_JOURNAL_ACCESS_UNDO: 401 case OCFS2_JOURNAL_ACCESS_UNDO:
400 status = journal_get_undo_access(handle, bh); 402 status = jbd2_journal_get_undo_access(handle, bh);
401 break; 403 break;
402 404
403 default: 405 default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
422 mlog_entry("(bh->b_blocknr=%llu)\n", 424 mlog_entry("(bh->b_blocknr=%llu)\n",
423 (unsigned long long)bh->b_blocknr); 425 (unsigned long long)bh->b_blocknr);
424 426
425 status = journal_dirty_metadata(handle, bh); 427 status = jbd2_journal_dirty_metadata(handle, bh);
426 if (status < 0) 428 if (status < 0)
427 mlog(ML_ERROR, "Could not dirty metadata buffer. " 429 mlog(ML_ERROR, "Could not dirty metadata buffer. "
428 "(bh->b_blocknr=%llu)\n", 430 "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
432 return status; 434 return status;
433} 435}
434 436
437#ifdef CONFIG_OCFS2_COMPAT_JBD
435int ocfs2_journal_dirty_data(handle_t *handle, 438int ocfs2_journal_dirty_data(handle_t *handle,
436 struct buffer_head *bh) 439 struct buffer_head *bh)
437{ 440{
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
443 446
444 return err; 447 return err;
445} 448}
449#endif
446 450
447#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) 451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
448 452
449void ocfs2_set_journal_params(struct ocfs2_super *osb) 453void ocfs2_set_journal_params(struct ocfs2_super *osb)
450{ 454{
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
457 spin_lock(&journal->j_state_lock); 461 spin_lock(&journal->j_state_lock);
458 journal->j_commit_interval = commit_interval; 462 journal->j_commit_interval = commit_interval;
459 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 463 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
460 journal->j_flags |= JFS_BARRIER; 464 journal->j_flags |= JBD2_BARRIER;
461 else 465 else
462 journal->j_flags &= ~JFS_BARRIER; 466 journal->j_flags &= ~JBD2_BARRIER;
463 spin_unlock(&journal->j_state_lock); 467 spin_unlock(&journal->j_state_lock);
464} 468}
465 469
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
524 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); 528 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
525 529
526 /* call the kernels journal init function now */ 530 /* call the kernels journal init function now */
527 j_journal = journal_init_inode(inode); 531 j_journal = jbd2_journal_init_inode(inode);
528 if (j_journal == NULL) { 532 if (j_journal == NULL) {
529 mlog(ML_ERROR, "Linux journal layer error\n"); 533 mlog(ML_ERROR, "Linux journal layer error\n");
530 status = -EINVAL; 534 status = -EINVAL;
531 goto done; 535 goto done;
532 } 536 }
533 537
534 mlog(0, "Returned from journal_init_inode\n"); 538 mlog(0, "Returned from jbd2_journal_init_inode\n");
535 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); 539 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
536 540
537 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 541 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
550 if (status < 0) { 554 if (status < 0) {
551 if (inode_lock) 555 if (inode_lock)
552 ocfs2_inode_unlock(inode, 1); 556 ocfs2_inode_unlock(inode, 1);
553 if (bh != NULL) 557 brelse(bh);
554 brelse(bh);
555 if (inode) { 558 if (inode) {
556 OCFS2_I(inode)->ip_open_count--; 559 OCFS2_I(inode)->ip_open_count--;
557 iput(inode); 560 iput(inode);
@@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
639 if (journal->j_state != OCFS2_JOURNAL_LOADED) 642 if (journal->j_state != OCFS2_JOURNAL_LOADED)
640 goto done; 643 goto done;
641 644
642 /* need to inc inode use count as journal_destroy will iput. */ 645 /* need to inc inode use count - jbd2_journal_destroy will iput. */
643 if (!igrab(inode)) 646 if (!igrab(inode))
644 BUG(); 647 BUG();
645 648
@@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
668 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); 671 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
669 672
670 if (ocfs2_mount_local(osb)) { 673 if (ocfs2_mount_local(osb)) {
671 journal_lock_updates(journal->j_journal); 674 jbd2_journal_lock_updates(journal->j_journal);
672 status = journal_flush(journal->j_journal); 675 status = jbd2_journal_flush(journal->j_journal);
673 journal_unlock_updates(journal->j_journal); 676 jbd2_journal_unlock_updates(journal->j_journal);
674 if (status < 0) 677 if (status < 0)
675 mlog_errno(status); 678 mlog_errno(status);
676 } 679 }
@@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
686 } 689 }
687 690
688 /* Shutdown the kernel journal system */ 691 /* Shutdown the kernel journal system */
689 journal_destroy(journal->j_journal); 692 jbd2_journal_destroy(journal->j_journal);
690 693
691 OCFS2_I(inode)->ip_open_count--; 694 OCFS2_I(inode)->ip_open_count--;
692 695
@@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
711{ 714{
712 int olderr; 715 int olderr;
713 716
714 olderr = journal_errno(journal); 717 olderr = jbd2_journal_errno(journal);
715 if (olderr) { 718 if (olderr) {
716 mlog(ML_ERROR, "File system error %d recorded in " 719 mlog(ML_ERROR, "File system error %d recorded in "
717 "journal %u.\n", olderr, slot); 720 "journal %u.\n", olderr, slot);
718 mlog(ML_ERROR, "File system on device %s needs checking.\n", 721 mlog(ML_ERROR, "File system on device %s needs checking.\n",
719 sb->s_id); 722 sb->s_id);
720 723
721 journal_ack_err(journal); 724 jbd2_journal_ack_err(journal);
722 journal_clear_err(journal); 725 jbd2_journal_clear_err(journal);
723 } 726 }
724} 727}
725 728
@@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
734 737
735 osb = journal->j_osb; 738 osb = journal->j_osb;
736 739
737 status = journal_load(journal->j_journal); 740 status = jbd2_journal_load(journal->j_journal);
738 if (status < 0) { 741 if (status < 0) {
739 mlog(ML_ERROR, "Failed to load journal!\n"); 742 mlog(ML_ERROR, "Failed to load journal!\n");
740 goto done; 743 goto done;
@@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
778 781
779 BUG_ON(!journal); 782 BUG_ON(!journal);
780 783
781 status = journal_wipe(journal->j_journal, full); 784 status = jbd2_journal_wipe(journal->j_journal, full);
782 if (status < 0) { 785 if (status < 0) {
783 mlog_errno(status); 786 mlog_errno(status);
784 goto bail; 787 goto bail;
@@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
847 850
848 /* We are reading journal data which should not 851 /* We are reading journal data which should not
849 * be put in the uptodate cache */ 852 * be put in the uptodate cache */
850 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), 853 status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
851 p_blkno, p_blocks, bhs, 0, 854 p_blkno, p_blocks, bhs);
852 NULL);
853 if (status < 0) { 855 if (status < 0) {
854 mlog_errno(status); 856 mlog_errno(status);
855 goto bail; 857 goto bail;
@@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
865 867
866bail: 868bail:
867 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 869 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
868 if (bhs[i]) 870 brelse(bhs[i]);
869 brelse(bhs[i]);
870 mlog_exit(status); 871 mlog_exit(status);
871 return status; 872 return status;
872} 873}
@@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1133 } 1134 }
1134 SET_INODE_JOURNAL(inode); 1135 SET_INODE_JOURNAL(inode);
1135 1136
1136 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); 1137 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
1138 OCFS2_BH_IGNORE_CACHE);
1137 if (status < 0) { 1139 if (status < 0) {
1138 mlog_errno(status); 1140 mlog_errno(status);
1139 goto bail; 1141 goto bail;
@@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1229 } 1231 }
1230 1232
1231 mlog(0, "calling journal_init_inode\n"); 1233 mlog(0, "calling journal_init_inode\n");
1232 journal = journal_init_inode(inode); 1234 journal = jbd2_journal_init_inode(inode);
1233 if (journal == NULL) { 1235 if (journal == NULL) {
1234 mlog(ML_ERROR, "Linux journal layer error\n"); 1236 mlog(ML_ERROR, "Linux journal layer error\n");
1235 status = -EIO; 1237 status = -EIO;
1236 goto done; 1238 goto done;
1237 } 1239 }
1238 1240
1239 status = journal_load(journal); 1241 status = jbd2_journal_load(journal);
1240 if (status < 0) { 1242 if (status < 0) {
1241 mlog_errno(status); 1243 mlog_errno(status);
1242 if (!igrab(inode)) 1244 if (!igrab(inode))
1243 BUG(); 1245 BUG();
1244 journal_destroy(journal); 1246 jbd2_journal_destroy(journal);
1245 goto done; 1247 goto done;
1246 } 1248 }
1247 1249
@@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1249 1251
1250 /* wipe the journal */ 1252 /* wipe the journal */
1251 mlog(0, "flushing the journal.\n"); 1253 mlog(0, "flushing the journal.\n");
1252 journal_lock_updates(journal); 1254 jbd2_journal_lock_updates(journal);
1253 status = journal_flush(journal); 1255 status = jbd2_journal_flush(journal);
1254 journal_unlock_updates(journal); 1256 jbd2_journal_unlock_updates(journal);
1255 if (status < 0) 1257 if (status < 0)
1256 mlog_errno(status); 1258 mlog_errno(status);
1257 1259
@@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1272 if (!igrab(inode)) 1274 if (!igrab(inode))
1273 BUG(); 1275 BUG();
1274 1276
1275 journal_destroy(journal); 1277 jbd2_journal_destroy(journal);
1276 1278
1277done: 1279done:
1278 /* drop the lock on this nodes journal */ 1280 /* drop the lock on this nodes journal */
@@ -1282,8 +1284,7 @@ done:
1282 if (inode) 1284 if (inode)
1283 iput(inode); 1285 iput(inode);
1284 1286
1285 if (bh) 1287 brelse(bh);
1286 brelse(bh);
1287 1288
1288 mlog_exit(status); 1289 mlog_exit(status);
1289 return status; 1290 return status;
@@ -1418,13 +1419,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1418{ 1419{
1419 unsigned int node_num; 1420 unsigned int node_num;
1420 int status, i; 1421 int status, i;
1422 u32 gen;
1421 struct buffer_head *bh = NULL; 1423 struct buffer_head *bh = NULL;
1422 struct ocfs2_dinode *di; 1424 struct ocfs2_dinode *di;
1423 1425
1424 /* This is called with the super block cluster lock, so we 1426 /* This is called with the super block cluster lock, so we
1425 * know that the slot map can't change underneath us. */ 1427 * know that the slot map can't change underneath us. */
1426 1428
1427 spin_lock(&osb->osb_lock);
1428 for (i = 0; i < osb->max_slots; i++) { 1429 for (i = 0; i < osb->max_slots; i++) {
1429 /* Read journal inode to get the recovery generation */ 1430 /* Read journal inode to get the recovery generation */
1430 status = ocfs2_read_journal_inode(osb, i, &bh, NULL); 1431 status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
@@ -1433,23 +1434,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1433 goto bail; 1434 goto bail;
1434 } 1435 }
1435 di = (struct ocfs2_dinode *)bh->b_data; 1436 di = (struct ocfs2_dinode *)bh->b_data;
1436 osb->slot_recovery_generations[i] = 1437 gen = ocfs2_get_recovery_generation(di);
1437 ocfs2_get_recovery_generation(di);
1438 brelse(bh); 1438 brelse(bh);
1439 bh = NULL; 1439 bh = NULL;
1440 1440
1441 spin_lock(&osb->osb_lock);
1442 osb->slot_recovery_generations[i] = gen;
1443
1441 mlog(0, "Slot %u recovery generation is %u\n", i, 1444 mlog(0, "Slot %u recovery generation is %u\n", i,
1442 osb->slot_recovery_generations[i]); 1445 osb->slot_recovery_generations[i]);
1443 1446
1444 if (i == osb->slot_num) 1447 if (i == osb->slot_num) {
1448 spin_unlock(&osb->osb_lock);
1445 continue; 1449 continue;
1450 }
1446 1451
1447 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); 1452 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1448 if (status == -ENOENT) 1453 if (status == -ENOENT) {
1454 spin_unlock(&osb->osb_lock);
1449 continue; 1455 continue;
1456 }
1450 1457
1451 if (__ocfs2_recovery_map_test(osb, node_num)) 1458 if (__ocfs2_recovery_map_test(osb, node_num)) {
1459 spin_unlock(&osb->osb_lock);
1452 continue; 1460 continue;
1461 }
1453 spin_unlock(&osb->osb_lock); 1462 spin_unlock(&osb->osb_lock);
1454 1463
1455 /* Ok, we have a slot occupied by another node which 1464 /* Ok, we have a slot occupied by another node which
@@ -1465,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1465 mlog_errno(status); 1474 mlog_errno(status);
1466 goto bail; 1475 goto bail;
1467 } 1476 }
1468
1469 spin_lock(&osb->osb_lock);
1470 } 1477 }
1471 spin_unlock(&osb->osb_lock);
1472 1478
1473 status = 0; 1479 status = 0;
1474bail: 1480bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05f..d4d14e9a3cea 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/jbd.h> 30#ifndef CONFIG_OCFS2_COMPAT_JBD
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
31 36
32enum ocfs2_journal_state { 37enum ocfs2_journal_state {
33 OCFS2_JOURNAL_FREE = 0, 38 OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
215 * buffer. Will have to call ocfs2_journal_dirty once 220 * buffer. Will have to call ocfs2_journal_dirty once
216 * we've actually dirtied it. Type is one of . or . 221 * we've actually dirtied it. Type is one of . or .
217 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
218 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before 223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
219 * the current handle commits. 224 * the current handle commits.
220 */ 225 */
221 226
222/* You must always start_trans with a number of buffs > 0, but it's 227/* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
268 */ 273 */
269int ocfs2_journal_dirty(handle_t *handle, 274int ocfs2_journal_dirty(handle_t *handle,
270 struct buffer_head *bh); 275 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
271int ocfs2_journal_dirty_data(handle_t *handle, 277int ocfs2_journal_dirty_data(handle_t *handle,
272 struct buffer_head *bh); 278 struct buffer_head *bh);
279#endif
273 280
274/* 281/*
275 * Credit Macros: 282 * Credit Macros:
@@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
283/* simple file updates like chmod, etc. */ 290/* simple file updates like chmod, etc. */
284#define OCFS2_INODE_UPDATE_CREDITS 1 291#define OCFS2_INODE_UPDATE_CREDITS 1
285 292
293/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295
286/* group extend. inode update and last group update. */ 296/* group extend. inode update and last group update. */
287#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
288 298
@@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
340#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
341 + OCFS2_UNLINK_CREDITS) 351 + OCFS2_UNLINK_CREDITS)
342 352
353/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group,
355 * dinode, xattr block */
356#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
357 + OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
359
360/*
361 * Please note that the caller must make sure that root_el is the root
362 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
363 * the result may be wrong.
364 */
343static inline int ocfs2_calc_extend_credits(struct super_block *sb, 365static inline int ocfs2_calc_extend_credits(struct super_block *sb,
344 struct ocfs2_dinode *fe, 366 struct ocfs2_extent_list *root_el,
345 u32 bits_wanted) 367 u32 bits_wanted)
346{ 368{
347 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; 369 int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
348 370
349 /* bitmap dinode, group desc. + relinked group. */ 371 /* bitmap dinode, group desc. + relinked group. */
350 bitmap_blocks = OCFS2_SUBALLOC_ALLOC; 372 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
355 * however many metadata chunks needed * a remaining suballoc 377 * however many metadata chunks needed * a remaining suballoc
356 * alloc. */ 378 * alloc. */
357 sysfile_bitmap_blocks = 1 + 379 sysfile_bitmap_blocks = 1 +
358 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); 380 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
359 381
360 /* this does not include *new* metadata blocks, which are 382 /* this does not include *new* metadata blocks, which are
361 * accounted for in sysfile_bitmap_blocks. fe + 383 * accounted for in sysfile_bitmap_blocks. root_el +
362 * prev. last_eb_blk + blocks along edge of tree. 384 * prev. last_eb_blk + blocks along edge of tree.
363 * calc_symlink_credits passes because we just need 1 385 * calc_symlink_credits passes because we just need 1
364 * credit for the dinode there. */ 386 * credit for the dinode there. */
365 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); 387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
366 388
367 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; 389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
368} 390}
369 391
370static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 392static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
415 return credits; 437 return credits;
416} 438}
417 439
440static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
441{
442 return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
443}
444
445static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
446 loff_t new_size)
447{
448 return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
449 new_size);
450}
451
418#endif /* OCFS2_JOURNAL_H */ 452#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec88..687b28713c32 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
31 32
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC 33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
@@ -47,8 +48,6 @@
47 48
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) 49#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49 50
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); 51static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53 52
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 77#ifdef CONFIG_OCFS2_FS_STATS
78
79static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
80{
81 file->private_data = inode->i_private;
82 return 0;
83}
84
85#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
86#define LA_DEBUG_VER 1
87static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
88 size_t count, loff_t *ppos)
89{
90 static DEFINE_MUTEX(la_debug_mutex);
91 struct ocfs2_super *osb = file->private_data;
92 int written, ret;
93 char *buf = osb->local_alloc_debug_buf;
94
95 mutex_lock(&la_debug_mutex);
96 memset(buf, 0, LA_DEBUG_BUF_SZ);
97
98 written = snprintf(buf, LA_DEBUG_BUF_SZ,
99 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
100 LA_DEBUG_VER,
101 (unsigned long long)osb->la_last_gd,
102 osb->local_alloc_default_bits,
103 osb->local_alloc_bits, osb->local_alloc_state);
104
105 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
106
107 mutex_unlock(&la_debug_mutex);
108 return ret;
109}
110
111static const struct file_operations ocfs2_la_debug_fops = {
112 .open = ocfs2_la_debug_open,
113 .read = ocfs2_la_debug_read,
114};
115
116static void ocfs2_init_la_debug(struct ocfs2_super *osb)
117{
118 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
119 if (!osb->local_alloc_debug_buf)
120 return;
121
122 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
123 S_IFREG|S_IRUSR,
124 osb->osb_debug_root,
125 osb,
126 &ocfs2_la_debug_fops);
127 if (!osb->local_alloc_debug) {
128 kfree(osb->local_alloc_debug_buf);
129 osb->local_alloc_debug_buf = NULL;
130 }
131}
132
133static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
134{
135 if (osb->local_alloc_debug)
136 debugfs_remove(osb->local_alloc_debug);
137
138 if (osb->local_alloc_debug_buf)
139 kfree(osb->local_alloc_debug_buf);
140
141 osb->local_alloc_debug_buf = NULL;
142 osb->local_alloc_debug = NULL;
143}
144#else /* CONFIG_OCFS2_FS_STATS */
145static void ocfs2_init_la_debug(struct ocfs2_super *osb)
146{
147 return;
148}
149static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
150{
151 return;
152}
153#endif
154
155static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
79{ 156{
80 BUG_ON(osb->s_clustersize_bits > 20); 157 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
158 osb->local_alloc_state == OCFS2_LA_ENABLED);
159}
81 160
82 /* Size local alloc windows by the megabyte */ 161void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits); 162 unsigned int num_clusters)
163{
164 spin_lock(&osb->osb_lock);
165 if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
166 osb->local_alloc_state == OCFS2_LA_THROTTLED)
167 if (num_clusters >= osb->local_alloc_default_bits) {
168 cancel_delayed_work(&osb->la_enable_wq);
169 osb->local_alloc_state = OCFS2_LA_ENABLED;
170 }
171 spin_unlock(&osb->osb_lock);
172}
173
174void ocfs2_la_enable_worker(struct work_struct *work)
175{
176 struct ocfs2_super *osb =
177 container_of(work, struct ocfs2_super,
178 la_enable_wq.work);
179 spin_lock(&osb->osb_lock);
180 osb->local_alloc_state = OCFS2_LA_ENABLED;
181 spin_unlock(&osb->osb_lock);
84} 182}
85 183
86/* 184/*
87 * Tell us whether a given allocation should use the local alloc 185 * Tell us whether a given allocation should use the local alloc
88 * file. Otherwise, it has to go to the main bitmap. 186 * file. Otherwise, it has to go to the main bitmap.
187 *
188 * This function does semi-dirty reads of local alloc size and state!
189 * This is ok however, as the values are re-checked once under mutex.
89 */ 190 */
90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 191int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
91{ 192{
92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0; 193 int ret = 0;
194 int la_bits;
195
196 spin_lock(&osb->osb_lock);
197 la_bits = osb->local_alloc_bits;
94 198
95 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 199 if (!ocfs2_la_state_enabled(osb))
96 goto bail; 200 goto bail;
97 201
98 /* la_bits should be at least twice the size (in clusters) of 202 /* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
106bail: 210bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", 211 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); 212 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
213 spin_unlock(&osb->osb_lock);
109 return ret; 214 return ret;
110} 215}
111 216
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
120 225
121 mlog_entry_void(); 226 mlog_entry_void();
122 227
123 if (osb->local_alloc_size == 0) 228 ocfs2_init_la_debug(osb);
229
230 if (osb->local_alloc_bits == 0)
124 goto bail; 231 goto bail;
125 232
126 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { 233 if (osb->local_alloc_bits >= osb->bitmap_cpg) {
127 mlog(ML_NOTICE, "Requested local alloc window %d is larger " 234 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
128 "than max possible %u. Using defaults.\n", 235 "than max possible %u. Using defaults.\n",
129 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); 236 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
130 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 237 osb->local_alloc_bits =
238 ocfs2_megabytes_to_clusters(osb->sb,
239 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
131 } 240 }
132 241
133 /* read the alloc off disk */ 242 /* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
139 goto bail; 248 goto bail;
140 } 249 }
141 250
142 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
143 &alloc_bh, 0, inode); 252 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
144 if (status < 0) { 253 if (status < 0) {
145 mlog_errno(status); 254 mlog_errno(status);
146 goto bail; 255 goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
185 294
186bail: 295bail:
187 if (status < 0) 296 if (status < 0)
188 if (alloc_bh) 297 brelse(alloc_bh);
189 brelse(alloc_bh);
190 if (inode) 298 if (inode)
191 iput(inode); 299 iput(inode);
192 300
193 mlog(0, "Local alloc window bits = %d\n", 301 if (status < 0)
194 ocfs2_local_alloc_window_bits(osb)); 302 ocfs2_shutdown_la_debug(osb);
303
304 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
195 305
196 mlog_exit(status); 306 mlog_exit(status);
197 return status; 307 return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
217 327
218 mlog_entry_void(); 328 mlog_entry_void();
219 329
330 cancel_delayed_work(&osb->la_enable_wq);
331 flush_workqueue(ocfs2_wq);
332
333 ocfs2_shutdown_la_debug(osb);
334
220 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 335 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
221 goto out; 336 goto out;
222 337
@@ -295,8 +410,7 @@ out_commit:
295 ocfs2_commit_trans(osb, handle); 410 ocfs2_commit_trans(osb, handle);
296 411
297out_unlock: 412out_unlock:
298 if (main_bm_bh) 413 brelse(main_bm_bh);
299 brelse(main_bm_bh);
300 414
301 ocfs2_inode_unlock(main_bm_inode, 1); 415 ocfs2_inode_unlock(main_bm_inode, 1);
302 416
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
345 459
346 mutex_lock(&inode->i_mutex); 460 mutex_lock(&inode->i_mutex);
347 461
348 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
349 &alloc_bh, 0, inode); 463 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
350 if (status < 0) { 464 if (status < 0) {
351 mlog_errno(status); 465 mlog_errno(status);
352 goto bail; 466 goto bail;
@@ -372,8 +486,7 @@ bail:
372 *alloc_copy = NULL; 486 *alloc_copy = NULL;
373 } 487 }
374 488
375 if (alloc_bh) 489 brelse(alloc_bh);
376 brelse(alloc_bh);
377 490
378 if (inode) { 491 if (inode) {
379 mutex_unlock(&inode->i_mutex); 492 mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
441out_mutex: 554out_mutex:
442 mutex_unlock(&main_bm_inode->i_mutex); 555 mutex_unlock(&main_bm_inode->i_mutex);
443 556
444 if (main_bm_bh) 557 brelse(main_bm_bh);
445 brelse(main_bm_bh);
446 558
447 iput(main_bm_inode); 559 iput(main_bm_inode);
448 560
@@ -453,8 +565,48 @@ out:
453 return status; 565 return status;
454} 566}
455 567
568/* Check to see if the local alloc window is within ac->ac_max_block */
569static int ocfs2_local_alloc_in_range(struct inode *inode,
570 struct ocfs2_alloc_context *ac,
571 u32 bits_wanted)
572{
573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 struct ocfs2_dinode *alloc;
575 struct ocfs2_local_alloc *la;
576 int start;
577 u64 block_off;
578
579 if (!ac->ac_max_block)
580 return 1;
581
582 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
583 la = OCFS2_LOCAL_ALLOC(alloc);
584
585 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
586 if (start == -1) {
587 mlog_errno(-ENOSPC);
588 return 0;
589 }
590
591 /*
592 * Converting (bm_off + start + bits_wanted) to blocks gives us
593 * the blkno just past our actual allocation. This is perfect
594 * to compare with ac_max_block.
595 */
596 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
597 le32_to_cpu(la->la_bm_off) +
598 start + bits_wanted);
599 mlog(0, "Checking %llu against %llu\n",
600 (unsigned long long)block_off,
601 (unsigned long long)ac->ac_max_block);
602 if (block_off > ac->ac_max_block)
603 return 0;
604
605 return 1;
606}
607
456/* 608/*
457 * make sure we've got at least bitswanted contiguous bits in the 609 * make sure we've got at least bits_wanted contiguous bits in the
458 * local alloc. You lose them when you drop i_mutex. 610 * local alloc. You lose them when you drop i_mutex.
459 * 611 *
460 * We will add ourselves to the transaction passed in, but may start 612 * We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
485 637
486 mutex_lock(&local_alloc_inode->i_mutex); 638 mutex_lock(&local_alloc_inode->i_mutex);
487 639
488 if (osb->local_alloc_state != OCFS2_LA_ENABLED) { 640 /*
489 status = -ENOSPC; 641 * We must double check state and allocator bits because
490 goto bail; 642 * another process may have changed them while holding i_mutex.
491 } 643 */
492 644 spin_lock(&osb->osb_lock);
493 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { 645 if (!ocfs2_la_state_enabled(osb) ||
494 mlog(0, "Asking for more than my max window size!\n"); 646 (bits_wanted > osb->local_alloc_bits)) {
647 spin_unlock(&osb->osb_lock);
495 status = -ENOSPC; 648 status = -ENOSPC;
496 goto bail; 649 goto bail;
497 } 650 }
651 spin_unlock(&osb->osb_lock);
498 652
499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 653 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
500 654
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
522 mlog_errno(status); 676 mlog_errno(status);
523 goto bail; 677 goto bail;
524 } 678 }
679
680 /*
681 * Under certain conditions, the window slide code
682 * might have reduced the number of bits available or
683 * disabled the the local alloc entirely. Re-check
684 * here and return -ENOSPC if necessary.
685 */
686 status = -ENOSPC;
687 if (!ocfs2_la_state_enabled(osb))
688 goto bail;
689
690 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
691 le32_to_cpu(alloc->id1.bitmap1.i_used);
692 if (bits_wanted > free_bits)
693 goto bail;
694 }
695
696 if (ac->ac_max_block)
697 mlog(0, "Calling in_range for max block %llu\n",
698 (unsigned long long)ac->ac_max_block);
699
700 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
701 bits_wanted)) {
702 /*
703 * The window is outside ac->ac_max_block.
704 * This errno tells the caller to keep localalloc enabled
705 * but to get the allocation from the main bitmap.
706 */
707 status = -EFBIG;
708 goto bail;
525 } 709 }
526 710
527 ac->ac_inode = local_alloc_inode; 711 ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
789 return status; 973 return status;
790} 974}
791 975
976enum ocfs2_la_event {
977 OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
978 OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
979 * enough bits theoretically
980 * free, but a contiguous
981 * allocation could not be
982 * found. */
983 OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
984 * enough bits free to satisfy
985 * our request. */
986};
987#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
988/*
989 * Given an event, calculate the size of our next local alloc window.
990 *
991 * This should always be called under i_mutex of the local alloc inode
992 * so that local alloc disabling doesn't race with processes trying to
993 * use the allocator.
994 *
995 * Returns the state which the local alloc was left in. This value can
996 * be ignored by some paths.
997 */
998static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
999 enum ocfs2_la_event event)
1000{
1001 unsigned int bits;
1002 int state;
1003
1004 spin_lock(&osb->osb_lock);
1005 if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
1006 WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
1007 goto out_unlock;
1008 }
1009
1010 /*
1011 * ENOSPC and fragmentation are treated similarly for now.
1012 */
1013 if (event == OCFS2_LA_EVENT_ENOSPC ||
1014 event == OCFS2_LA_EVENT_FRAGMENTED) {
1015 /*
1016 * We ran out of contiguous space in the primary
1017 * bitmap. Drastically reduce the number of bits used
1018 * by local alloc until we have to disable it.
1019 */
1020 bits = osb->local_alloc_bits >> 1;
1021 if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
1022 /*
1023 * By setting state to THROTTLED, we'll keep
1024 * the number of local alloc bits used down
1025 * until an event occurs which would give us
1026 * reason to assume the bitmap situation might
1027 * have changed.
1028 */
1029 osb->local_alloc_state = OCFS2_LA_THROTTLED;
1030 osb->local_alloc_bits = bits;
1031 } else {
1032 osb->local_alloc_state = OCFS2_LA_DISABLED;
1033 }
1034 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
1035 OCFS2_LA_ENABLE_INTERVAL);
1036 goto out_unlock;
1037 }
1038
1039 /*
1040 * Don't increase the size of the local alloc window until we
1041 * know we might be able to fulfill the request. Otherwise, we
1042 * risk bouncing around the global bitmap during periods of
1043 * low space.
1044 */
1045 if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
1046 osb->local_alloc_bits = osb->local_alloc_default_bits;
1047
1048out_unlock:
1049 state = osb->local_alloc_state;
1050 spin_unlock(&osb->osb_lock);
1051
1052 return state;
1053}
1054
792static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, 1055static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
793 struct ocfs2_alloc_context **ac, 1056 struct ocfs2_alloc_context **ac,
794 struct inode **bitmap_inode, 1057 struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
803 goto bail; 1066 goto bail;
804 } 1067 }
805 1068
806 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); 1069retry_enospc:
1070 (*ac)->ac_bits_wanted = osb->local_alloc_bits;
807 1071
808 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1072 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1073 if (status == -ENOSPC) {
1074 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
1075 OCFS2_LA_DISABLED)
1076 goto bail;
1077
1078 ocfs2_free_ac_resource(*ac);
1079 memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
1080 goto retry_enospc;
1081 }
809 if (status < 0) { 1082 if (status < 0) {
810 if (status != -ENOSPC) 1083 mlog_errno(status);
811 mlog_errno(status);
812 goto bail; 1084 goto bail;
813 } 1085 }
814 1086
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
849 "one\n"); 1121 "one\n");
850 1122
851 mlog(0, "Allocating %u clusters for a new window.\n", 1123 mlog(0, "Allocating %u clusters for a new window.\n",
852 ocfs2_local_alloc_window_bits(osb)); 1124 osb->local_alloc_bits);
853 1125
854 /* Instruct the allocation code to try the most recently used 1126 /* Instruct the allocation code to try the most recently used
855 * cluster group. We'll re-record the group used this pass 1127 * cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
859 /* we used the generic suballoc reserve function, but we set 1131 /* we used the generic suballoc reserve function, but we set
860 * everything up nicely, so there's no reason why we can't use 1132 * everything up nicely, so there's no reason why we can't use
861 * the more specific cluster api to claim bits. */ 1133 * the more specific cluster api to claim bits. */
862 status = ocfs2_claim_clusters(osb, handle, ac, 1134 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
863 ocfs2_local_alloc_window_bits(osb),
864 &cluster_off, &cluster_count); 1135 &cluster_off, &cluster_count);
1136 if (status == -ENOSPC) {
1137retry_enospc:
1138 /*
1139 * Note: We could also try syncing the journal here to
1140 * allow use of any free bits which the current
1141 * transaction can't give us access to. --Mark
1142 */
1143 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
1144 OCFS2_LA_DISABLED)
1145 goto bail;
1146
1147 status = ocfs2_claim_clusters(osb, handle, ac,
1148 osb->local_alloc_bits,
1149 &cluster_off,
1150 &cluster_count);
1151 if (status == -ENOSPC)
1152 goto retry_enospc;
1153 /*
1154 * We only shrunk the *minimum* number of in our
1155 * request - it's entirely possible that the allocator
1156 * might give us more than we asked for.
1157 */
1158 if (status == 0) {
1159 spin_lock(&osb->osb_lock);
1160 osb->local_alloc_bits = cluster_count;
1161 spin_unlock(&osb->osb_lock);
1162 }
1163 }
865 if (status < 0) { 1164 if (status < 0) {
866 if (status != -ENOSPC) 1165 if (status != -ENOSPC)
867 mlog_errno(status); 1166 mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
905 1204
906 mlog_entry_void(); 1205 mlog_entry_void();
907 1206
1207 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1208
908 /* This will lock the main bitmap for us. */ 1209 /* This will lock the main bitmap for us. */
909 status = ocfs2_local_alloc_reserve_for_window(osb, 1210 status = ocfs2_local_alloc_reserve_for_window(osb,
910 &ac, 1211 &ac,
@@ -976,8 +1277,7 @@ bail:
976 if (handle) 1277 if (handle)
977 ocfs2_commit_trans(osb, handle); 1278 ocfs2_commit_trans(osb, handle);
978 1279
979 if (main_bm_bh) 1280 brelse(main_bm_bh);
980 brelse(main_bm_bh);
981 1281
982 if (main_bm_inode) 1282 if (main_bm_inode)
983 iput(main_bm_inode); 1283 iput(main_bm_inode);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110c..ac5ea9f86653 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
52 u32 *bit_off, 52 u32 *bit_off,
53 u32 *num_bits); 53 u32 *num_bits);
54 54
55void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
56 unsigned int num_clusters);
57void ocfs2_la_enable_worker(struct work_struct *work);
58
55#endif /* OCFS2_LOCALALLOC_H */ 59#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f87143877..544ac6245175 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fcntl.h>
27 28
28#define MLOG_MASK_PREFIX ML_INODE 29#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32 33
33#include "dlmglue.h" 34#include "dlmglue.h"
34#include "file.h" 35#include "file.h"
36#include "inode.h"
35#include "locks.h" 37#include "locks.h"
36 38
37static int ocfs2_do_flock(struct file *file, struct inode *inode, 39static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
123 else 125 else
124 return ocfs2_do_flock(file, inode, cmd, fl); 126 return ocfs2_do_flock(file, inode, cmd, fl);
125} 127}
128
129int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
130{
131 struct inode *inode = file->f_mapping->host;
132 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
133
134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK;
136 if (__mandatory_lock(inode))
137 return -ENOLCK;
138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
140}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324ec..496d488b271f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
27#define OCFS2_LOCKS_H 27#define OCFS2_LOCKS_H
28 28
29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
30int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
30 31
31#endif /* OCFS2_LOCKS_H */ 32#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe0140..485a6aa0ad39 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "xattr.h"
63 64
64#include "buffer_head_io.h" 65#include "buffer_head_io.h"
65 66
@@ -327,14 +328,9 @@ leave:
327 if (status == -ENOSPC) 328 if (status == -ENOSPC)
328 mlog(0, "Disk is full\n"); 329 mlog(0, "Disk is full\n");
329 330
330 if (new_fe_bh) 331 brelse(new_fe_bh);
331 brelse(new_fe_bh); 332 brelse(de_bh);
332 333 brelse(parent_fe_bh);
333 if (de_bh)
334 brelse(de_bh);
335
336 if (parent_fe_bh)
337 brelse(parent_fe_bh);
338 334
339 if ((status < 0) && inode) 335 if ((status < 0) && inode)
340 iput(inode); 336 iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
647out: 643out:
648 ocfs2_inode_unlock(dir, 1); 644 ocfs2_inode_unlock(dir, 1);
649 645
650 if (de_bh) 646 brelse(de_bh);
651 brelse(de_bh); 647 brelse(fe_bh);
652 if (fe_bh) 648 brelse(parent_fe_bh);
653 brelse(fe_bh);
654 if (parent_fe_bh)
655 brelse(parent_fe_bh);
656 649
657 mlog_exit(err); 650 mlog_exit(err);
658 651
@@ -851,17 +844,10 @@ leave:
851 iput(orphan_dir); 844 iput(orphan_dir);
852 } 845 }
853 846
854 if (fe_bh) 847 brelse(fe_bh);
855 brelse(fe_bh); 848 brelse(dirent_bh);
856 849 brelse(parent_node_bh);
857 if (dirent_bh) 850 brelse(orphan_entry_bh);
858 brelse(dirent_bh);
859
860 if (parent_node_bh)
861 brelse(parent_node_bh);
862
863 if (orphan_entry_bh)
864 brelse(orphan_entry_bh);
865 851
866 mlog_exit(status); 852 mlog_exit(status);
867 853
@@ -1372,24 +1358,15 @@ bail:
1372 1358
1373 if (new_inode) 1359 if (new_inode)
1374 iput(new_inode); 1360 iput(new_inode);
1375 if (newfe_bh) 1361 brelse(newfe_bh);
1376 brelse(newfe_bh); 1362 brelse(old_inode_bh);
1377 if (old_inode_bh) 1363 brelse(old_dir_bh);
1378 brelse(old_inode_bh); 1364 brelse(new_dir_bh);
1379 if (old_dir_bh) 1365 brelse(new_de_bh);
1380 brelse(old_dir_bh); 1366 brelse(old_de_bh);
1381 if (new_dir_bh) 1367 brelse(old_inode_de_bh);
1382 brelse(new_dir_bh); 1368 brelse(orphan_entry_bh);
1383 if (new_de_bh) 1369 brelse(insert_entry_bh);
1384 brelse(new_de_bh);
1385 if (old_de_bh)
1386 brelse(old_de_bh);
1387 if (old_inode_de_bh)
1388 brelse(old_inode_de_bh);
1389 if (orphan_entry_bh)
1390 brelse(orphan_entry_bh);
1391 if (insert_entry_bh)
1392 brelse(insert_entry_bh);
1393 1370
1394 mlog_exit(status); 1371 mlog_exit(status);
1395 1372
@@ -1492,8 +1469,7 @@ bail:
1492 1469
1493 if (bhs) { 1470 if (bhs) {
1494 for(i = 0; i < blocks; i++) 1471 for(i = 0; i < blocks; i++)
1495 if (bhs[i]) 1472 brelse(bhs[i]);
1496 brelse(bhs[i]);
1497 kfree(bhs); 1473 kfree(bhs);
1498 } 1474 }
1499 1475
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
1598 u32 offset = 0; 1574 u32 offset = 0;
1599 1575
1600 inode->i_op = &ocfs2_symlink_inode_operations; 1576 inode->i_op = &ocfs2_symlink_inode_operations;
1601 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, 1577 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1602 new_fe_bh, 1578 new_fe_bh,
1603 handle, data_ac, NULL, 1579 handle, data_ac, NULL,
1604 NULL); 1580 NULL);
1605 if (status < 0) { 1581 if (status < 0) {
1606 if (status != -ENOSPC && status != -EINTR) { 1582 if (status != -ENOSPC && status != -EINTR) {
1607 mlog(ML_ERROR, 1583 mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
1659 1635
1660 ocfs2_inode_unlock(dir, 1); 1636 ocfs2_inode_unlock(dir, 1);
1661 1637
1662 if (new_fe_bh) 1638 brelse(new_fe_bh);
1663 brelse(new_fe_bh); 1639 brelse(parent_fe_bh);
1664 if (parent_fe_bh) 1640 brelse(de_bh);
1665 brelse(parent_fe_bh);
1666 if (de_bh)
1667 brelse(de_bh);
1668 if (inode_ac) 1641 if (inode_ac)
1669 ocfs2_free_alloc_context(inode_ac); 1642 ocfs2_free_alloc_context(inode_ac);
1670 if (data_ac) 1643 if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
1759 iput(orphan_dir_inode); 1732 iput(orphan_dir_inode);
1760 } 1733 }
1761 1734
1762 if (orphan_dir_bh) 1735 brelse(orphan_dir_bh);
1763 brelse(orphan_dir_bh);
1764 1736
1765 mlog_exit(status); 1737 mlog_exit(status);
1766 return status; 1738 return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1780 1752
1781 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1753 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1782 1754
1783 status = ocfs2_read_block(osb, 1755 status = ocfs2_read_block(orphan_dir_inode,
1784 OCFS2_I(orphan_dir_inode)->ip_blkno, 1756 OCFS2_I(orphan_dir_inode)->ip_blkno,
1785 &orphan_dir_bh, OCFS2_BH_CACHED, 1757 &orphan_dir_bh);
1786 orphan_dir_inode);
1787 if (status < 0) { 1758 if (status < 0) {
1788 mlog_errno(status); 1759 mlog_errno(status);
1789 goto leave; 1760 goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1829 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1800 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1830 1801
1831leave: 1802leave:
1832 if (orphan_dir_bh) 1803 brelse(orphan_dir_bh);
1833 brelse(orphan_dir_bh);
1834 1804
1835 mlog_exit(status); 1805 mlog_exit(status);
1836 return status; 1806 return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1898 } 1868 }
1899 1869
1900leave: 1870leave:
1901 if (target_de_bh) 1871 brelse(target_de_bh);
1902 brelse(target_de_bh);
1903 1872
1904 mlog_exit(status); 1873 mlog_exit(status);
1905 return status; 1874 return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
1918 .setattr = ocfs2_setattr, 1887 .setattr = ocfs2_setattr,
1919 .getattr = ocfs2_getattr, 1888 .getattr = ocfs2_getattr,
1920 .permission = ocfs2_permission, 1889 .permission = ocfs2_permission,
1890 .setxattr = generic_setxattr,
1891 .getxattr = generic_getxattr,
1892 .listxattr = ocfs2_listxattr,
1893 .removexattr = generic_removexattr,
1921}; 1894};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b1117..a21a465490c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#ifndef CONFIG_OCFS2_COMPAT_JBD
38# include <linux/jbd2.h>
39#else
40# include <linux/jbd.h>
41# include "ocfs2_jbd_compat.h"
42#endif
38 43
39/* For union ocfs2_dlm_lksb */ 44/* For union ocfs2_dlm_lksb */
40#include "stackglue.h" 45#include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
171 176
172enum ocfs2_local_alloc_state 177enum ocfs2_local_alloc_state
173{ 178{
174 OCFS2_LA_UNUSED = 0, 179 OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
175 OCFS2_LA_ENABLED, 180 * this mountpoint. */
176 OCFS2_LA_DISABLED 181 OCFS2_LA_ENABLED, /* Local alloc is in use. */
182 OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
183 * of bits has been reduced. */
184 OCFS2_LA_DISABLED /* Local alloc has temporarily been
185 * disabled. */
177}; 186};
178 187
179enum ocfs2_mount_options 188enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
184 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 193 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
185 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 194 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
186 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
187}; 198};
188 199
189#define OCFS2_OSB_SOFT_RO 0x0001 200#define OCFS2_OSB_SOFT_RO 0x0001
@@ -214,6 +225,7 @@ struct ocfs2_super
214 u32 bitmap_cpg; 225 u32 bitmap_cpg;
215 u8 *uuid; 226 u8 *uuid;
216 char *uuid_str; 227 char *uuid_str;
228 u32 uuid_hash;
217 u8 *vol_label; 229 u8 *vol_label;
218 u64 first_cluster_group_blkno; 230 u64 first_cluster_group_blkno;
219 u32 fs_generation; 231 u32 fs_generation;
@@ -241,6 +253,7 @@ struct ocfs2_super
241 int s_sectsize_bits; 253 int s_sectsize_bits;
242 int s_clustersize; 254 int s_clustersize;
243 int s_clustersize_bits; 255 int s_clustersize_bits;
256 unsigned int s_xattr_inline_size;
244 257
245 atomic_t vol_state; 258 atomic_t vol_state;
246 struct mutex recovery_lock; 259 struct mutex recovery_lock;
@@ -252,11 +265,27 @@ struct ocfs2_super
252 struct ocfs2_journal *journal; 265 struct ocfs2_journal *journal;
253 unsigned long osb_commit_interval; 266 unsigned long osb_commit_interval;
254 267
255 int local_alloc_size; 268 struct delayed_work la_enable_wq;
256 enum ocfs2_local_alloc_state local_alloc_state; 269
270 /*
271 * Must hold local alloc i_mutex and osb->osb_lock to change
272 * local_alloc_bits. Reads can be done under either lock.
273 */
274 unsigned int local_alloc_bits;
275 unsigned int local_alloc_default_bits;
276
277 enum ocfs2_local_alloc_state local_alloc_state; /* protected
278 * by osb_lock */
279
257 struct buffer_head *local_alloc_bh; 280 struct buffer_head *local_alloc_bh;
281
258 u64 la_last_gd; 282 u64 la_last_gd;
259 283
284#ifdef CONFIG_OCFS2_FS_STATS
285 struct dentry *local_alloc_debug;
286 char *local_alloc_debug_buf;
287#endif
288
260 /* Next two fields are for local node slot recovery during 289 /* Next two fields are for local node slot recovery during
261 * mount. */ 290 * mount. */
262 int dirty; 291 int dirty;
@@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
340 return 0; 369 return 0;
341} 370}
342 371
372static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
373{
374 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
375 return 1;
376 return 0;
377}
378
343/* set / clear functions because cluster events can make these happen 379/* set / clear functions because cluster events can make these happen
344 * in parallel so we want the transitions to be atomic. this also 380 * in parallel so we want the transitions to be atomic. this also
345 * means that any future flags osb_flags must be protected by spinlock 381 * means that any future flags osb_flags must be protected by spinlock
@@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
554 return pages_per_cluster; 590 return pages_per_cluster;
555} 591}
556 592
593static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
594 unsigned int megs)
595{
596 BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
597
598 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
599}
600
557static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 601static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
558{ 602{
559 spin_lock(&osb->osb_lock); 603 spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf7..f24ce3d3f956 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
64#define OCFS2_INODE_SIGNATURE "INODE01" 64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
67 68
68/* Compatibility flags */ 69/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -90,7 +91,8 @@
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 91 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 92 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 93 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) 94 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
95 | OCFS2_FEATURE_INCOMPAT_XATTR)
94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 96#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
95 97
96/* 98/*
@@ -127,10 +129,6 @@
127/* Support for data packed into inode blocks */ 129/* Support for data packed into inode blocks */
128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 130#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
129 131
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/* 132/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock 133 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as 134 * field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
142 */ 140 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 141#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144 142
143/* Support for the extended slot map */
144#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
145
146/* Support for extended attributes */
147#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
148
145/* 149/*
146 * backup superblock flag is used to indicate that this volume 150 * backup superblock flag is used to indicate that this volume
147 * has backup superblocks. 151 * has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
299 */ 303 */
300#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 304#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
301 305
306/*
307 * Inline extended attribute size (in bytes)
308 * The value chosen should be aligned to 16 byte boundaries.
309 */
310#define OCFS2_MIN_XATTR_INLINE_SIZE 256
311
302struct ocfs2_system_inode_info { 312struct ocfs2_system_inode_info {
303 char *si_name; 313 char *si_name;
304 int si_iflags; 314 int si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
563/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts 573/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
564 before tunefs required */ 574 before tunefs required */
565 __le16 s_tunefs_flag; 575 __le16 s_tunefs_flag;
566 __le32 s_reserved1; 576 __le32 s_uuid_hash; /* hash value of uuid */
567 __le64 s_first_cluster_group; /* Block offset of 1st cluster 577 __le64 s_first_cluster_group; /* Block offset of 1st cluster
568 * group header */ 578 * group header */
569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 579/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 581/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid 582 stack. Only valid
573 with INCOMPAT flag. */ 583 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ 584/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
585 for this fs*/
586 __le16 s_reserved0;
587 __le32 s_reserved1;
588/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
575/*140*/ 589/*140*/
576 590
577 /* 591 /*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
621 belongs to */ 635 belongs to */
622 __le16 i_suballoc_bit; /* Bit offset in suballocator 636 __le16 i_suballoc_bit; /* Bit offset in suballocator
623 block group */ 637 block group */
624/*10*/ __le32 i_reserved0; 638/*10*/ __le16 i_reserved0;
639 __le16 i_xattr_inline_size;
625 __le32 i_clusters; /* Cluster count */ 640 __le32 i_clusters; /* Cluster count */
626 __le32 i_uid; /* Owner UID */ 641 __le32 i_uid; /* Owner UID */
627 __le32 i_gid; /* Owning GID */ 642 __le32 i_gid; /* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
640 __le32 i_atime_nsec; 655 __le32 i_atime_nsec;
641 __le32 i_ctime_nsec; 656 __le32 i_ctime_nsec;
642 __le32 i_mtime_nsec; 657 __le32 i_mtime_nsec;
643 __le32 i_attr; 658/*70*/ __le32 i_attr;
644 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL 659 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
645 was set in i_flags */ 660 was set in i_flags */
646 __le16 i_dyn_features; 661 __le16 i_dyn_features;
647/*70*/ __le64 i_reserved2[8]; 662 __le64 i_xattr_loc;
663/*80*/ __le64 i_reserved2[7];
648/*B8*/ union { 664/*B8*/ union {
649 __le64 i_pad1; /* Generic way to refer to this 665 __le64 i_pad1; /* Generic way to refer to this
650 64bit union */ 666 64bit union */
@@ -715,6 +731,136 @@ struct ocfs2_group_desc
715/*40*/ __u8 bg_bitmap[0]; 731/*40*/ __u8 bg_bitmap[0];
716}; 732};
717 733
734/*
735 * On disk extended attribute structure for OCFS2.
736 */
737
738/*
739 * ocfs2_xattr_entry indicates one extended attribute.
740 *
741 * Note that it can be stored in inode, one block or one xattr bucket.
742 */
743struct ocfs2_xattr_entry {
744 __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
745 __le16 xe_name_offset; /* byte offset from the 1st entry in the
746 local xattr storage (inode, xattr block or
747 xattr bucket). */
748 __u8 xe_name_len; /* xattr name len, doesn't include prefix. */
749 __u8 xe_type; /* the low 7 bits indicate the name prefix's
750 * type and the highest bit indicates whether
751 * the EA is stored in the local storage. */
752 __le64 xe_value_size; /* real xattr value length. */
753};
754
755/*
756 * On disk structure for xattr header.
757 *
758 * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in
759 * the local xattr storage.
760 */
761struct ocfs2_xattr_header {
762 __le16 xh_count; /* contains the count of how
763 many records are in the
764 local xattr storage. */
765 __le16 xh_free_start; /* current offset for storing
766 xattr. */
767 __le16 xh_name_value_len; /* total length of name/value
768 length in this bucket. */
769 __le16 xh_num_buckets; /* bucket nums in one extent
770 record, only valid in the
771 first bucket. */
772 __le64 xh_csum;
773 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
774};
775
776/*
777 * On disk structure for xattr value root.
778 *
779 * It is used when one extended attribute's size is large, and we will save it
780 * in an outside cluster. It will be stored in a b-tree, like file content.
781 */
782struct ocfs2_xattr_value_root {
783/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
784 __le32 xr_reserved0;
785 __le64 xr_last_eb_blk; /* Pointer to last extent block */
786/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
787};
788
789/*
790 * On disk structure for xattr tree root.
791 *
792 * It is used when there are too many extended attributes for one file. These
793 * attributes will be organized and stored in an indexed-btree.
794 */
795struct ocfs2_xattr_tree_root {
796/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
797 __le32 xt_reserved0;
798 __le64 xt_last_eb_blk; /* Pointer to last extent block */
799/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
800};
801
802#define OCFS2_XATTR_INDEXED 0x1
803#define OCFS2_HASH_SHIFT 5
804#define OCFS2_XATTR_ROUND 3
805#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
806 ~(OCFS2_XATTR_ROUND))
807
808#define OCFS2_XATTR_BUCKET_SIZE 4096
809#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
810 / OCFS2_MIN_BLOCKSIZE)
811
812/*
813 * On disk structure for xattr block.
814 */
815struct ocfs2_xattr_block {
816/*00*/ __u8 xb_signature[8]; /* Signature for verification */
817 __le16 xb_suballoc_slot; /* Slot suballocator this
818 block belongs to. */
819 __le16 xb_suballoc_bit; /* Bit offset in suballocator
820 block group */
821 __le32 xb_fs_generation; /* Must match super block */
822/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
823 __le64 xb_csum;
824/*20*/ __le16 xb_flags; /* Indicates whether this block contains
825 real xattr or a xattr tree. */
826 __le16 xb_reserved0;
827 __le32 xb_reserved1;
828 __le64 xb_reserved2;
829/*30*/ union {
830 struct ocfs2_xattr_header xb_header; /* xattr header if this
831 block contains xattr */
832 struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
833 block contains xattr
834 tree. */
835 } xb_attrs;
836};
837
838#define OCFS2_XATTR_ENTRY_LOCAL 0x80
839#define OCFS2_XATTR_TYPE_MASK 0x7F
840static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
841 int local)
842{
843 if (local)
844 xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
845 else
846 xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
847}
848
849static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
850{
851 return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
852}
853
854static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
855{
856 xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
857}
858
859static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
860{
861 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
862}
863
718#ifdef __KERNEL__ 864#ifdef __KERNEL__
719static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 865static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
720{ 866{
@@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
728 offsetof(struct ocfs2_dinode, id2.i_data.id_data); 874 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729} 875}
730 876
877static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
878 struct ocfs2_dinode *di)
879{
880 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
881
882 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
883 return sb->s_blocksize -
884 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
885 xattrsize;
886 else
887 return sb->s_blocksize -
888 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
889}
890
731static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) 891static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
732{ 892{
733 int size; 893 int size;
@@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
738 return size / sizeof(struct ocfs2_extent_rec); 898 return size / sizeof(struct ocfs2_extent_rec);
739} 899}
740 900
901static inline int ocfs2_extent_recs_per_inode_with_xattr(
902 struct super_block *sb,
903 struct ocfs2_dinode *di)
904{
905 int size;
906 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
907
908 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
909 size = sb->s_blocksize -
910 offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
911 xattrsize;
912 else
913 size = sb->s_blocksize -
914 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
915
916 return size / sizeof(struct ocfs2_extent_rec);
917}
918
741static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 919static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
742{ 920{
743 int size; 921 int size;
@@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
801 return 0; 979 return 0;
802 980
803} 981}
982
983static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
984{
985 int size;
986
987 size = sb->s_blocksize -
988 offsetof(struct ocfs2_xattr_block,
989 xb_attrs.xb_root.xt_list.l_recs);
990
991 return size / sizeof(struct ocfs2_extent_rec);
992}
804#else 993#else
805static inline int ocfs2_fast_symlink_chars(int blocksize) 994static inline int ocfs2_fast_symlink_chars(int blocksize)
806{ 995{
@@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
884 1073
885 return 0; 1074 return 0;
886} 1075}
1076
1077static inline int ocfs2_xattr_recs_per_xb(int blocksize)
1078{
1079 int size;
1080
1081 size = blocksize -
1082 offsetof(struct ocfs2_xattr_block,
1083 xb_attrs.xb_root.xt_list.l_recs);
1084
1085 return size / sizeof(struct ocfs2_extent_rec);
1086}
887#endif /* __KERNEL__ */ 1087#endif /* __KERNEL__ */
888 1088
889 1089
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 000000000000..b91c78f8f558
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e9015..ffd48db229a7 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
200 if (cluster > clusters) 200 if (cluster > clusters)
201 break; 201 break;
202 202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); 203 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
204 if (ret < 0) { 204 if (ret < 0) {
205 mlog_errno(ret); 205 mlog_errno(ret);
206 break; 206 break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
236 * update the superblock last. 236 * update the superblock last.
237 * It doesn't matter if the write failed. 237 * It doesn't matter if the write failed.
238 */ 238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, 239 ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
240 &super_bh, 0, NULL); 240 &super_bh);
241 if (ret < 0) { 241 if (ret < 0) {
242 mlog_errno(ret); 242 mlog_errno(ret);
243 goto out; 243 goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, 332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1); 333 first_new_cluster - 1);
334 334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, 335 ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
336 main_bm_inode);
337 if (ret < 0) { 336 if (ret < 0) {
338 mlog_errno(ret); 337 mlog_errno(ret);
339 goto out_unlock; 338 goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
540 goto out_unlock; 539 goto out_unlock;
541 } 540 }
542 541
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); 542 ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
544 if (ret < 0) { 543 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu " 544 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group); 545 "from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf1..bdda2d8f8508 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If 150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, 153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
154 si->si_inode); 154 OCFS2_BH_IGNORE_CACHE);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
404 (unsigned long long)blkno); 404 (unsigned long long)blkno);
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); 407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
408 OCFS2_BH_IGNORE_CACHE);
408 if (status < 0) { 409 if (status < 0) {
409 mlog_errno(status); 410 mlog_errno(status);
410 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c6748..faec2d879357 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
28#include "ocfs2.h" /* For struct ocfs2_lock_res */ 28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h>
31 32
32/* 33/*
33 * The control protocol starts with a handshake. Until the handshake 34 * The control protocol starts with a handshake. Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
746{ 747{
747} 748}
748 749
750static int user_plock(struct ocfs2_cluster_connection *conn,
751 u64 ino,
752 struct file *file,
753 int cmd,
754 struct file_lock *fl)
755{
756 /*
757 * This more or less just demuxes the plock request into any
758 * one of three dlm calls.
759 *
760 * Internally, fs/dlm will pass these to a misc device, which
761 * a userspace daemon will read and write to.
762 *
763 * For now, cancel requests (which happen internally only),
764 * are turned into unlocks. Most of this function taken from
765 * gfs2_lock.
766 */
767
768 if (cmd == F_CANCELLK) {
769 cmd = F_SETLK;
770 fl->fl_type = F_UNLCK;
771 }
772
773 if (IS_GETLK(cmd))
774 return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
775 else if (fl->fl_type == F_UNLCK)
776 return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
777 else
778 return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
779}
780
749/* 781/*
750 * Compare a requested locking protocol version against the current one. 782 * Compare a requested locking protocol version against the current one.
751 * 783 *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
839 .dlm_unlock = user_dlm_unlock, 871 .dlm_unlock = user_dlm_unlock,
840 .lock_status = user_dlm_lock_status, 872 .lock_status = user_dlm_lock_status,
841 .lock_lvb = user_dlm_lvb, 873 .lock_lvb = user_dlm_lvb,
874 .plock = user_plock,
842 .dump_lksb = user_dlm_dump_lksb, 875 .dump_lksb = user_dlm_dump_lksb,
843}; 876};
844 877
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3a..68b668b0e60a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
97 goto out; 97 goto out;
98 } 98 }
99 99
100 /* Ok, the stack is pinned */
101 p->sp_count++;
102 active_stack = p; 100 active_stack = p;
103
104 rc = 0; 101 rc = 0;
105 102
106out: 103out:
104 /* If we found it, pin it */
105 if (!rc)
106 active_stack->sp_count++;
107
107 spin_unlock(&ocfs2_stack_lock); 108 spin_unlock(&ocfs2_stack_lock);
108 return rc; 109 return rc;
109} 110}
@@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
287} 288}
288EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); 289EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
289 290
291int ocfs2_stack_supports_plocks(void)
292{
293 return active_stack && active_stack->sp_ops->plock;
294}
295EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
296
297/*
298 * ocfs2_plock() can only be safely called if
299 * ocfs2_stack_supports_plocks() returned true
300 */
301int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
302 struct file *file, int cmd, struct file_lock *fl)
303{
304 WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
305 if (active_stack->sp_ops->plock)
306 return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
307 return -EOPNOTSUPP;
308}
309EXPORT_SYMBOL_GPL(ocfs2_plock);
310
290int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
291 const char *group, 312 const char *group,
292 int grouplen, 313 int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1be..c571af375ef8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
28#include "dlm/dlmapi.h" 28#include "dlm/dlmapi.h"
29#include <linux/dlm.h> 29#include <linux/dlm.h>
30 30
31/* Needed for plock-related prototypes */
32struct file;
33struct file_lock;
34
31/* 35/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it 36 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger 37 * some day, but right now we need it. Let's fake it. This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
187 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 191 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
188 192
189 /* 193 /*
194 * Cluster-aware posix locks
195 *
196 * This is NULL for stacks which do not support posix locks.
197 */
198 int (*plock)(struct ocfs2_cluster_connection *conn,
199 u64 ino,
200 struct file *file,
201 int cmd,
202 struct file_lock *fl);
203
204 /*
190 * This is an optoinal debugging hook. If provided, the 205 * This is an optoinal debugging hook. If provided, the
191 * stack can dump debugging information about this lock. 206 * stack can dump debugging information about this lock.
192 */ 207 */
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
240void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 255void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
241void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 256void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
242 257
258int ocfs2_stack_supports_plocks(void);
259int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
260 struct file *file, int cmd, struct file_lock *fl);
261
243void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 262void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
244 263
245 264
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb9819..c5ff18b46b57 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
62 struct ocfs2_chain_list *cl); 62 struct ocfs2_chain_list *cl);
63static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 63static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
64 struct inode *alloc_inode, 64 struct inode *alloc_inode,
65 struct buffer_head *bh); 65 struct buffer_head *bh,
66 u64 max_block);
66 67
67static int ocfs2_cluster_group_search(struct inode *inode, 68static int ocfs2_cluster_group_search(struct inode *inode,
68 struct buffer_head *group_bh, 69 struct buffer_head *group_bh,
69 u32 bits_wanted, u32 min_bits, 70 u32 bits_wanted, u32 min_bits,
71 u64 max_block,
70 u16 *bit_off, u16 *bits_found); 72 u16 *bit_off, u16 *bits_found);
71static int ocfs2_block_group_search(struct inode *inode, 73static int ocfs2_block_group_search(struct inode *inode,
72 struct buffer_head *group_bh, 74 struct buffer_head *group_bh,
73 u32 bits_wanted, u32 min_bits, 75 u32 bits_wanted, u32 min_bits,
76 u64 max_block,
74 u16 *bit_off, u16 *bits_found); 77 u16 *bit_off, u16 *bits_found);
75static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 78static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
76 struct ocfs2_alloc_context *ac, 79 struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110 u64 data_blkno, 113 u64 data_blkno,
111 u64 *bg_blkno, 114 u64 *bg_blkno,
112 u16 *bg_bit_off); 115 u16 *bg_bit_off);
116static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
117 u32 bits_wanted, u64 max_block,
118 struct ocfs2_alloc_context **ac);
113 119
114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 120void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
115{ 121{
116 struct inode *inode = ac->ac_inode; 122 struct inode *inode = ac->ac_inode;
117 123
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
124 iput(inode); 130 iput(inode);
125 ac->ac_inode = NULL; 131 ac->ac_inode = NULL;
126 } 132 }
127 if (ac->ac_bh) { 133 brelse(ac->ac_bh);
128 brelse(ac->ac_bh); 134 ac->ac_bh = NULL;
129 ac->ac_bh = NULL;
130 }
131} 135}
132 136
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 137void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
276 */ 280 */
277static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 281static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
278 struct inode *alloc_inode, 282 struct inode *alloc_inode,
279 struct buffer_head *bh) 283 struct buffer_head *bh,
284 u64 max_block)
280{ 285{
281 int status, credits; 286 int status, credits;
282 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 287 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
294 mlog_entry_void(); 299 mlog_entry_void();
295 300
296 cl = &fe->id2.i_chain; 301 cl = &fe->id2.i_chain;
297 status = ocfs2_reserve_clusters(osb, 302 status = ocfs2_reserve_clusters_with_limit(osb,
298 le16_to_cpu(cl->cl_cpg), 303 le16_to_cpu(cl->cl_cpg),
299 &ac); 304 max_block, &ac);
300 if (status < 0) { 305 if (status < 0) {
301 if (status != -ENOSPC) 306 if (status != -ENOSPC)
302 mlog_errno(status); 307 mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
394 if (ac) 399 if (ac)
395 ocfs2_free_alloc_context(ac); 400 ocfs2_free_alloc_context(ac);
396 401
397 if (bg_bh) 402 brelse(bg_bh);
398 brelse(bg_bh);
399 403
400 mlog_exit(status); 404 mlog_exit(status);
401 return status; 405 return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
469 goto bail; 473 goto bail;
470 } 474 }
471 475
472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 476 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
477 ac->ac_max_block);
473 if (status < 0) { 478 if (status < 0) {
474 if (status != -ENOSPC) 479 if (status != -ENOSPC)
475 mlog_errno(status); 480 mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
486 get_bh(bh); 491 get_bh(bh);
487 ac->ac_bh = bh; 492 ac->ac_bh = bh;
488bail: 493bail:
489 if (bh) 494 brelse(bh);
490 brelse(bh);
491 495
492 mlog_exit(status); 496 mlog_exit(status);
493 return status; 497 return status;
494} 498}
495 499
496int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 500int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
497 struct ocfs2_dinode *fe, 501 int blocks,
498 struct ocfs2_alloc_context **ac) 502 struct ocfs2_alloc_context **ac)
499{ 503{
500 int status; 504 int status;
501 u32 slot; 505 u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
507 goto bail; 511 goto bail;
508 } 512 }
509 513
510 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 514 (*ac)->ac_bits_wanted = blocks;
511 (*ac)->ac_which = OCFS2_AC_USE_META; 515 (*ac)->ac_which = OCFS2_AC_USE_META;
512 slot = osb->slot_num; 516 slot = osb->slot_num;
513 (*ac)->ac_group_search = ocfs2_block_group_search; 517 (*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
532 return status; 536 return status;
533} 537}
534 538
539int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
540 struct ocfs2_extent_list *root_el,
541 struct ocfs2_alloc_context **ac)
542{
543 return ocfs2_reserve_new_metadata_blocks(osb,
544 ocfs2_extend_meta_needed(root_el),
545 ac);
546}
547
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, 548static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac) 549 struct ocfs2_alloc_context *ac)
537{ 550{
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
582 (*ac)->ac_group_search = ocfs2_block_group_search; 595 (*ac)->ac_group_search = ocfs2_block_group_search;
583 596
584 /* 597 /*
598 * stat(2) can't handle i_ino > 32bits, so we tell the
599 * lower levels not to allocate us a block group past that
600 * limit. The 'inode64' mount option avoids this behavior.
601 */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
603 (*ac)->ac_max_block = (u32)~0U;
604
605 /*
585 * slot is set when we successfully steal inode from other nodes. 606 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places: 607 * It is reset in 3 places:
587 * 1. when we flush the truncate log 608 * 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
661/* Callers don't need to care which bitmap (local alloc or main) to 682/* Callers don't need to care which bitmap (local alloc or main) to
662 * use so we figure it out for them, but unfortunately this clutters 683 * use so we figure it out for them, but unfortunately this clutters
663 * things a bit. */ 684 * things a bit. */
664int ocfs2_reserve_clusters(struct ocfs2_super *osb, 685static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
665 u32 bits_wanted, 686 u32 bits_wanted, u64 max_block,
666 struct ocfs2_alloc_context **ac) 687 struct ocfs2_alloc_context **ac)
667{ 688{
668 int status; 689 int status;
669 690
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
677 } 698 }
678 699
679 (*ac)->ac_bits_wanted = bits_wanted; 700 (*ac)->ac_bits_wanted = bits_wanted;
701 (*ac)->ac_max_block = max_block;
680 702
681 status = -ENOSPC; 703 status = -ENOSPC;
682 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 704 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
683 status = ocfs2_reserve_local_alloc_bits(osb, 705 status = ocfs2_reserve_local_alloc_bits(osb,
684 bits_wanted, 706 bits_wanted,
685 *ac); 707 *ac);
686 if ((status < 0) && (status != -ENOSPC)) { 708 if (status == -EFBIG) {
709 /* The local alloc window is outside ac_max_block.
710 * use the main bitmap. */
711 status = -ENOSPC;
712 } else if ((status < 0) && (status != -ENOSPC)) {
687 mlog_errno(status); 713 mlog_errno(status);
688 goto bail; 714 goto bail;
689 } else if (status == -ENOSPC) {
690 /* reserve_local_bits will return enospc with
691 * the local alloc inode still locked, so we
692 * can change this safely here. */
693 mlog(0, "Disabling local alloc\n");
694 /* We set to OCFS2_LA_DISABLED so that umount
695 * can clean up what's left of the local
696 * allocation */
697 osb->local_alloc_state = OCFS2_LA_DISABLED;
698 } 715 }
699 } 716 }
700 717
@@ -718,6 +735,13 @@ bail:
718 return status; 735 return status;
719} 736}
720 737
738int ocfs2_reserve_clusters(struct ocfs2_super *osb,
739 u32 bits_wanted,
740 struct ocfs2_alloc_context **ac)
741{
742 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
743}
744
721/* 745/*
722 * More or less lifted from ext3. I'll leave their description below: 746 * More or less lifted from ext3. I'll leave their description below:
723 * 747 *
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
1000static int ocfs2_cluster_group_search(struct inode *inode, 1024static int ocfs2_cluster_group_search(struct inode *inode,
1001 struct buffer_head *group_bh, 1025 struct buffer_head *group_bh,
1002 u32 bits_wanted, u32 min_bits, 1026 u32 bits_wanted, u32 min_bits,
1027 u64 max_block,
1003 u16 *bit_off, u16 *bits_found) 1028 u16 *bit_off, u16 *bits_found)
1004{ 1029{
1005 int search = -ENOSPC; 1030 int search = -ENOSPC;
1006 int ret; 1031 int ret;
1032 u64 blkoff;
1007 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1033 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1008 u16 tmp_off, tmp_found; 1035 u16 tmp_off, tmp_found;
1009 unsigned int max_bits, gd_cluster_off; 1036 unsigned int max_bits, gd_cluster_off;
1010 1037
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1037 if (ret) 1064 if (ret)
1038 return ret; 1065 return ret;
1039 1066
1067 if (max_block) {
1068 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1069 gd_cluster_off +
1070 tmp_off + tmp_found);
1071 mlog(0, "Checking %llu against %llu\n",
1072 (unsigned long long)blkoff,
1073 (unsigned long long)max_block);
1074 if (blkoff > max_block)
1075 return -ENOSPC;
1076 }
1077
1040 /* ocfs2_block_group_find_clear_bits() might 1078 /* ocfs2_block_group_find_clear_bits() might
1041 * return success, but we still want to return 1079 * return success, but we still want to return
1042 * -ENOSPC unless it found the minimum number 1080 * -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1045 *bit_off = tmp_off; 1083 *bit_off = tmp_off;
1046 *bits_found = tmp_found; 1084 *bits_found = tmp_found;
1047 search = 0; /* success */ 1085 search = 0; /* success */
1086 } else if (tmp_found) {
1087 /*
1088 * Don't show bits which we'll be returning
1089 * for allocation to the local alloc bitmap.
1090 */
1091 ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1048 } 1092 }
1049 } 1093 }
1050 1094
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1054static int ocfs2_block_group_search(struct inode *inode, 1098static int ocfs2_block_group_search(struct inode *inode,
1055 struct buffer_head *group_bh, 1099 struct buffer_head *group_bh,
1056 u32 bits_wanted, u32 min_bits, 1100 u32 bits_wanted, u32 min_bits,
1101 u64 max_block,
1057 u16 *bit_off, u16 *bits_found) 1102 u16 *bit_off, u16 *bits_found)
1058{ 1103{
1059 int ret = -ENOSPC; 1104 int ret = -ENOSPC;
1105 u64 blkoff;
1060 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1106 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1061 1107
1062 BUG_ON(min_bits != 1); 1108 BUG_ON(min_bits != 1);
1063 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1109 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1064 1110
1065 if (bg->bg_free_bits_count) 1111 if (bg->bg_free_bits_count) {
1066 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1112 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1067 group_bh, bits_wanted, 1113 group_bh, bits_wanted,
1068 le16_to_cpu(bg->bg_bits), 1114 le16_to_cpu(bg->bg_bits),
1069 bit_off, bits_found); 1115 bit_off, bits_found);
1116 if (!ret && max_block) {
1117 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1118 *bits_found;
1119 mlog(0, "Checking %llu against %llu\n",
1120 (unsigned long long)blkoff,
1121 (unsigned long long)max_block);
1122 if (blkoff > max_block)
1123 ret = -ENOSPC;
1124 }
1125 }
1070 1126
1071 return ret; 1127 return ret;
1072} 1128}
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1116 struct ocfs2_group_desc *gd; 1172 struct ocfs2_group_desc *gd;
1117 struct inode *alloc_inode = ac->ac_inode; 1173 struct inode *alloc_inode = ac->ac_inode;
1118 1174
1119 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, 1175 ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
1120 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1121 if (ret < 0) { 1176 if (ret < 0) {
1122 mlog_errno(ret); 1177 mlog_errno(ret);
1123 return ret; 1178 return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1131 } 1186 }
1132 1187
1133 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1188 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1134 bit_off, &found); 1189 ac->ac_max_block, bit_off, &found);
1135 if (ret < 0) { 1190 if (ret < 0) {
1136 if (ret != -ENOSPC) 1191 if (ret != -ENOSPC)
1137 mlog_errno(ret); 1192 mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1186 bits_wanted, chain, 1241 bits_wanted, chain,
1187 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1242 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1188 1243
1189 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), 1244 status = ocfs2_read_block(alloc_inode,
1190 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1245 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1191 &group_bh, OCFS2_BH_CACHED, alloc_inode); 1246 &group_bh);
1192 if (status < 0) { 1247 if (status < 0) {
1193 mlog_errno(status); 1248 mlog_errno(status);
1194 goto bail; 1249 goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1204 /* for now, the chain search is a bit simplistic. We just use 1259 /* for now, the chain search is a bit simplistic. We just use
1205 * the 1st group with any empty bits. */ 1260 * the 1st group with any empty bits. */
1206 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1261 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1207 bits_wanted, min_bits, bit_off, 1262 bits_wanted, min_bits,
1263 ac->ac_max_block, bit_off,
1208 &tmp_bits)) == -ENOSPC) { 1264 &tmp_bits)) == -ENOSPC) {
1209 if (!bg->bg_next_group) 1265 if (!bg->bg_next_group)
1210 break; 1266 break;
1211 1267
1212 if (prev_group_bh) { 1268 brelse(prev_group_bh);
1213 brelse(prev_group_bh); 1269 prev_group_bh = NULL;
1214 prev_group_bh = NULL; 1270
1215 }
1216 next_group = le64_to_cpu(bg->bg_next_group); 1271 next_group = le64_to_cpu(bg->bg_next_group);
1217 prev_group_bh = group_bh; 1272 prev_group_bh = group_bh;
1218 group_bh = NULL; 1273 group_bh = NULL;
1219 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), 1274 status = ocfs2_read_block(alloc_inode,
1220 next_group, &group_bh, 1275 next_group, &group_bh);
1221 OCFS2_BH_CACHED, alloc_inode);
1222 if (status < 0) { 1276 if (status < 0) {
1223 mlog_errno(status); 1277 mlog_errno(status);
1224 goto bail; 1278 goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1307 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1361 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1308 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1362 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1309bail: 1363bail:
1310 if (group_bh) 1364 brelse(group_bh);
1311 brelse(group_bh); 1365 brelse(prev_group_bh);
1312 if (prev_group_bh)
1313 brelse(prev_group_bh);
1314 1366
1315 mlog_exit(status); 1367 mlog_exit(status);
1316 return status; 1368 return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1723{ 1775{
1724 int status = 0; 1776 int status = 0;
1725 u32 tmp_used; 1777 u32 tmp_used;
1726 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1727 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 1778 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1728 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 1779 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1729 struct buffer_head *group_bh = NULL; 1780 struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1742 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1793 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1743 (unsigned long long)bg_blkno, start_bit); 1794 (unsigned long long)bg_blkno, start_bit);
1744 1795
1745 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, 1796 status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
1746 alloc_inode);
1747 if (status < 0) { 1797 if (status < 0) {
1748 mlog_errno(status); 1798 mlog_errno(status);
1749 goto bail; 1799 goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1784 } 1834 }
1785 1835
1786bail: 1836bail:
1787 if (group_bh) 1837 brelse(group_bh);
1788 brelse(group_bh);
1789 1838
1790 mlog_exit(status); 1839 mlog_exit(status);
1791 return status; 1840 return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
1838 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 1887 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1839 bg_start_bit, bg_blkno, 1888 bg_start_bit, bg_blkno,
1840 num_clusters); 1889 num_clusters);
1841 if (status < 0) 1890 if (status < 0) {
1842 mlog_errno(status); 1891 mlog_errno(status);
1892 goto out;
1893 }
1843 1894
1895 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1896 num_clusters);
1897
1898out:
1844 mlog_exit(status); 1899 mlog_exit(status);
1845 return status; 1900 return status;
1846} 1901}
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1891 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 1946 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1892 } 1947 }
1893} 1948}
1949
1950/*
1951 * For a given allocation, determine which allocators will need to be
1952 * accessed, and lock them, reserving the appropriate number of bits.
1953 *
1954 * Sparse file systems call this from ocfs2_write_begin_nolock()
1955 * and ocfs2_allocate_unwritten_extents().
1956 *
1957 * File systems which don't support holes call this from
1958 * ocfs2_extend_allocation().
1959 */
1960int ocfs2_lock_allocators(struct inode *inode,
1961 struct ocfs2_extent_tree *et,
1962 u32 clusters_to_add, u32 extents_to_split,
1963 struct ocfs2_alloc_context **data_ac,
1964 struct ocfs2_alloc_context **meta_ac)
1965{
1966 int ret = 0, num_free_extents;
1967 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
1968 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1969
1970 *meta_ac = NULL;
1971 if (data_ac)
1972 *data_ac = NULL;
1973
1974 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
1975
1976 num_free_extents = ocfs2_num_free_extents(osb, inode, et);
1977 if (num_free_extents < 0) {
1978 ret = num_free_extents;
1979 mlog_errno(ret);
1980 goto out;
1981 }
1982
1983 /*
1984 * Sparse allocation file systems need to be more conservative
1985 * with reserving room for expansion - the actual allocation
1986 * happens while we've got a journal handle open so re-taking
1987 * a cluster lock (because we ran out of room for another
1988 * extent) will violate ordering rules.
1989 *
1990 * Most of the time we'll only be seeing this 1 cluster at a time
1991 * anyway.
1992 *
1993 * Always lock for any unwritten extents - we might want to
1994 * add blocks during a split.
1995 */
1996 if (!num_free_extents ||
1997 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
1998 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
1999 if (ret < 0) {
2000 if (ret != -ENOSPC)
2001 mlog_errno(ret);
2002 goto out;
2003 }
2004 }
2005
2006 if (clusters_to_add == 0)
2007 goto out;
2008
2009 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2010 if (ret < 0) {
2011 if (ret != -ENOSPC)
2012 mlog_errno(ret);
2013 goto out;
2014 }
2015
2016out:
2017 if (ret) {
2018 if (*meta_ac) {
2019 ocfs2_free_alloc_context(*meta_ac);
2020 *meta_ac = NULL;
2021 }
2022
2023 /*
2024 * We cannot have an error and a non null *data_ac.
2025 */
2026 }
2027
2028 return ret;
2029}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662bd..4df159d8f450 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
28 28
29typedef int (group_search_t)(struct inode *, 29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 30 struct buffer_head *,
31 u32, 31 u32, /* bits_wanted */
32 u32, 32 u32, /* min_bits */
33 u16 *, 33 u64, /* max_block */
34 u16 *); 34 u16 *, /* *bit_off */
35 u16 *); /* *bits_found */
35 36
36struct ocfs2_alloc_context { 37struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 38 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
51 group_search_t *ac_group_search; 52 group_search_t *ac_group_search;
52 53
53 u64 ac_last_group; 54 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */
54}; 57};
55 58
56void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
59 return ac->ac_bits_wanted - ac->ac_bits_given; 62 return ac->ac_bits_wanted - ac->ac_bits_given;
60} 63}
61 64
65/*
66 * Please note that the caller must make sure that root_el is the root
67 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
68 * the result may be wrong.
69 */
62int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 70int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
63 struct ocfs2_dinode *fe, 71 struct ocfs2_extent_list *root_el,
64 struct ocfs2_alloc_context **ac); 72 struct ocfs2_alloc_context **ac);
73int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
74 int blocks,
75 struct ocfs2_alloc_context **ac);
65int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 76int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
66 struct ocfs2_alloc_context **ac); 77 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb, 78int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147 * apis above. */ 158 * apis above. */
148int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 159int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
149 struct ocfs2_alloc_context *ac); 160 struct ocfs2_alloc_context *ac);
161void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
150 162
151/* given a cluster offset, calculate which block group it belongs to 163/* given a cluster offset, calculate which block group it belongs to
152 * and return that block offset. */ 164 * and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
156int ocfs2_check_group_descriptor(struct super_block *sb, 168int ocfs2_check_group_descriptor(struct super_block *sb,
157 struct ocfs2_dinode *di, 169 struct ocfs2_dinode *di,
158 struct ocfs2_group_desc *gd); 170 struct ocfs2_group_desc *gd);
171int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
172 u32 clusters_to_add, u32 extents_to_split,
173 struct ocfs2_alloc_context **data_ac,
174 struct ocfs2_alloc_context **meta_ac);
159#endif /* _CHAINALLOC_H_ */ 175#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 88255d3f52b4..304b63ac78cf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
64#include "sysfile.h" 64#include "sysfile.h"
65#include "uptodate.h" 65#include "uptodate.h"
66#include "ver.h" 66#include "ver.h"
67#include "xattr.h"
67 68
68#include "buffer_head_io.h" 69#include "buffer_head_io.h"
69 70
@@ -154,10 +155,13 @@ enum {
154 Opt_localalloc, 155 Opt_localalloc,
155 Opt_localflocks, 156 Opt_localflocks,
156 Opt_stack, 157 Opt_stack,
158 Opt_user_xattr,
159 Opt_nouser_xattr,
160 Opt_inode64,
157 Opt_err, 161 Opt_err,
158}; 162};
159 163
160static match_table_t tokens = { 164static const match_table_t tokens = {
161 {Opt_barrier, "barrier=%u"}, 165 {Opt_barrier, "barrier=%u"},
162 {Opt_err_panic, "errors=panic"}, 166 {Opt_err_panic, "errors=panic"},
163 {Opt_err_ro, "errors=remount-ro"}, 167 {Opt_err_ro, "errors=remount-ro"},
@@ -173,6 +177,9 @@ static match_table_t tokens = {
173 {Opt_localalloc, "localalloc=%d"}, 177 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 178 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"}, 179 {Opt_stack, "cluster_stack=%s"},
180 {Opt_user_xattr, "user_xattr"},
181 {Opt_nouser_xattr, "nouser_xattr"},
182 {Opt_inode64, "inode64"},
176 {Opt_err, NULL} 183 {Opt_err, NULL}
177}; 184};
178 185
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
205 ocfs2_schedule_truncate_log_flush(osb, 0); 212 ocfs2_schedule_truncate_log_flush(osb, 0);
206 } 213 }
207 214
208 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { 215 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
216 &target)) {
209 if (wait) 217 if (wait)
210 log_wait_commit(OCFS2_SB(sb)->journal->j_journal, 218 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
211 target); 219 target);
212 } 220 }
213 return 0; 221 return 0;
214} 222}
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
325 if (!oi) 333 if (!oi)
326 return NULL; 334 return NULL;
327 335
336 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
328 return &oi->vfs_inode; 337 return &oi->vfs_inode;
329} 338}
330 339
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
406 goto out; 415 goto out;
407 } 416 }
408 417
418 /* Probably don't want this on remount; it might
419 * mess with other nodes */
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
421 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
422 ret = -EINVAL;
423 mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
424 goto out;
425 }
426
409 /* We're going to/from readonly mode. */ 427 /* We're going to/from readonly mode. */
410 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 428 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
411 /* Lock here so the check of HARD_RO and the potential 429 /* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
637 osb->s_atime_quantum = parsed_options.atime_quantum; 655 osb->s_atime_quantum = parsed_options.atime_quantum;
638 osb->preferred_slot = parsed_options.slot; 656 osb->preferred_slot = parsed_options.slot;
639 osb->osb_commit_interval = parsed_options.commit_interval; 657 osb->osb_commit_interval = parsed_options.commit_interval;
640 osb->local_alloc_size = parsed_options.localalloc_opt; 658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits;
641 660
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 661 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status) 662 if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
743 return status; 762 return status;
744 763
745read_super_error: 764read_super_error:
746 if (bh != NULL) 765 brelse(bh);
747 brelse(bh);
748 766
749 if (inode) 767 if (inode)
750 iput(inode); 768 iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
847 case Opt_data_writeback: 865 case Opt_data_writeback:
848 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 866 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
849 break; 867 break;
868 case Opt_user_xattr:
869 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
870 break;
871 case Opt_nouser_xattr:
872 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
873 break;
850 case Opt_atime_quantum: 874 case Opt_atime_quantum:
851 if (match_int(&args[0], &option)) { 875 if (match_int(&args[0], &option)) {
852 status = 0; 876 status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
873 if (option < 0) 897 if (option < 0)
874 return 0; 898 return 0;
875 if (option == 0) 899 if (option == 0)
876 option = JBD_DEFAULT_MAX_COMMIT_AGE; 900 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
877 mopt->commit_interval = HZ * option; 901 mopt->commit_interval = HZ * option;
878 break; 902 break;
879 case Opt_localalloc: 903 case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
918 OCFS2_STACK_LABEL_LEN); 942 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 943 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break; 944 break;
945 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break;
921 default: 948 default:
922 mlog(ML_ERROR, 949 mlog(ML_ERROR,
923 "Unrecognized mount option \"%s\" " 950 "Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
938{ 965{
939 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); 966 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
940 unsigned long opts = osb->s_mount_opt; 967 unsigned long opts = osb->s_mount_opt;
968 unsigned int local_alloc_megs;
941 969
942 if (opts & OCFS2_MOUNT_HB_LOCAL) 970 if (opts & OCFS2_MOUNT_HB_LOCAL)
943 seq_printf(s, ",_netdev,heartbeat=local"); 971 seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
970 seq_printf(s, ",commit=%u", 998 seq_printf(s, ",commit=%u",
971 (unsigned) (osb->osb_commit_interval / HZ)); 999 (unsigned) (osb->osb_commit_interval / HZ));
972 1000
973 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) 1001 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
974 seq_printf(s, ",localalloc=%d", osb->local_alloc_size); 1002 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
1003 seq_printf(s, ",localalloc=%d", local_alloc_megs);
975 1004
976 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1005 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
977 seq_printf(s, ",localflocks,"); 1006 seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack); 1010 osb->osb_cluster_stack);
982 1011
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr");
1014 else
1015 seq_printf(s, ",user_xattr");
1016
1017 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64");
1019
983 return 0; 1020 return 0;
984} 1021}
985 1022
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
1132 oi->ip_dir_start_lookup = 0; 1169 oi->ip_dir_start_lookup = 0;
1133 1170
1134 init_rwsem(&oi->ip_alloc_sem); 1171 init_rwsem(&oi->ip_alloc_sem);
1172 init_rwsem(&oi->ip_xattr_sem);
1135 mutex_init(&oi->ip_io_mutex); 1173 mutex_init(&oi->ip_io_mutex);
1136 1174
1137 oi->ip_blkno = 0ULL; 1175 oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1375 sb->s_fs_info = osb; 1413 sb->s_fs_info = osb;
1376 sb->s_op = &ocfs2_sops; 1414 sb->s_op = &ocfs2_sops;
1377 sb->s_export_op = &ocfs2_export_ops; 1415 sb->s_export_op = &ocfs2_export_ops;
1416 sb->s_xattr = ocfs2_xattr_handlers;
1378 sb->s_time_gran = 1; 1417 sb->s_time_gran = 1;
1379 sb->s_flags |= MS_NOATIME; 1418 sb->s_flags |= MS_NOATIME;
1380 /* this is needed to support O_LARGEFILE */ 1419 /* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1421 1460
1422 osb->slot_num = OCFS2_INVALID_SLOT; 1461 osb->slot_num = OCFS2_INVALID_SLOT;
1423 1462
1463 osb->s_xattr_inline_size = le16_to_cpu(
1464 di->id2.i_super.s_xattr_inline_size);
1465
1424 osb->local_alloc_state = OCFS2_LA_UNUSED; 1466 osb->local_alloc_state = OCFS2_LA_UNUSED;
1425 osb->local_alloc_bh = NULL; 1467 osb->local_alloc_bh = NULL;
1468 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
1426 1469
1427 init_waitqueue_head(&osb->osb_mount_event); 1470 init_waitqueue_head(&osb->osb_mount_event);
1428 1471
@@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1568 osb->first_cluster_group_blkno = 1611 osb->first_cluster_group_blkno =
1569 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 1612 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1570 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 1613 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1614 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1571 mlog(0, "vol_label: %s\n", osb->vol_label); 1615 mlog(0, "vol_label: %s\n", osb->vol_label);
1572 mlog(0, "uuid: %s\n", osb->uuid_str); 1616 mlog(0, "uuid: %s\n", osb->uuid_str);
1573 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 1617 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25b..cbd03dfdc7b9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
50#include "inode.h" 50#include "inode.h"
51#include "journal.h" 51#include "journal.h"
52#include "symlink.h" 52#include "symlink.h"
53#include "xattr.h"
53 54
54#include "buffer_head_io.h" 55#include "buffer_head_io.h"
55 56
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
83 84
84 mlog_entry_void(); 85 mlog_entry_void();
85 86
86 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
87 OCFS2_I(inode)->ip_blkno,
88 bh,
89 OCFS2_BH_CACHED,
90 inode);
91 if (status < 0) { 88 if (status < 0) {
92 mlog_errno(status); 89 mlog_errno(status);
93 link = ERR_PTR(status); 90 link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
157 kunmap(page); 154 kunmap(page);
158 page_cache_release(page); 155 page_cache_release(page);
159 } 156 }
160 if (bh) 157 brelse(bh);
161 brelse(bh);
162 158
163 return ERR_PTR(status); 159 return ERR_PTR(status);
164} 160}
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
168 .follow_link = ocfs2_follow_link, 164 .follow_link = ocfs2_follow_link,
169 .getattr = ocfs2_getattr, 165 .getattr = ocfs2_getattr,
170 .setattr = ocfs2_setattr, 166 .setattr = ocfs2_setattr,
167 .setxattr = generic_setxattr,
168 .getxattr = generic_getxattr,
169 .listxattr = ocfs2_listxattr,
170 .removexattr = generic_removexattr,
171}; 171};
172const struct inode_operations ocfs2_fast_symlink_inode_operations = { 172const struct inode_operations ocfs2_fast_symlink_inode_operations = {
173 .readlink = ocfs2_readlink, 173 .readlink = ocfs2_readlink,
174 .follow_link = ocfs2_follow_link, 174 .follow_link = ocfs2_follow_link,
175 .getattr = ocfs2_getattr, 175 .getattr = ocfs2_getattr,
176 .setattr = ocfs2_setattr, 176 .setattr = ocfs2_setattr,
177 .setxattr = generic_setxattr,
178 .getxattr = generic_getxattr,
179 .listxattr = ocfs2_listxattr,
180 .removexattr = generic_removexattr,
177}; 181};
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b23..187b99ff0368 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56#include <linux/jbd.h> 56#ifndef CONFIG_OCFS2_COMPAT_JBD
57# include <linux/jbd2.h>
58#else
59# include <linux/jbd.h>
60#endif
57 61
58#define MLOG_MASK_PREFIX ML_UPTODATE 62#define MLOG_MASK_PREFIX ML_UPTODATE
59 63
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
511 ci->ci_num_cached--; 515 ci->ci_num_cached--;
512} 516}
513 517
514/* Called when we remove a chunk of metadata from an inode. We don't 518static void ocfs2_remove_block_from_cache(struct inode *inode,
515 * bother reverting things to an inlined array in the case of a remove 519 sector_t block)
516 * which moves us back under the limit. */
517void ocfs2_remove_from_cache(struct inode *inode,
518 struct buffer_head *bh)
519{ 520{
520 int index; 521 int index;
521 sector_t block = bh->b_blocknr;
522 struct ocfs2_meta_cache_item *item = NULL; 522 struct ocfs2_meta_cache_item *item = NULL;
523 struct ocfs2_inode_info *oi = OCFS2_I(inode); 523 struct ocfs2_inode_info *oi = OCFS2_I(inode);
524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
544 kmem_cache_free(ocfs2_uptodate_cachep, item); 544 kmem_cache_free(ocfs2_uptodate_cachep, item);
545} 545}
546 546
547/*
548 * Called when we remove a chunk of metadata from an inode. We don't
549 * bother reverting things to an inlined array in the case of a remove
550 * which moves us back under the limit.
551 */
552void ocfs2_remove_from_cache(struct inode *inode,
553 struct buffer_head *bh)
554{
555 sector_t block = bh->b_blocknr;
556
557 ocfs2_remove_block_from_cache(inode, block);
558}
559
560/* Called when we remove xattr clusters from an inode. */
561void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
562 sector_t block,
563 u32 c_len)
564{
565 unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
566
567 for (i = 0; i < b_len; i++, block++)
568 ocfs2_remove_block_from_cache(inode, block);
569}
570
547int __init init_ocfs2_uptodate_cache(void) 571int __init init_ocfs2_uptodate_cache(void)
548{ 572{
549 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", 573 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a8..531b4b3a0c47 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh); 40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh); 42 struct buffer_head *bh);
43void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
44 sector_t block,
45 u32 c_len);
43int ocfs2_buffer_read_ahead(struct inode *inode, 46int ocfs2_buffer_read_ahead(struct inode *inode,
44 struct buffer_head *bh); 47 struct buffer_head *bh);
45 48
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 000000000000..802c41492214
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4832 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.c
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * CREDITS:
9 * Lots of code in this file is taken from ext3.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/capability.h>
28#include <linux/fs.h>
29#include <linux/types.h>
30#include <linux/slab.h>
31#include <linux/highmem.h>
32#include <linux/pagemap.h>
33#include <linux/uio.h>
34#include <linux/sched.h>
35#include <linux/splice.h>
36#include <linux/mount.h>
37#include <linux/writeback.h>
38#include <linux/falloc.h>
39#include <linux/sort.h>
40#include <linux/init.h>
41#include <linux/module.h>
42#include <linux/string.h>
43
44#define MLOG_MASK_PREFIX ML_XATTR
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48#include "alloc.h"
49#include "dlmglue.h"
50#include "file.h"
51#include "symlink.h"
52#include "sysfile.h"
53#include "inode.h"
54#include "journal.h"
55#include "ocfs2_fs.h"
56#include "suballoc.h"
57#include "uptodate.h"
58#include "buffer_head_io.h"
59#include "super.h"
60#include "xattr.h"
61
62
/*
 * An xattr value root immediately followed by one extent record.
 * NOTE(review): presumably er provides the storage for the single slot of
 * xv.xr_list (see def_xv, which sets l_count to 1) - confirm against the
 * on-disk layout.
 */
63struct ocfs2_xattr_def_value_root {
64 struct ocfs2_xattr_value_root xv;
65 struct ocfs2_extent_rec er;
66};
67
/*
 * In-memory handle for one xattr bucket: the buffer heads of its blocks
 * (at most OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET) plus a pointer to the
 * bucket's xattr header.
 */
68struct ocfs2_xattr_bucket {
69 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
70 struct ocfs2_xattr_header *xh;
71};
72
/* Size in bytes of a value root together with its one extent record. */
73#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
/* NOTE(review): presumably the largest value size stored inline - confirm. */
74#define OCFS2_XATTR_INLINE_SIZE 80
75
/* Template value root whose extent list advertises exactly one record. */
76static struct ocfs2_xattr_def_value_root def_xv = {
77 .xv.xr_list.l_count = cpu_to_le16(1),
78};
79
/* NULL-terminated handler list; assigned to sb->s_xattr at mount time. */
80struct xattr_handler *ocfs2_xattr_handlers[] = {
81 &ocfs2_xattr_user_handler,
82 &ocfs2_xattr_trusted_handler,
83 NULL
84};
85
/*
 * Handler lookup by xattr name index.  Slots without an initializer are
 * NULL; the array is only as long as its highest designated index.
 */
86static struct xattr_handler *ocfs2_xattr_handler_map[] = {
87 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
88 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
89};
90
/*
 * Description of one extended attribute: its name index, its name, and
 * the value bytes with their length.
 */
91struct ocfs2_xattr_info {
92 int name_index;
93 const char *name;
94 const void *value;
95 size_t value_len;
96};
97
/*
 * Working state of an xattr lookup.  The lookup helpers fill in header,
 * base, end and here as they walk a storage area; here is left on the
 * matching entry when one is found.
 */
98struct ocfs2_xattr_search {
99 struct buffer_head *inode_bh;
100 /*
101 * xattr_bh points to the block buffer head which holds the extended
102 * attribute; when the attribute is in the inode, xattr_bh equals inode_bh.
103 */
104 struct buffer_head *xattr_bh;
105 struct ocfs2_xattr_header *header;
106 struct ocfs2_xattr_bucket bucket;
107 void *base;
108 void *end;
109 struct ocfs2_xattr_entry *here;
110 int not_found;
111};
112
/*
 * Forward declarations of static helpers for bucket / index-block based
 * xattr storage, needed because they are referenced before being defined.
 */
113static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
114 struct ocfs2_xattr_header *xh,
115 int index,
116 int *block_off,
117 int *new_offset);
118
119static int ocfs2_xattr_index_block_find(struct inode *inode,
120 struct buffer_head *root_bh,
121 int name_index,
122 const char *name,
123 struct ocfs2_xattr_search *xs);
124
125static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
126 struct ocfs2_xattr_tree_root *xt,
127 char *buffer,
128 size_t buffer_size);
129
130static int ocfs2_xattr_create_index_block(struct inode *inode,
131 struct ocfs2_xattr_search *xs);
132
133static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
134 struct ocfs2_xattr_info *xi,
135 struct ocfs2_xattr_search *xs);
136
137static int ocfs2_delete_xattr_index_block(struct inode *inode,
138 struct buffer_head *xb_bh);
139
140static inline const char *ocfs2_xattr_prefix(int name_index)
141{
142 struct xattr_handler *handler = NULL;
143
144 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
145 handler = ocfs2_xattr_handler_map[name_index];
146
147 return handler ? handler->prefix : NULL;
148}
149
150static u32 ocfs2_xattr_name_hash(struct inode *inode,
151 const char *name,
152 int name_len)
153{
154 /* Get hash value of uuid from super block */
155 u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
156 int i;
157
158 /* hash extended attribute name */
159 for (i = 0; i < name_len; i++) {
160 hash = (hash << OCFS2_HASH_SHIFT) ^
161 (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
162 *name++;
163 }
164
165 return hash;
166}
167
168/*
169 * ocfs2_xattr_hash_entry()
170 *
171 * Compute the hash of an extended attribute.
172 */
173static void ocfs2_xattr_hash_entry(struct inode *inode,
174 struct ocfs2_xattr_header *header,
175 struct ocfs2_xattr_entry *entry)
176{
177 u32 hash = 0;
178 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
179
180 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
181 entry->xe_name_hash = cpu_to_le32(hash);
182
183 return;
184}
185
/*
 * Add clusters_to_add clusters to the extent tree rooted at xv, which
 * lives in the block held by xattr_bh.
 *
 * Each pass locks the allocators, starts a transaction sized by
 * ocfs2_calc_extend_credits(), gets journal write access to xattr_bh and
 * calls ocfs2_add_clusters_in_btree().  Depending on the restart reason
 * reported through 'why', the transaction is extended and the add retried
 * (RESTART_TRANS) or the whole function is re-run from restart_all
 * (RESTART_META).
 *
 * Returns 0 on success or a negative status.
 */
186static int ocfs2_xattr_extend_allocation(struct inode *inode,
187 u32 clusters_to_add,
188 struct buffer_head *xattr_bh,
189 struct ocfs2_xattr_value_root *xv)
190{
191 int status = 0;
192 int restart_func = 0;
193 int credits = 0;
194 handle_t *handle = NULL;
195 struct ocfs2_alloc_context *data_ac = NULL;
196 struct ocfs2_alloc_context *meta_ac = NULL;
/* NOTE(review): not initialized here; presumably always written by
 * ocfs2_add_clusters_in_btree() before it is read below - confirm. */
197 enum ocfs2_alloc_restarted why;
198 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
/* New clusters are appended starting at the current cluster count. */
199 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
200 struct ocfs2_extent_tree et;
201
202 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
203
204 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
205
206restart_all:
207
208 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
209 &data_ac, &meta_ac);
210 if (status) {
211 mlog_errno(status);
212 goto leave;
213 }
214
215 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
216 clusters_to_add);
217 handle = ocfs2_start_trans(osb, credits);
218 if (IS_ERR(handle)) {
219 status = PTR_ERR(handle);
/* Clear the error pointer so the leave path does not try to commit it. */
220 handle = NULL;
221 mlog_errno(status);
222 goto leave;
223 }
224
225restarted_transaction:
226 status = ocfs2_journal_access(handle, inode, xattr_bh,
227 OCFS2_JOURNAL_ACCESS_WRITE);
228 if (status < 0) {
229 mlog_errno(status);
230 goto leave;
231 }
232
/* Remember the count so the number of clusters actually added can be
 * derived after the call. */
233 prev_clusters = le32_to_cpu(xv->xr_clusters);
234 status = ocfs2_add_clusters_in_btree(osb,
235 inode,
236 &logical_start,
237 clusters_to_add,
238 0,
239 &et,
240 handle,
241 data_ac,
242 meta_ac,
243 &why);
/* -EAGAIN is not treated as a failure here; -ENOSPC fails quietly. */
244 if ((status < 0) && (status != -EAGAIN)) {
245 if (status != -ENOSPC)
246 mlog_errno(status);
247 goto leave;
248 }
249
250 status = ocfs2_journal_dirty(handle, xattr_bh);
251 if (status < 0) {
252 mlog_errno(status);
253 goto leave;
254 }
255
256 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
257
258 if (why != RESTART_NONE && clusters_to_add) {
259 if (why == RESTART_META) {
260 mlog(0, "restarting function.\n");
261 restart_func = 1;
262 } else {
263 BUG_ON(why != RESTART_TRANS);
264
265 mlog(0, "restarting transaction.\n");
266 /* TODO: This can be more intelligent. */
267 credits = ocfs2_calc_extend_credits(osb->sb,
268 et.et_root_el,
269 clusters_to_add);
270 status = ocfs2_extend_trans(handle, credits);
271 if (status < 0) {
272 /* handle still has to be committed at
273 * this point. */
/* NOTE(review): the status returned by ocfs2_extend_trans() is
 * replaced with -ENOMEM, hiding the original error code. */
274 status = -ENOMEM;
275 mlog_errno(status);
276 goto leave;
277 }
278 goto restarted_transaction;
279 }
280 }
281
/* Common exit: commit any open transaction and release both allocator
 * contexts; a pending RESTART_META re-runs everything on success. */
282leave:
283 if (handle) {
284 ocfs2_commit_trans(osb, handle);
285 handle = NULL;
286 }
287 if (data_ac) {
288 ocfs2_free_alloc_context(data_ac);
289 data_ac = NULL;
290 }
291 if (meta_ac) {
292 ocfs2_free_alloc_context(meta_ac);
293 meta_ac = NULL;
294 }
295 if ((!status) && restart_func) {
296 restart_func = 0;
297 goto restart_all;
298 }
299
300 return status;
301}
302
/*
 * Remove len clusters starting at logical offset cpos from the extent
 * tree rooted at xv (stored in root_bh), then record the physical range
 * that starts at phys_cpos in the truncate log.
 *
 * Everything after the allocator lock runs with the truncate log inode's
 * i_mutex held; the tree update, the xr_clusters adjustment and the
 * truncate log append share a single transaction.
 *
 * Returns 0 on success or a negative error code.
 */
303static int __ocfs2_remove_xattr_range(struct inode *inode,
304 struct buffer_head *root_bh,
305 struct ocfs2_xattr_value_root *xv,
306 u32 cpos, u32 phys_cpos, u32 len,
307 struct ocfs2_cached_dealloc_ctxt *dealloc)
308{
309 int ret;
310 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
312 struct inode *tl_inode = osb->osb_tl_inode;
313 handle_t *handle;
314 struct ocfs2_alloc_context *meta_ac = NULL;
315 struct ocfs2_extent_tree et;
316
317 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
318
/* No new data clusters are requested (0 clusters, NULL data_ac); only a
 * metadata context for one extent split. */
319 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
320 if (ret) {
321 mlog_errno(ret);
/* NOTE(review): returns without freeing meta_ac; presumably it is left
 * NULL when ocfs2_lock_allocators() fails - confirm. */
322 return ret;
323 }
324
325 mutex_lock(&tl_inode->i_mutex);
326
/* Flush the truncate log first when it reports that it needs it. */
327 if (ocfs2_truncate_log_needs_flush(osb)) {
328 ret = __ocfs2_flush_truncate_log(osb);
329 if (ret < 0) {
330 mlog_errno(ret);
331 goto out;
332 }
333 }
334
335 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
336 if (IS_ERR(handle)) {
337 ret = PTR_ERR(handle);
338 mlog_errno(ret);
339 goto out;
340 }
341
342 ret = ocfs2_journal_access(handle, inode, root_bh,
343 OCFS2_JOURNAL_ACCESS_WRITE);
344 if (ret) {
345 mlog_errno(ret);
346 goto out_commit;
347 }
348
349 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
350 dealloc);
351 if (ret) {
352 mlog_errno(ret);
353 goto out_commit;
354 }
355
/* Shrink the value root's cluster count by the removed length. */
356 le32_add_cpu(&xv->xr_clusters, -len);
357
358 ret = ocfs2_journal_dirty(handle, root_bh);
359 if (ret) {
360 mlog_errno(ret);
361 goto out_commit;
362 }
363
364 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
365 if (ret)
366 mlog_errno(ret);
367
368out_commit:
369 ocfs2_commit_trans(osb, handle);
370out:
371 mutex_unlock(&tl_inode->i_mutex);
372
373 if (meta_ac)
374 ocfs2_free_alloc_context(meta_ac);
375
376 return ret;
377}
378
379static int ocfs2_xattr_shrink_size(struct inode *inode,
380 u32 old_clusters,
381 u32 new_clusters,
382 struct buffer_head *root_bh,
383 struct ocfs2_xattr_value_root *xv)
384{
385 int ret = 0;
386 u32 trunc_len, cpos, phys_cpos, alloc_size;
387 u64 block;
388 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
389 struct ocfs2_cached_dealloc_ctxt dealloc;
390
391 ocfs2_init_dealloc_ctxt(&dealloc);
392
393 if (old_clusters <= new_clusters)
394 return 0;
395
396 cpos = new_clusters;
397 trunc_len = old_clusters - new_clusters;
398 while (trunc_len) {
399 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
400 &alloc_size, &xv->xr_list);
401 if (ret) {
402 mlog_errno(ret);
403 goto out;
404 }
405
406 if (alloc_size > trunc_len)
407 alloc_size = trunc_len;
408
409 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
410 phys_cpos, alloc_size,
411 &dealloc);
412 if (ret) {
413 mlog_errno(ret);
414 goto out;
415 }
416
417 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
418 ocfs2_remove_xattr_clusters_from_cache(inode, block,
419 alloc_size);
420 cpos += alloc_size;
421 trunc_len -= alloc_size;
422 }
423
424out:
425 ocfs2_schedule_truncate_log_flush(osb, 1);
426 ocfs2_run_deallocs(osb, &dealloc);
427
428 return ret;
429}
430
431static int ocfs2_xattr_value_truncate(struct inode *inode,
432 struct buffer_head *root_bh,
433 struct ocfs2_xattr_value_root *xv,
434 int len)
435{
436 int ret;
437 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
438 u32 old_clusters = le32_to_cpu(xv->xr_clusters);
439
440 if (new_clusters == old_clusters)
441 return 0;
442
443 if (new_clusters > old_clusters)
444 ret = ocfs2_xattr_extend_allocation(inode,
445 new_clusters - old_clusters,
446 root_bh, xv);
447 else
448 ret = ocfs2_xattr_shrink_size(inode,
449 old_clusters, new_clusters,
450 root_bh, xv);
451
452 return ret;
453}
454
/*
 * Append one "<prefix><name>\0" record to a listxattr buffer.
 *
 * @buffer/@size: destination and its capacity; a @size of 0 means the
 *                caller only wants to know how much space is needed.
 * @result:       running byte count; always advanced by the record
 *                length, even when nothing is copied.
 * @name/@name_len: attribute name, not NUL-terminated.
 *
 * Returns 0 on success, or -ERANGE if the record does not fit in @size.
 */
static int ocfs2_xattr_list_entry(char *buffer, size_t size,
				  size_t *result, const char *prefix,
				  const char *name, int name_len)
{
	const size_t start = *result;
	const int prefix_len = strlen(prefix);
	char *dst;

	*result = start + (prefix_len + name_len + 1);

	/* we are just looking for how big our buffer needs to be */
	if (size == 0)
		return 0;

	if (*result > size)
		return -ERANGE;

	dst = buffer + start;
	memcpy(dst, prefix, prefix_len);
	dst += prefix_len;
	memcpy(dst, name, name_len);
	dst[name_len] = '\0';

	return 0;
}
478
479static int ocfs2_xattr_list_entries(struct inode *inode,
480 struct ocfs2_xattr_header *header,
481 char *buffer, size_t buffer_size)
482{
483 size_t result = 0;
484 int i, type, ret;
485 const char *prefix, *name;
486
487 for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
488 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
489 type = ocfs2_xattr_get_type(entry);
490 prefix = ocfs2_xattr_prefix(type);
491
492 if (prefix) {
493 name = (const char *)header +
494 le16_to_cpu(entry->xe_name_offset);
495
496 ret = ocfs2_xattr_list_entry(buffer, buffer_size,
497 &result, prefix, name,
498 entry->xe_name_len);
499 if (ret)
500 return ret;
501 }
502 }
503
504 return result;
505}
506
507static int ocfs2_xattr_ibody_list(struct inode *inode,
508 struct ocfs2_dinode *di,
509 char *buffer,
510 size_t buffer_size)
511{
512 struct ocfs2_xattr_header *header = NULL;
513 struct ocfs2_inode_info *oi = OCFS2_I(inode);
514 int ret = 0;
515
516 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
517 return ret;
518
519 header = (struct ocfs2_xattr_header *)
520 ((void *)di + inode->i_sb->s_blocksize -
521 le16_to_cpu(di->i_xattr_inline_size));
522
523 ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
524
525 return ret;
526}
527
528static int ocfs2_xattr_block_list(struct inode *inode,
529 struct ocfs2_dinode *di,
530 char *buffer,
531 size_t buffer_size)
532{
533 struct buffer_head *blk_bh = NULL;
534 struct ocfs2_xattr_block *xb;
535 int ret = 0;
536
537 if (!di->i_xattr_loc)
538 return ret;
539
540 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
541 if (ret < 0) {
542 mlog_errno(ret);
543 return ret;
544 }
545 /*Verify the signature of xattr block*/
546 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
547 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
548 ret = -EFAULT;
549 goto cleanup;
550 }
551
552 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
553
554 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
555 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
556 ret = ocfs2_xattr_list_entries(inode, header,
557 buffer, buffer_size);
558 } else {
559 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
560 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
561 buffer, buffer_size);
562 }
563cleanup:
564 brelse(blk_bh);
565
566 return ret;
567}
568
569ssize_t ocfs2_listxattr(struct dentry *dentry,
570 char *buffer,
571 size_t size)
572{
573 int ret = 0, i_ret = 0, b_ret = 0;
574 struct buffer_head *di_bh = NULL;
575 struct ocfs2_dinode *di = NULL;
576 struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
577
578 if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
579 return -EOPNOTSUPP;
580
581 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
582 return ret;
583
584 ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
585 if (ret < 0) {
586 mlog_errno(ret);
587 return ret;
588 }
589
590 di = (struct ocfs2_dinode *)di_bh->b_data;
591
592 down_read(&oi->ip_xattr_sem);
593 i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
594 if (i_ret < 0)
595 b_ret = 0;
596 else {
597 if (buffer) {
598 buffer += i_ret;
599 size -= i_ret;
600 }
601 b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
602 buffer, size);
603 if (b_ret < 0)
604 i_ret = 0;
605 }
606 up_read(&oi->ip_xattr_sem);
607 ocfs2_inode_unlock(dentry->d_inode, 0);
608
609 brelse(di_bh);
610
611 return i_ret + b_ret;
612}
613
614static int ocfs2_xattr_find_entry(int name_index,
615 const char *name,
616 struct ocfs2_xattr_search *xs)
617{
618 struct ocfs2_xattr_entry *entry;
619 size_t name_len;
620 int i, cmp = 1;
621
622 if (name == NULL)
623 return -EINVAL;
624
625 name_len = strlen(name);
626 entry = xs->here;
627 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
628 cmp = name_index - ocfs2_xattr_get_type(entry);
629 if (!cmp)
630 cmp = name_len - entry->xe_name_len;
631 if (!cmp)
632 cmp = memcmp(name, (xs->base +
633 le16_to_cpu(entry->xe_name_offset)),
634 name_len);
635 if (cmp == 0)
636 break;
637 entry += 1;
638 }
639 xs->here = entry;
640
641 return cmp ? -ENODATA : 0;
642}
643
644static int ocfs2_xattr_get_value_outside(struct inode *inode,
645 struct ocfs2_xattr_value_root *xv,
646 void *buffer,
647 size_t len)
648{
649 u32 cpos, p_cluster, num_clusters, bpc, clusters;
650 u64 blkno;
651 int i, ret = 0;
652 size_t cplen, blocksize;
653 struct buffer_head *bh = NULL;
654 struct ocfs2_extent_list *el;
655
656 el = &xv->xr_list;
657 clusters = le32_to_cpu(xv->xr_clusters);
658 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
659 blocksize = inode->i_sb->s_blocksize;
660
661 cpos = 0;
662 while (cpos < clusters) {
663 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
664 &num_clusters, el);
665 if (ret) {
666 mlog_errno(ret);
667 goto out;
668 }
669
670 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
671 /* Copy ocfs2_xattr_value */
672 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
673 ret = ocfs2_read_block(inode, blkno, &bh);
674 if (ret) {
675 mlog_errno(ret);
676 goto out;
677 }
678
679 cplen = len >= blocksize ? blocksize : len;
680 memcpy(buffer, bh->b_data, cplen);
681 len -= cplen;
682 buffer += cplen;
683
684 brelse(bh);
685 bh = NULL;
686 if (len == 0)
687 break;
688 }
689 cpos += num_clusters;
690 }
691out:
692 return ret;
693}
694
695static int ocfs2_xattr_ibody_get(struct inode *inode,
696 int name_index,
697 const char *name,
698 void *buffer,
699 size_t buffer_size,
700 struct ocfs2_xattr_search *xs)
701{
702 struct ocfs2_inode_info *oi = OCFS2_I(inode);
703 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
704 struct ocfs2_xattr_value_root *xv;
705 size_t size;
706 int ret = 0;
707
708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
709 return -ENODATA;
710
711 xs->end = (void *)di + inode->i_sb->s_blocksize;
712 xs->header = (struct ocfs2_xattr_header *)
713 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
714 xs->base = (void *)xs->header;
715 xs->here = xs->header->xh_entries;
716
717 ret = ocfs2_xattr_find_entry(name_index, name, xs);
718 if (ret)
719 return ret;
720 size = le64_to_cpu(xs->here->xe_value_size);
721 if (buffer) {
722 if (size > buffer_size)
723 return -ERANGE;
724 if (ocfs2_xattr_is_local(xs->here)) {
725 memcpy(buffer, (void *)xs->base +
726 le16_to_cpu(xs->here->xe_name_offset) +
727 OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
728 } else {
729 xv = (struct ocfs2_xattr_value_root *)
730 (xs->base + le16_to_cpu(
731 xs->here->xe_name_offset) +
732 OCFS2_XATTR_SIZE(xs->here->xe_name_len));
733 ret = ocfs2_xattr_get_value_outside(inode, xv,
734 buffer, size);
735 if (ret < 0) {
736 mlog_errno(ret);
737 return ret;
738 }
739 }
740 }
741
742 return size;
743}
744
745static int ocfs2_xattr_block_get(struct inode *inode,
746 int name_index,
747 const char *name,
748 void *buffer,
749 size_t buffer_size,
750 struct ocfs2_xattr_search *xs)
751{
752 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
753 struct buffer_head *blk_bh = NULL;
754 struct ocfs2_xattr_block *xb;
755 struct ocfs2_xattr_value_root *xv;
756 size_t size;
757 int ret = -ENODATA, name_offset, name_len, block_off, i;
758
759 if (!di->i_xattr_loc)
760 return ret;
761
762 memset(&xs->bucket, 0, sizeof(xs->bucket));
763
764 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
765 if (ret < 0) {
766 mlog_errno(ret);
767 return ret;
768 }
769 /*Verify the signature of xattr block*/
770 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
771 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
772 ret = -EFAULT;
773 goto cleanup;
774 }
775
776 xs->xattr_bh = blk_bh;
777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
778
779 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
780 xs->header = &xb->xb_attrs.xb_header;
781 xs->base = (void *)xs->header;
782 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
783 xs->here = xs->header->xh_entries;
784
785 ret = ocfs2_xattr_find_entry(name_index, name, xs);
786 } else
787 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
788 name_index,
789 name, xs);
790
791 if (ret)
792 goto cleanup;
793 size = le64_to_cpu(xs->here->xe_value_size);
794 if (buffer) {
795 ret = -ERANGE;
796 if (size > buffer_size)
797 goto cleanup;
798
799 name_offset = le16_to_cpu(xs->here->xe_name_offset);
800 name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
801 i = xs->here - xs->header->xh_entries;
802
803 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
804 ret = ocfs2_xattr_bucket_get_name_value(inode,
805 xs->bucket.xh,
806 i,
807 &block_off,
808 &name_offset);
809 xs->base = xs->bucket.bhs[block_off]->b_data;
810 }
811 if (ocfs2_xattr_is_local(xs->here)) {
812 memcpy(buffer, (void *)xs->base +
813 name_offset + name_len, size);
814 } else {
815 xv = (struct ocfs2_xattr_value_root *)
816 (xs->base + name_offset + name_len);
817 ret = ocfs2_xattr_get_value_outside(inode, xv,
818 buffer, size);
819 if (ret < 0) {
820 mlog_errno(ret);
821 goto cleanup;
822 }
823 }
824 }
825 ret = size;
826cleanup:
827 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
828 brelse(xs->bucket.bhs[i]);
829 memset(&xs->bucket, 0, sizeof(xs->bucket));
830
831 brelse(blk_bh);
832 return ret;
833}
834
835/* ocfs2_xattr_get()
836 *
837 * Copy an extended attribute into the buffer provided.
838 * Buffer is NULL to compute the size of buffer required.
839 */
840int ocfs2_xattr_get(struct inode *inode,
841 int name_index,
842 const char *name,
843 void *buffer,
844 size_t buffer_size)
845{
846 int ret;
847 struct ocfs2_dinode *di = NULL;
848 struct buffer_head *di_bh = NULL;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850 struct ocfs2_xattr_search xis = {
851 .not_found = -ENODATA,
852 };
853 struct ocfs2_xattr_search xbs = {
854 .not_found = -ENODATA,
855 };
856
857 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
858 return -EOPNOTSUPP;
859
860 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
861 ret = -ENODATA;
862
863 ret = ocfs2_inode_lock(inode, &di_bh, 0);
864 if (ret < 0) {
865 mlog_errno(ret);
866 return ret;
867 }
868 xis.inode_bh = xbs.inode_bh = di_bh;
869 di = (struct ocfs2_dinode *)di_bh->b_data;
870
871 down_read(&oi->ip_xattr_sem);
872 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
873 buffer_size, &xis);
874 if (ret == -ENODATA)
875 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
876 buffer_size, &xbs);
877 up_read(&oi->ip_xattr_sem);
878 ocfs2_inode_unlock(inode, 0);
879
880 brelse(di_bh);
881
882 return ret;
883}
884
885static int __ocfs2_xattr_set_value_outside(struct inode *inode,
886 struct ocfs2_xattr_value_root *xv,
887 const void *value,
888 int value_len)
889{
890 int ret = 0, i, cp_len, credits;
891 u16 blocksize = inode->i_sb->s_blocksize;
892 u32 p_cluster, num_clusters;
893 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
894 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
895 u64 blkno;
896 struct buffer_head *bh = NULL;
897 handle_t *handle;
898
899 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
900
901 credits = clusters * bpc;
902 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
903 if (IS_ERR(handle)) {
904 ret = PTR_ERR(handle);
905 mlog_errno(ret);
906 goto out;
907 }
908
909 while (cpos < clusters) {
910 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
911 &num_clusters, &xv->xr_list);
912 if (ret) {
913 mlog_errno(ret);
914 goto out_commit;
915 }
916
917 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
918
919 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
920 ret = ocfs2_read_block(inode, blkno, &bh);
921 if (ret) {
922 mlog_errno(ret);
923 goto out_commit;
924 }
925
926 ret = ocfs2_journal_access(handle,
927 inode,
928 bh,
929 OCFS2_JOURNAL_ACCESS_WRITE);
930 if (ret < 0) {
931 mlog_errno(ret);
932 goto out_commit;
933 }
934
935 cp_len = value_len > blocksize ? blocksize : value_len;
936 memcpy(bh->b_data, value, cp_len);
937 value_len -= cp_len;
938 value += cp_len;
939 if (cp_len < blocksize)
940 memset(bh->b_data + cp_len, 0,
941 blocksize - cp_len);
942
943 ret = ocfs2_journal_dirty(handle, bh);
944 if (ret < 0) {
945 mlog_errno(ret);
946 goto out_commit;
947 }
948 brelse(bh);
949 bh = NULL;
950
951 /*
952 * XXX: do we need to empty all the following
953 * blocks in this cluster?
954 */
955 if (!value_len)
956 break;
957 }
958 cpos += num_clusters;
959 }
960out_commit:
961 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
962out:
963 brelse(bh);
964
965 return ret;
966}
967
968static int ocfs2_xattr_cleanup(struct inode *inode,
969 struct ocfs2_xattr_info *xi,
970 struct ocfs2_xattr_search *xs,
971 size_t offs)
972{
973 handle_t *handle = NULL;
974 int ret = 0;
975 size_t name_len = strlen(xi->name);
976 void *val = xs->base + offs;
977 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
978
979 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
980 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
981 if (IS_ERR(handle)) {
982 ret = PTR_ERR(handle);
983 mlog_errno(ret);
984 goto out;
985 }
986 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
987 OCFS2_JOURNAL_ACCESS_WRITE);
988 if (ret) {
989 mlog_errno(ret);
990 goto out_commit;
991 }
992 /* Decrease xattr count */
993 le16_add_cpu(&xs->header->xh_count, -1);
994 /* Remove the xattr entry and tree root which has already be set*/
995 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
996 memset(val, 0, size);
997
998 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
999 if (ret < 0)
1000 mlog_errno(ret);
1001out_commit:
1002 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1003out:
1004 return ret;
1005}
1006
1007static int ocfs2_xattr_update_entry(struct inode *inode,
1008 struct ocfs2_xattr_info *xi,
1009 struct ocfs2_xattr_search *xs,
1010 size_t offs)
1011{
1012 handle_t *handle = NULL;
1013 int ret = 0;
1014
1015 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1016 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1017 if (IS_ERR(handle)) {
1018 ret = PTR_ERR(handle);
1019 mlog_errno(ret);
1020 goto out;
1021 }
1022 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1023 OCFS2_JOURNAL_ACCESS_WRITE);
1024 if (ret) {
1025 mlog_errno(ret);
1026 goto out_commit;
1027 }
1028
1029 xs->here->xe_name_offset = cpu_to_le16(offs);
1030 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1031 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
1032 ocfs2_xattr_set_local(xs->here, 1);
1033 else
1034 ocfs2_xattr_set_local(xs->here, 0);
1035 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1036
1037 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1038 if (ret < 0)
1039 mlog_errno(ret);
1040out_commit:
1041 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1042out:
1043 return ret;
1044}
1045
1046/*
1047 * ocfs2_xattr_set_value_outside()
1048 *
1049 * Set large size value in B tree.
1050 */
1051static int ocfs2_xattr_set_value_outside(struct inode *inode,
1052 struct ocfs2_xattr_info *xi,
1053 struct ocfs2_xattr_search *xs,
1054 size_t offs)
1055{
1056 size_t name_len = strlen(xi->name);
1057 void *val = xs->base + offs;
1058 struct ocfs2_xattr_value_root *xv = NULL;
1059 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1060 int ret = 0;
1061
1062 memset(val, 0, size);
1063 memcpy(val, xi->name, name_len);
1064 xv = (struct ocfs2_xattr_value_root *)
1065 (val + OCFS2_XATTR_SIZE(name_len));
1066 xv->xr_clusters = 0;
1067 xv->xr_last_eb_blk = 0;
1068 xv->xr_list.l_tree_depth = 0;
1069 xv->xr_list.l_count = cpu_to_le16(1);
1070 xv->xr_list.l_next_free_rec = 0;
1071
1072 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
1073 xi->value_len);
1074 if (ret < 0) {
1075 mlog_errno(ret);
1076 return ret;
1077 }
1078 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
1079 xi->value_len);
1080 if (ret < 0) {
1081 mlog_errno(ret);
1082 return ret;
1083 }
1084 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
1085 if (ret < 0)
1086 mlog_errno(ret);
1087
1088 return ret;
1089}
1090
1091/*
1092 * ocfs2_xattr_set_entry_local()
1093 *
1094 * Set, replace or remove extended attribute in local.
 *
 * Edits the entry array and the name+value area described by @xs in place.
 *
 * @xi:       name, name index and new value; a NULL xi->value requests
 *            removal of the existing entry.
 * @xs:       search state; a non-zero xs->not_found means no matching
 *            entry exists, otherwise xs->here is the entry being changed.
 * @last:     entry slot initialised when a new attribute is inserted
 *            (ocfs2_xattr_set_entry passes the slot after the last used
 *            entry).
 * @min_offs: lowest name offset currently in use (as computed by
 *            ocfs2_xattr_set_entry); new name+value data is written
 *            immediately below it.
1095 */
1096static void ocfs2_xattr_set_entry_local(struct inode *inode,
1097 struct ocfs2_xattr_info *xi,
1098 struct ocfs2_xattr_search *xs,
1099 struct ocfs2_xattr_entry *last,
1100 size_t min_offs)
1101{
1102 size_t name_len = strlen(xi->name);
1103 int i;
1104
1105 if (xi->value && xs->not_found) {
1106 /* Insert the new xattr entry. */
 /*
  * NOTE(review): the name+value insert at the bottom of this function
  * writes through xs->here, not 'last'; presumably the two coincide
  * when the lookup found nothing - confirm against the find helpers.
  */
1107 le16_add_cpu(&xs->header->xh_count, 1);
1108 ocfs2_xattr_set_type(last, xi->name_index);
1109 ocfs2_xattr_set_local(last, 1);
1110 last->xe_name_len = name_len;
1111 } else {
1112 void *first_val;
1113 void *val;
1114 size_t offs, size;
1115
 /* first_val: lowest name+value in use; val: this entry's name+value. */
1116 first_val = xs->base + min_offs;
1117 offs = le16_to_cpu(xs->here->xe_name_offset);
1118 val = xs->base + offs;
1119
 /*
  * Space the existing attribute occupies: padded name plus either a
  * value root (old value larger than the inline limit) or the padded
  * old value.
  */
1120 if (le64_to_cpu(xs->here->xe_value_size) >
1121 OCFS2_XATTR_INLINE_SIZE)
1122 size = OCFS2_XATTR_SIZE(name_len) +
1123 OCFS2_XATTR_ROOT_SIZE;
1124 else
1125 size = OCFS2_XATTR_SIZE(name_len) +
1126 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1127
1128 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1129 OCFS2_XATTR_SIZE(xi->value_len)) {
1130 /* The old and the new value have the
1131 same size. Just replace the value. */
1132 ocfs2_xattr_set_local(xs->here, 1);
1133 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1134 /* Clear value bytes. */
1135 memset(val + OCFS2_XATTR_SIZE(name_len),
1136 0,
1137 OCFS2_XATTR_SIZE(xi->value_len));
1138 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1139 xi->value,
1140 xi->value_len);
1141 return;
1142 }
1143 /* Remove the old name+value. */
 /*
  * Everything stored between first_val and val is shifted up by 'size'
  * bytes, which overwrites the old name+value; the 'size' bytes freed
  * at the low end are then zeroed.
  */
1144 memmove(first_val + size, first_val, val - first_val);
1145 memset(first_val, 0, size);
1146 xs->here->xe_name_hash = 0;
1147 xs->here->xe_name_offset = 0;
1148 ocfs2_xattr_set_local(xs->here, 1);
1149 xs->here->xe_value_size = 0;
1150
 /* The lowest used offset moved up by the amount just reclaimed. */
1151 min_offs += size;
1152
1153 /* Adjust all value offsets. */
 /* Only entries whose data sat below the removed one were moved. */
1154 last = xs->header->xh_entries;
1155 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1156 size_t o = le16_to_cpu(last->xe_name_offset);
1157
1158 if (o < offs)
1159 last->xe_name_offset = cpu_to_le16(o + size);
1160 last += 1;
1161 }
1162
1163 if (!xi->value) {
1164 /* Remove the old entry. */
 /*
  * After the loop 'last' is one past the final entry: step back to the
  * final entry, slide the entries after xs->here down by one slot and
  * clear the slot that became vacant.
  */
1165 last -= 1;
1166 memmove(xs->here, xs->here + 1,
1167 (void *)last - (void *)xs->here);
1168 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
1169 le16_add_cpu(&xs->header->xh_count, -1);
1170 }
1171 }
1172 if (xi->value) {
1173 /* Insert the new name+value. */
 /* Placed directly below the (possibly updated) lowest used offset. */
1174 size_t size = OCFS2_XATTR_SIZE(name_len) +
1175 OCFS2_XATTR_SIZE(xi->value_len);
1176 void *val = xs->base + min_offs - size;
1177
1178 xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
1179 memset(val, 0, size);
1180 memcpy(val, xi->name, name_len);
1181 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1182 xi->value,
1183 xi->value_len);
1184 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1185 ocfs2_xattr_set_local(xs->here, 1);
1186 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1187 }
1188
1189 return;
1190}
1191
1192/*
1193 * ocfs2_xattr_set_entry()
1194 *
1195 * Set extended attribute entry into inode or block.
1196 *
1197 * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
1198 * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
1199 * then set value in B tree with set_value_outside().
 *
 * @xi:   attribute name, name index and new value (NULL value = remove).
 * @xs:   search state describing the header/entry area being edited and
 *        whether the attribute already exists (xs->not_found == 0).
 * @flag: dynamic-feature bits OR-ed into the inode on success; when
 *        OCFS2_INLINE_XATTR_FL is absent from it, xs->xattr_bh is
 *        journalled as an external block in addition to the inode.
 *
 * Returns 0 on success, -EFAULT when the computed free space is negative,
 * -ENOSPC when the new entry does not fit, or the error from a
 * transaction, journal or value helper.
1200 */
1201static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_xattr_info *xi,
1203 struct ocfs2_xattr_search *xs,
1204 int flag)
1205{
1206 struct ocfs2_xattr_entry *last;
1207 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1208 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1209 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1210 size_t size_l = 0;
1211 handle_t *handle = NULL;
1212 int free, i, ret;
 /* Copy of @xi used for the local step; its value may be swapped below. */
1213 struct ocfs2_xattr_info xi_l = {
1214 .name_index = xi->name_index,
1215 .name = xi->name,
1216 .value = xi->value,
1217 .value_len = xi->value_len,
1218 };
1219
1220 /* Compute min_offs, last and free space. */
1221 last = xs->header->xh_entries;
1222
1223 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1224 size_t offs = le16_to_cpu(last->xe_name_offset);
1225 if (offs < min_offs)
1226 min_offs = offs;
1227 last += 1;
1228 }
1229
 /*
  * Gap between the end of the entry array and the lowest name+value,
  * less one __u32. NOTE(review): the purpose of the __u32 reserve is
  * not visible in this function - confirm.
  */
1230 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
1231 if (free < 0)
1232 return -EFAULT;
1233
1234 if (!xs->not_found) {
 /* Replacing: the space held by the existing attribute is reusable. */
1235 size_t size = 0;
1236 if (ocfs2_xattr_is_local(xs->here))
1237 size = OCFS2_XATTR_SIZE(name_len) +
1238 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1239 else
1240 size = OCFS2_XATTR_SIZE(name_len) +
1241 OCFS2_XATTR_ROOT_SIZE;
1242 free += (size + sizeof(struct ocfs2_xattr_entry));
1243 }
1244 /* Check free space in inode or block */
1245 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1246 if (free < sizeof(struct ocfs2_xattr_entry) +
1247 OCFS2_XATTR_SIZE(name_len) +
1248 OCFS2_XATTR_ROOT_SIZE) {
1249 ret = -ENOSPC;
1250 goto out;
1251 }
 /*
  * Large value: only a value root is stored locally, so the local
  * step is given def_xv (defined elsewhere in this file) of
  * OCFS2_XATTR_ROOT_SIZE bytes instead of the caller's value.
  */
1252 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1253 xi_l.value = (void *)&def_xv;
1254 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
1255 } else if (xi->value) {
1256 if (free < sizeof(struct ocfs2_xattr_entry) +
1257 OCFS2_XATTR_SIZE(name_len) +
1258 OCFS2_XATTR_SIZE(xi->value_len)) {
1259 ret = -ENOSPC;
1260 goto out;
1261 }
1262 }
1263
1264 if (!xs->not_found) {
1265 /* For existing extended attribute */
1266 size_t size = OCFS2_XATTR_SIZE(name_len) +
1267 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1268 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1269 void *val = xs->base + offs;
1270
 /*
  * size_l is non-zero only for a large new value, so this branch is
  * taken when the existing local name+value occupies exactly the
  * space a name + value root needs; the slot is then reused in place.
  */
1271 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1272 /* Replace existing local xattr with tree root */
1273 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1274 offs);
1275 if (ret < 0)
1276 mlog_errno(ret);
1277 goto out;
1278 } else if (!ocfs2_xattr_is_local(xs->here)) {
1279 /* For existing xattr which has value outside */
1280 struct ocfs2_xattr_value_root *xv = NULL;
1281 xv = (struct ocfs2_xattr_value_root *)(val +
1282 OCFS2_XATTR_SIZE(name_len));
1283
1284 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1285 /*
1286 * If new value need set outside also,
1287 * first truncate old value to new value,
1288 * then set new value with set_value_outside().
1289 */
1290 ret = ocfs2_xattr_value_truncate(inode,
1291 xs->xattr_bh,
1292 xv,
1293 xi->value_len);
1294 if (ret < 0) {
1295 mlog_errno(ret);
1296 goto out;
1297 }
1298
1299 ret = __ocfs2_xattr_set_value_outside(inode,
1300 xv,
1301 xi->value,
1302 xi->value_len);
1303 if (ret < 0) {
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_xattr_update_entry(inode,
1309 xi,
1310 xs,
1311 offs);
1312 if (ret < 0)
1313 mlog_errno(ret);
1314 goto out;
1315 } else {
1316 /*
1317 * If new value need set in local,
1318 * just truncate old value to zero.
1319 */
1320 ret = ocfs2_xattr_value_truncate(inode,
1321 xs->xattr_bh,
1322 xv,
1323 0);
 /*
  * NOTE(review): a truncate failure is only logged here;
  * execution continues and ret is overwritten below.
  */
1324 if (ret < 0)
1325 mlog_errno(ret);
1326 }
1327 }
1328 }
1329
 /* Transaction covering the local entry edit and the inode update. */
1330 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1331 OCFS2_INODE_UPDATE_CREDITS);
1332 if (IS_ERR(handle)) {
1333 ret = PTR_ERR(handle);
1334 mlog_errno(ret);
1335 goto out;
1336 }
1337
1338 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out_commit;
1343 }
1344
1345 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1346 /* set extended attribute in external block. */
1347 ret = ocfs2_extend_trans(handle,
1348 OCFS2_INODE_UPDATE_CREDITS +
1349 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1350 if (ret) {
1351 mlog_errno(ret);
1352 goto out_commit;
1353 }
1354 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1355 OCFS2_JOURNAL_ACCESS_WRITE);
1356 if (ret) {
1357 mlog_errno(ret);
1358 goto out_commit;
1359 }
1360 }
1361
1362 /*
1363 * Set value in local, include set tree root in local.
1364 * This is the first step for value size >INLINE_SIZE.
1365 */
1366 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1367
1368 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1369 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1370 if (ret < 0) {
1371 mlog_errno(ret);
1372 goto out_commit;
1373 }
1374 }
1375
 /* First inline xattr on this inode: carve the area out of the inode body. */
1376 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
1377 (flag & OCFS2_INLINE_XATTR_FL)) {
1378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1379 unsigned int xattrsize = osb->s_xattr_inline_size;
1380
1381 /*
1382 * Adjust extent record count or inline data size
1383 * to reserve space for extended attribute.
1384 */
1385 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1386 struct ocfs2_inline_data *idata = &di->id2.i_data;
1387 le16_add_cpu(&idata->id_count, -xattrsize);
1388 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1389 struct ocfs2_extent_list *el = &di->id2.i_list;
1390 le16_add_cpu(&el->l_count, -(xattrsize /
1391 sizeof(struct ocfs2_extent_rec)));
1392 }
1393 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1394 }
1395 /* Update xattr flag */
1396 spin_lock(&oi->ip_lock);
1397 oi->ip_dyn_features |= flag;
1398 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1399 spin_unlock(&oi->ip_lock);
1400 /* Update inode ctime */
1401 inode->i_ctime = CURRENT_TIME;
1402 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1403 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1404
1405 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1406 if (ret < 0)
1407 mlog_errno(ret);
1408
 /* Reached on success and on every failure after the handle was started. */
1409out_commit:
1410 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1411
1412 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1413 /*
1414 * Set value outside in B tree.
1415 * This is the second step for value size > INLINE_SIZE.
1416 */
1417 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1418 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
1419 if (ret < 0) {
1420 int ret2;
1421
1422 mlog_errno(ret);
1423 /*
1424 * If set value outside failed, we have to clean
1425 * the junk tree root we have already set in local.
1426 */
 /* The original error in ret is what gets returned; ret2 is only logged. */
1427 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
1428 if (ret2 < 0)
1429 mlog_errno(ret2);
1430 }
1431 }
1432out:
1433 return ret;
1434
1435}
1436
1437static int ocfs2_remove_value_outside(struct inode*inode,
1438 struct buffer_head *bh,
1439 struct ocfs2_xattr_header *header)
1440{
1441 int ret = 0, i;
1442
1443 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1444 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1445
1446 if (!ocfs2_xattr_is_local(entry)) {
1447 struct ocfs2_xattr_value_root *xv;
1448 void *val;
1449
1450 val = (void *)header +
1451 le16_to_cpu(entry->xe_name_offset);
1452 xv = (struct ocfs2_xattr_value_root *)
1453 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1454 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
1455 if (ret < 0) {
1456 mlog_errno(ret);
1457 return ret;
1458 }
1459 }
1460 }
1461
1462 return ret;
1463}
1464
1465static int ocfs2_xattr_ibody_remove(struct inode *inode,
1466 struct buffer_head *di_bh)
1467{
1468
1469 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1470 struct ocfs2_xattr_header *header;
1471 int ret;
1472
1473 header = (struct ocfs2_xattr_header *)
1474 ((void *)di + inode->i_sb->s_blocksize -
1475 le16_to_cpu(di->i_xattr_inline_size));
1476
1477 ret = ocfs2_remove_value_outside(inode, di_bh, header);
1478
1479 return ret;
1480}
1481
1482static int ocfs2_xattr_block_remove(struct inode *inode,
1483 struct buffer_head *blk_bh)
1484{
1485 struct ocfs2_xattr_block *xb;
1486 int ret = 0;
1487
1488 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1489 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1490 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1491 ret = ocfs2_remove_value_outside(inode, blk_bh, header);
1492 } else
1493 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1494
1495 return ret;
1496}
1497
1498static int ocfs2_xattr_free_block(struct inode *inode,
1499 u64 block)
1500{
1501 struct inode *xb_alloc_inode;
1502 struct buffer_head *xb_alloc_bh = NULL;
1503 struct buffer_head *blk_bh = NULL;
1504 struct ocfs2_xattr_block *xb;
1505 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1506 handle_t *handle;
1507 int ret = 0;
1508 u64 blk, bg_blkno;
1509 u16 bit;
1510
1511 ret = ocfs2_read_block(inode, block, &blk_bh);
1512 if (ret < 0) {
1513 mlog_errno(ret);
1514 goto out;
1515 }
1516
1517 /*Verify the signature of xattr block*/
1518 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1519 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1520 ret = -EFAULT;
1521 goto out;
1522 }
1523
1524 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1525 if (ret < 0) {
1526 mlog_errno(ret);
1527 goto out;
1528 }
1529
1530 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1531 blk = le64_to_cpu(xb->xb_blkno);
1532 bit = le16_to_cpu(xb->xb_suballoc_bit);
1533 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1534
1535 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
1536 EXTENT_ALLOC_SYSTEM_INODE,
1537 le16_to_cpu(xb->xb_suballoc_slot));
1538 if (!xb_alloc_inode) {
1539 ret = -ENOMEM;
1540 mlog_errno(ret);
1541 goto out;
1542 }
1543 mutex_lock(&xb_alloc_inode->i_mutex);
1544
1545 ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
1546 if (ret < 0) {
1547 mlog_errno(ret);
1548 goto out_mutex;
1549 }
1550
1551 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
1552 if (IS_ERR(handle)) {
1553 ret = PTR_ERR(handle);
1554 mlog_errno(ret);
1555 goto out_unlock;
1556 }
1557
1558 ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
1559 bit, bg_blkno, 1);
1560 if (ret < 0)
1561 mlog_errno(ret);
1562
1563 ocfs2_commit_trans(osb, handle);
1564out_unlock:
1565 ocfs2_inode_unlock(xb_alloc_inode, 1);
1566 brelse(xb_alloc_bh);
1567out_mutex:
1568 mutex_unlock(&xb_alloc_inode->i_mutex);
1569 iput(xb_alloc_inode);
1570out:
1571 brelse(blk_bh);
1572 return ret;
1573}
1574
1575/*
1576 * ocfs2_xattr_remove()
1577 *
1578 * Free extended attribute resources associated with this inode.
1579 */
1580int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1581{
1582 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1584 handle_t *handle;
1585 int ret;
1586
1587 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1588 return 0;
1589
1590 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1591 return 0;
1592
1593 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1594 ret = ocfs2_xattr_ibody_remove(inode, di_bh);
1595 if (ret < 0) {
1596 mlog_errno(ret);
1597 goto out;
1598 }
1599 }
1600
1601 if (di->i_xattr_loc) {
1602 ret = ocfs2_xattr_free_block(inode,
1603 le64_to_cpu(di->i_xattr_loc));
1604 if (ret < 0) {
1605 mlog_errno(ret);
1606 goto out;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1611 OCFS2_INODE_UPDATE_CREDITS);
1612 if (IS_ERR(handle)) {
1613 ret = PTR_ERR(handle);
1614 mlog_errno(ret);
1615 goto out;
1616 }
1617 ret = ocfs2_journal_access(handle, inode, di_bh,
1618 OCFS2_JOURNAL_ACCESS_WRITE);
1619 if (ret) {
1620 mlog_errno(ret);
1621 goto out_commit;
1622 }
1623
1624 di->i_xattr_loc = 0;
1625
1626 spin_lock(&oi->ip_lock);
1627 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
1628 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1629 spin_unlock(&oi->ip_lock);
1630
1631 ret = ocfs2_journal_dirty(handle, di_bh);
1632 if (ret < 0)
1633 mlog_errno(ret);
1634out_commit:
1635 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1636out:
1637 return ret;
1638}
1639
1640static int ocfs2_xattr_has_space_inline(struct inode *inode,
1641 struct ocfs2_dinode *di)
1642{
1643 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1644 unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
1645 int free;
1646
1647 if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
1648 return 0;
1649
1650 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1651 struct ocfs2_inline_data *idata = &di->id2.i_data;
1652 free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
1653 } else if (ocfs2_inode_is_fast_symlink(inode)) {
1654 free = ocfs2_fast_symlink_chars(inode->i_sb) -
1655 le64_to_cpu(di->i_size);
1656 } else {
1657 struct ocfs2_extent_list *el = &di->id2.i_list;
1658 free = (le16_to_cpu(el->l_count) -
1659 le16_to_cpu(el->l_next_free_rec)) *
1660 sizeof(struct ocfs2_extent_rec);
1661 }
1662 if (free >= xattrsize)
1663 return 1;
1664
1665 return 0;
1666}
1667
1668/*
1669 * ocfs2_xattr_ibody_find()
1670 *
1671 * Find extended attribute in inode block and
1672 * fill search info into struct ocfs2_xattr_search.
1673 */
1674static int ocfs2_xattr_ibody_find(struct inode *inode,
1675 int name_index,
1676 const char *name,
1677 struct ocfs2_xattr_search *xs)
1678{
1679 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1681 int ret;
1682 int has_space = 0;
1683
1684 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1685 return 0;
1686
1687 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1688 down_read(&oi->ip_alloc_sem);
1689 has_space = ocfs2_xattr_has_space_inline(inode, di);
1690 up_read(&oi->ip_alloc_sem);
1691 if (!has_space)
1692 return 0;
1693 }
1694
1695 xs->xattr_bh = xs->inode_bh;
1696 xs->end = (void *)di + inode->i_sb->s_blocksize;
1697 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
1698 xs->header = (struct ocfs2_xattr_header *)
1699 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
1700 else
1701 xs->header = (struct ocfs2_xattr_header *)
1702 (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
1703 xs->base = (void *)xs->header;
1704 xs->here = xs->header->xh_entries;
1705
1706 /* Find the named attribute. */
1707 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1708 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1709 if (ret && ret != -ENODATA)
1710 return ret;
1711 xs->not_found = ret;
1712 }
1713
1714 return 0;
1715}
1716
1717/*
1718 * ocfs2_xattr_ibody_set()
1719 *
1720 * Set, replace or remove an extended attribute into inode block.
1721 *
1722 */
1723static int ocfs2_xattr_ibody_set(struct inode *inode,
1724 struct ocfs2_xattr_info *xi,
1725 struct ocfs2_xattr_search *xs)
1726{
1727 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1728 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1729 int ret;
1730
1731 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1732 return -ENOSPC;
1733
1734 down_write(&oi->ip_alloc_sem);
1735 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1736 if (!ocfs2_xattr_has_space_inline(inode, di)) {
1737 ret = -ENOSPC;
1738 goto out;
1739 }
1740 }
1741
1742 ret = ocfs2_xattr_set_entry(inode, xi, xs,
1743 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1744out:
1745 up_write(&oi->ip_alloc_sem);
1746
1747 return ret;
1748}
1749
1750/*
1751 * ocfs2_xattr_block_find()
1752 *
1753 * Find extended attribute in external block and
1754 * fill search info into struct ocfs2_xattr_search.
1755 */
1756static int ocfs2_xattr_block_find(struct inode *inode,
1757 int name_index,
1758 const char *name,
1759 struct ocfs2_xattr_search *xs)
1760{
1761 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1762 struct buffer_head *blk_bh = NULL;
1763 struct ocfs2_xattr_block *xb;
1764 int ret = 0;
1765
1766 if (!di->i_xattr_loc)
1767 return ret;
1768
1769 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
1770 if (ret < 0) {
1771 mlog_errno(ret);
1772 return ret;
1773 }
1774 /*Verify the signature of xattr block*/
1775 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1776 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1777 ret = -EFAULT;
1778 goto cleanup;
1779 }
1780
1781 xs->xattr_bh = blk_bh;
1782 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1783
1784 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1785 xs->header = &xb->xb_attrs.xb_header;
1786 xs->base = (void *)xs->header;
1787 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
1788 xs->here = xs->header->xh_entries;
1789
1790 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1791 } else
1792 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
1793 name_index,
1794 name, xs);
1795
1796 if (ret && ret != -ENODATA) {
1797 xs->xattr_bh = NULL;
1798 goto cleanup;
1799 }
1800 xs->not_found = ret;
1801 return 0;
1802cleanup:
1803 brelse(blk_bh);
1804
1805 return ret;
1806}
1807
1808/*
1809 * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
1810 * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
1811 * re-initialized.
1812 */
1813static int ocfs2_restore_xattr_block(struct inode *inode,
1814 struct ocfs2_xattr_search *xs)
1815{
1816 int ret;
1817 handle_t *handle;
1818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1819 struct ocfs2_xattr_block *xb =
1820 (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1821 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
1822 u16 xb_flags = le16_to_cpu(xb->xb_flags);
1823
1824 BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
1825 le16_to_cpu(el->l_next_free_rec) != 0);
1826
1827 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1828 if (IS_ERR(handle)) {
1829 ret = PTR_ERR(handle);
1830 handle = NULL;
1831 goto out;
1832 }
1833
1834 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1835 OCFS2_JOURNAL_ACCESS_WRITE);
1836 if (ret < 0) {
1837 mlog_errno(ret);
1838 goto out_commit;
1839 }
1840
1841 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
1842 offsetof(struct ocfs2_xattr_block, xb_attrs));
1843
1844 xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
1845
1846 ocfs2_journal_dirty(handle, xs->xattr_bh);
1847
1848out_commit:
1849 ocfs2_commit_trans(osb, handle);
1850out:
1851 return ret;
1852}
1853
1854/*
1855 * ocfs2_xattr_block_set()
1856 *
1857 * Set, replace or remove an extended attribute into external block.
1858 *
1859 */
1860static int ocfs2_xattr_block_set(struct inode *inode,
1861 struct ocfs2_xattr_info *xi,
1862 struct ocfs2_xattr_search *xs)
1863{
1864 struct buffer_head *new_bh = NULL;
1865 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1866 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1867 struct ocfs2_alloc_context *meta_ac = NULL;
1868 handle_t *handle = NULL;
1869 struct ocfs2_xattr_block *xblk = NULL;
1870 u16 suballoc_bit_start;
1871 u32 num_got;
1872 u64 first_blkno;
1873 int ret;
1874
1875 if (!xs->xattr_bh) {
1876 /*
1877 * Alloc one external block for extended attribute
1878 * outside of inode.
1879 */
1880 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1881 if (ret < 0) {
1882 mlog_errno(ret);
1883 goto out;
1884 }
1885 handle = ocfs2_start_trans(osb,
1886 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1887 if (IS_ERR(handle)) {
1888 ret = PTR_ERR(handle);
1889 mlog_errno(ret);
1890 goto out;
1891 }
1892 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1893 OCFS2_JOURNAL_ACCESS_CREATE);
1894 if (ret < 0) {
1895 mlog_errno(ret);
1896 goto out_commit;
1897 }
1898
1899 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
1900 &suballoc_bit_start, &num_got,
1901 &first_blkno);
1902 if (ret < 0) {
1903 mlog_errno(ret);
1904 goto out_commit;
1905 }
1906
1907 new_bh = sb_getblk(inode->i_sb, first_blkno);
1908 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1909
1910 ret = ocfs2_journal_access(handle, inode, new_bh,
1911 OCFS2_JOURNAL_ACCESS_CREATE);
1912 if (ret < 0) {
1913 mlog_errno(ret);
1914 goto out_commit;
1915 }
1916
1917 /* Initialize ocfs2_xattr_block */
1918 xs->xattr_bh = new_bh;
1919 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
1920 memset(xblk, 0, inode->i_sb->s_blocksize);
1921 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
1922 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
1923 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1924 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
1925 xblk->xb_blkno = cpu_to_le64(first_blkno);
1926
1927 xs->header = &xblk->xb_attrs.xb_header;
1928 xs->base = (void *)xs->header;
1929 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1930 xs->here = xs->header->xh_entries;
1931
1932
1933 ret = ocfs2_journal_dirty(handle, new_bh);
1934 if (ret < 0) {
1935 mlog_errno(ret);
1936 goto out_commit;
1937 }
1938 di->i_xattr_loc = cpu_to_le64(first_blkno);
1939 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1940 if (ret < 0)
1941 mlog_errno(ret);
1942out_commit:
1943 ocfs2_commit_trans(osb, handle);
1944out:
1945 if (meta_ac)
1946 ocfs2_free_alloc_context(meta_ac);
1947 if (ret < 0)
1948 return ret;
1949 } else
1950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1951
1952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1953 /* Set extended attribute into external block */
1954 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
1955 if (!ret || ret != -ENOSPC)
1956 goto end;
1957
1958 ret = ocfs2_xattr_create_index_block(inode, xs);
1959 if (ret)
1960 goto end;
1961 }
1962
1963 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
1964 if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
1965 ret = ocfs2_restore_xattr_block(inode, xs);
1966
1967end:
1968
1969 return ret;
1970}
1971
1972/*
1973 * ocfs2_xattr_set()
1974 *
1975 * Set, replace or remove an extended attribute for this inode.
1976 * value is NULL to remove an existing extended attribute, else either
1977 * create or replace an extended attribute.
1978 */
1979int ocfs2_xattr_set(struct inode *inode,
1980 int name_index,
1981 const char *name,
1982 const void *value,
1983 size_t value_len,
1984 int flags)
1985{
1986 struct buffer_head *di_bh = NULL;
1987 struct ocfs2_dinode *di;
1988 int ret;
1989 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
1990
1991 struct ocfs2_xattr_info xi = {
1992 .name_index = name_index,
1993 .name = name,
1994 .value = value,
1995 .value_len = value_len,
1996 };
1997
1998 struct ocfs2_xattr_search xis = {
1999 .not_found = -ENODATA,
2000 };
2001
2002 struct ocfs2_xattr_search xbs = {
2003 .not_found = -ENODATA,
2004 };
2005
2006 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2007 return -EOPNOTSUPP;
2008
2009 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2010 if (ret < 0) {
2011 mlog_errno(ret);
2012 return ret;
2013 }
2014 xis.inode_bh = xbs.inode_bh = di_bh;
2015 di = (struct ocfs2_dinode *)di_bh->b_data;
2016
2017 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2018 /*
2019 * Scan inode and external block to find the same name
2020 * extended attribute and collect search infomation.
2021 */
2022 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2023 if (ret)
2024 goto cleanup;
2025 if (xis.not_found) {
2026 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2027 if (ret)
2028 goto cleanup;
2029 }
2030
2031 if (xis.not_found && xbs.not_found) {
2032 ret = -ENODATA;
2033 if (flags & XATTR_REPLACE)
2034 goto cleanup;
2035 ret = 0;
2036 if (!value)
2037 goto cleanup;
2038 } else {
2039 ret = -EEXIST;
2040 if (flags & XATTR_CREATE)
2041 goto cleanup;
2042 }
2043
2044 if (!value) {
2045 /* Remove existing extended attribute */
2046 if (!xis.not_found)
2047 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2048 else if (!xbs.not_found)
2049 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2050 } else {
2051 /* We always try to set extended attribute into inode first*/
2052 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2053 if (!ret && !xbs.not_found) {
2054 /*
2055 * If succeed and that extended attribute existing in
2056 * external block, then we will remove it.
2057 */
2058 xi.value = NULL;
2059 xi.value_len = 0;
2060 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2061 } else if (ret == -ENOSPC) {
2062 if (di->i_xattr_loc && !xbs.xattr_bh) {
2063 ret = ocfs2_xattr_block_find(inode, name_index,
2064 name, &xbs);
2065 if (ret)
2066 goto cleanup;
2067 }
2068 /*
2069 * If no space in inode, we will set extended attribute
2070 * into external block.
2071 */
2072 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2073 if (ret)
2074 goto cleanup;
2075 if (!xis.not_found) {
2076 /*
2077 * If succeed and that extended attribute
2078 * existing in inode, we will remove it.
2079 */
2080 xi.value = NULL;
2081 xi.value_len = 0;
2082 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2083 }
2084 }
2085 }
2086cleanup:
2087 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2088 ocfs2_inode_unlock(inode, 1);
2089 brelse(di_bh);
2090 brelse(xbs.xattr_bh);
2091 for (i = 0; i < blk_per_bucket; i++)
2092 brelse(xbs.bucket.bhs[i]);
2093
2094 return ret;
2095}
2096
2097/*
2098 * Find the xattr extent rec which may contains name_hash.
2099 * e_cpos will be the first name hash of the xattr rec.
2100 * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
2101 */
2102static int ocfs2_xattr_get_rec(struct inode *inode,
2103 u32 name_hash,
2104 u64 *p_blkno,
2105 u32 *e_cpos,
2106 u32 *num_clusters,
2107 struct ocfs2_extent_list *el)
2108{
2109 int ret = 0, i;
2110 struct buffer_head *eb_bh = NULL;
2111 struct ocfs2_extent_block *eb;
2112 struct ocfs2_extent_rec *rec = NULL;
2113 u64 e_blkno = 0;
2114
2115 if (el->l_tree_depth) {
2116 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
2117 if (ret) {
2118 mlog_errno(ret);
2119 goto out;
2120 }
2121
2122 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2123 el = &eb->h_list;
2124
2125 if (el->l_tree_depth) {
2126 ocfs2_error(inode->i_sb,
2127 "Inode %lu has non zero tree depth in "
2128 "xattr tree block %llu\n", inode->i_ino,
2129 (unsigned long long)eb_bh->b_blocknr);
2130 ret = -EROFS;
2131 goto out;
2132 }
2133 }
2134
2135 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
2136 rec = &el->l_recs[i];
2137
2138 if (le32_to_cpu(rec->e_cpos) <= name_hash) {
2139 e_blkno = le64_to_cpu(rec->e_blkno);
2140 break;
2141 }
2142 }
2143
2144 if (!e_blkno) {
2145 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
2146 "record (%u, %u, 0) in xattr", inode->i_ino,
2147 le32_to_cpu(rec->e_cpos),
2148 ocfs2_rec_clusters(el, rec));
2149 ret = -EROFS;
2150 goto out;
2151 }
2152
2153 *p_blkno = le64_to_cpu(rec->e_blkno);
2154 *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
2155 if (e_cpos)
2156 *e_cpos = le32_to_cpu(rec->e_cpos);
2157out:
2158 brelse(eb_bh);
2159 return ret;
2160}
2161
/*
 * Callback invoked for each bucket visited by ocfs2_iterate_xattr_buckets().
 * It receives the inode, the bucket that was just read and the opaque
 * argument supplied by the caller; a non-zero return value stops the walk.
 */
typedef int (xattr_bucket_func)(struct inode *inode,
				struct ocfs2_xattr_bucket *bucket,
				void *para);
2165
2166static int ocfs2_find_xe_in_bucket(struct inode *inode,
2167 struct buffer_head *header_bh,
2168 int name_index,
2169 const char *name,
2170 u32 name_hash,
2171 u16 *xe_index,
2172 int *found)
2173{
2174 int i, ret = 0, cmp = 1, block_off, new_offset;
2175 struct ocfs2_xattr_header *xh =
2176 (struct ocfs2_xattr_header *)header_bh->b_data;
2177 size_t name_len = strlen(name);
2178 struct ocfs2_xattr_entry *xe = NULL;
2179 struct buffer_head *name_bh = NULL;
2180 char *xe_name;
2181
2182 /*
2183 * We don't use binary search in the bucket because there
2184 * may be multiple entries with the same name hash.
2185 */
2186 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
2187 xe = &xh->xh_entries[i];
2188
2189 if (name_hash > le32_to_cpu(xe->xe_name_hash))
2190 continue;
2191 else if (name_hash < le32_to_cpu(xe->xe_name_hash))
2192 break;
2193
2194 cmp = name_index - ocfs2_xattr_get_type(xe);
2195 if (!cmp)
2196 cmp = name_len - xe->xe_name_len;
2197 if (cmp)
2198 continue;
2199
2200 ret = ocfs2_xattr_bucket_get_name_value(inode,
2201 xh,
2202 i,
2203 &block_off,
2204 &new_offset);
2205 if (ret) {
2206 mlog_errno(ret);
2207 break;
2208 }
2209
2210 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2211 &name_bh);
2212 if (ret) {
2213 mlog_errno(ret);
2214 break;
2215 }
2216 xe_name = name_bh->b_data + new_offset;
2217
2218 cmp = memcmp(name, xe_name, name_len);
2219 brelse(name_bh);
2220 name_bh = NULL;
2221
2222 if (cmp == 0) {
2223 *xe_index = i;
2224 *found = 1;
2225 ret = 0;
2226 break;
2227 }
2228 }
2229
2230 return ret;
2231}
2232
2233/*
2234 * Find the specified xattr entry in a series of buckets.
2235 * This series start from p_blkno and last for num_clusters.
2236 * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
2237 * the num of the valid buckets.
2238 *
2239 * Return the buffer_head this xattr should reside in. And if the xattr's
2240 * hash is in the gap of 2 buckets, return the lower bucket.
2241 */
2242static int ocfs2_xattr_bucket_find(struct inode *inode,
2243 int name_index,
2244 const char *name,
2245 u32 name_hash,
2246 u64 p_blkno,
2247 u32 first_hash,
2248 u32 num_clusters,
2249 struct ocfs2_xattr_search *xs)
2250{
2251 int ret, found = 0;
2252 struct buffer_head *bh = NULL;
2253 struct buffer_head *lower_bh = NULL;
2254 struct ocfs2_xattr_header *xh = NULL;
2255 struct ocfs2_xattr_entry *xe = NULL;
2256 u16 index = 0;
2257 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2258 int low_bucket = 0, bucket, high_bucket;
2259 u32 last_hash;
2260 u64 blkno;
2261
2262 ret = ocfs2_read_block(inode, p_blkno, &bh);
2263 if (ret) {
2264 mlog_errno(ret);
2265 goto out;
2266 }
2267
2268 xh = (struct ocfs2_xattr_header *)bh->b_data;
2269 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2270
2271 while (low_bucket <= high_bucket) {
2272 brelse(bh);
2273 bh = NULL;
2274 bucket = (low_bucket + high_bucket) / 2;
2275
2276 blkno = p_blkno + bucket * blk_per_bucket;
2277
2278 ret = ocfs2_read_block(inode, blkno, &bh);
2279 if (ret) {
2280 mlog_errno(ret);
2281 goto out;
2282 }
2283
2284 xh = (struct ocfs2_xattr_header *)bh->b_data;
2285 xe = &xh->xh_entries[0];
2286 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2287 high_bucket = bucket - 1;
2288 continue;
2289 }
2290
2291 /*
2292 * Check whether the hash of the last entry in our
2293 * bucket is larger than the search one. for an empty
2294 * bucket, the last one is also the first one.
2295 */
2296 if (xh->xh_count)
2297 xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
2298
2299 last_hash = le32_to_cpu(xe->xe_name_hash);
2300
2301 /* record lower_bh which may be the insert place. */
2302 brelse(lower_bh);
2303 lower_bh = bh;
2304 bh = NULL;
2305
2306 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2307 low_bucket = bucket + 1;
2308 continue;
2309 }
2310
2311 /* the searched xattr should reside in this bucket if exists. */
2312 ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
2313 name_index, name, name_hash,
2314 &index, &found);
2315 if (ret) {
2316 mlog_errno(ret);
2317 goto out;
2318 }
2319 break;
2320 }
2321
2322 /*
2323 * Record the bucket we have found.
2324 * When the xattr's hash value is in the gap of 2 buckets, we will
2325 * always set it to the previous bucket.
2326 */
2327 if (!lower_bh) {
2328 /*
2329 * We can't find any bucket whose first name_hash is less
2330 * than the find name_hash.
2331 */
2332 BUG_ON(bh->b_blocknr != p_blkno);
2333 lower_bh = bh;
2334 bh = NULL;
2335 }
2336 xs->bucket.bhs[0] = lower_bh;
2337 xs->bucket.xh = (struct ocfs2_xattr_header *)
2338 xs->bucket.bhs[0]->b_data;
2339 lower_bh = NULL;
2340
2341 xs->header = xs->bucket.xh;
2342 xs->base = xs->bucket.bhs[0]->b_data;
2343 xs->end = xs->base + inode->i_sb->s_blocksize;
2344
2345 if (found) {
2346 /*
2347 * If we have found the xattr enty, read all the blocks in
2348 * this bucket.
2349 */
2350 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2351 blk_per_bucket - 1, &xs->bucket.bhs[1],
2352 0);
2353 if (ret) {
2354 mlog_errno(ret);
2355 goto out;
2356 }
2357
2358 xs->here = &xs->header->xh_entries[index];
2359 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2360 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
2361 } else
2362 ret = -ENODATA;
2363
2364out:
2365 brelse(bh);
2366 brelse(lower_bh);
2367 return ret;
2368}
2369
2370static int ocfs2_xattr_index_block_find(struct inode *inode,
2371 struct buffer_head *root_bh,
2372 int name_index,
2373 const char *name,
2374 struct ocfs2_xattr_search *xs)
2375{
2376 int ret;
2377 struct ocfs2_xattr_block *xb =
2378 (struct ocfs2_xattr_block *)root_bh->b_data;
2379 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
2380 struct ocfs2_extent_list *el = &xb_root->xt_list;
2381 u64 p_blkno = 0;
2382 u32 first_hash, num_clusters = 0;
2383 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
2384
2385 if (le16_to_cpu(el->l_next_free_rec) == 0)
2386 return -ENODATA;
2387
2388 mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
2389 name, name_hash, name_index);
2390
2391 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
2392 &num_clusters, el);
2393 if (ret) {
2394 mlog_errno(ret);
2395 goto out;
2396 }
2397
2398 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
2399
2400 mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
2401 "in the rec is %u\n", num_clusters, p_blkno, first_hash);
2402
2403 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
2404 p_blkno, first_hash, num_clusters, xs);
2405
2406out:
2407 return ret;
2408}
2409
2410static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2411 u64 blkno,
2412 u32 clusters,
2413 xattr_bucket_func *func,
2414 void *para)
2415{
2416 int i, j, ret = 0;
2417 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2418 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2419 u32 num_buckets = clusters * bpc;
2420 struct ocfs2_xattr_bucket bucket;
2421
2422 memset(&bucket, 0, sizeof(bucket));
2423
2424 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2425 clusters, blkno);
2426
2427 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
2428 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
2429 bucket.bhs, 0);
2430 if (ret) {
2431 mlog_errno(ret);
2432 goto out;
2433 }
2434
2435 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2436 /*
2437 * The real bucket num in this series of blocks is stored
2438 * in the 1st bucket.
2439 */
2440 if (i == 0)
2441 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
2442
2443 mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
2444 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
2445 if (func) {
2446 ret = func(inode, &bucket, para);
2447 if (ret) {
2448 mlog_errno(ret);
2449 break;
2450 }
2451 }
2452
2453 for (j = 0; j < blk_per_bucket; j++)
2454 brelse(bucket.bhs[j]);
2455 memset(&bucket, 0, sizeof(bucket));
2456 }
2457
2458out:
2459 for (j = 0; j < blk_per_bucket; j++)
2460 brelse(bucket.bhs[j]);
2461
2462 return ret;
2463}
2464
2465struct ocfs2_xattr_tree_list {
2466 char *buffer;
2467 size_t buffer_size;
2468 size_t result;
2469};
2470
2471static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
2472 struct ocfs2_xattr_header *xh,
2473 int index,
2474 int *block_off,
2475 int *new_offset)
2476{
2477 u16 name_offset;
2478
2479 if (index < 0 || index >= le16_to_cpu(xh->xh_count))
2480 return -EINVAL;
2481
2482 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
2483
2484 *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
2485 *new_offset = name_offset % inode->i_sb->s_blocksize;
2486
2487 return 0;
2488}
2489
2490static int ocfs2_list_xattr_bucket(struct inode *inode,
2491 struct ocfs2_xattr_bucket *bucket,
2492 void *para)
2493{
2494 int ret = 0, type;
2495 struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
2496 int i, block_off, new_offset;
2497 const char *prefix, *name;
2498
2499 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
2500 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
2501 type = ocfs2_xattr_get_type(entry);
2502 prefix = ocfs2_xattr_prefix(type);
2503
2504 if (prefix) {
2505 ret = ocfs2_xattr_bucket_get_name_value(inode,
2506 bucket->xh,
2507 i,
2508 &block_off,
2509 &new_offset);
2510 if (ret)
2511 break;
2512
2513 name = (const char *)bucket->bhs[block_off]->b_data +
2514 new_offset;
2515 ret = ocfs2_xattr_list_entry(xl->buffer,
2516 xl->buffer_size,
2517 &xl->result,
2518 prefix, name,
2519 entry->xe_name_len);
2520 if (ret)
2521 break;
2522 }
2523 }
2524
2525 return ret;
2526}
2527
2528static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
2529 struct ocfs2_xattr_tree_root *xt,
2530 char *buffer,
2531 size_t buffer_size)
2532{
2533 struct ocfs2_extent_list *el = &xt->xt_list;
2534 int ret = 0;
2535 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
2536 u64 p_blkno = 0;
2537 struct ocfs2_xattr_tree_list xl = {
2538 .buffer = buffer,
2539 .buffer_size = buffer_size,
2540 .result = 0,
2541 };
2542
2543 if (le16_to_cpu(el->l_next_free_rec) == 0)
2544 return 0;
2545
2546 while (name_hash > 0) {
2547 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
2548 &e_cpos, &num_clusters, el);
2549 if (ret) {
2550 mlog_errno(ret);
2551 goto out;
2552 }
2553
2554 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
2555 ocfs2_list_xattr_bucket,
2556 &xl);
2557 if (ret) {
2558 mlog_errno(ret);
2559 goto out;
2560 }
2561
2562 if (e_cpos == 0)
2563 break;
2564
2565 name_hash = e_cpos - 1;
2566 }
2567
2568 ret = xl.result;
2569out:
2570 return ret;
2571}
2572
2573static int cmp_xe(const void *a, const void *b)
2574{
2575 const struct ocfs2_xattr_entry *l = a, *r = b;
2576 u32 l_hash = le32_to_cpu(l->xe_name_hash);
2577 u32 r_hash = le32_to_cpu(r->xe_name_hash);
2578
2579 if (l_hash > r_hash)
2580 return 1;
2581 if (l_hash < r_hash)
2582 return -1;
2583 return 0;
2584}
2585
2586static void swap_xe(void *a, void *b, int size)
2587{
2588 struct ocfs2_xattr_entry *l = a, *r = b, tmp;
2589
2590 tmp = *l;
2591 memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
2592 memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
2593}
2594
2595/*
2596 * When the ocfs2_xattr_block is filled up, new bucket will be created
2597 * and all the xattr entries will be moved to the new bucket.
2598 * Note: we need to sort the entries since they are not saved in order
2599 * in the ocfs2_xattr_block.
2600 */
2601static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2602 struct buffer_head *xb_bh,
2603 struct buffer_head *xh_bh,
2604 struct buffer_head *data_bh)
2605{
2606 int i, blocksize = inode->i_sb->s_blocksize;
2607 u16 offset, size, off_change;
2608 struct ocfs2_xattr_entry *xe;
2609 struct ocfs2_xattr_block *xb =
2610 (struct ocfs2_xattr_block *)xb_bh->b_data;
2611 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2612 struct ocfs2_xattr_header *xh =
2613 (struct ocfs2_xattr_header *)xh_bh->b_data;
2614 u16 count = le16_to_cpu(xb_xh->xh_count);
2615 char *target = xh_bh->b_data, *src = xb_bh->b_data;
2616
2617 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2618 (unsigned long long)xb_bh->b_blocknr,
2619 (unsigned long long)xh_bh->b_blocknr);
2620
2621 memset(xh_bh->b_data, 0, blocksize);
2622 if (data_bh)
2623 memset(data_bh->b_data, 0, blocksize);
2624 /*
2625 * Since the xe_name_offset is based on ocfs2_xattr_header,
2626 * there is a offset change corresponding to the change of
2627 * ocfs2_xattr_header's position.
2628 */
2629 off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2630 xe = &xb_xh->xh_entries[count - 1];
2631 offset = le16_to_cpu(xe->xe_name_offset) + off_change;
2632 size = blocksize - offset;
2633
2634 /* copy all the names and values. */
2635 if (data_bh)
2636 target = data_bh->b_data;
2637 memcpy(target + offset, src + offset, size);
2638
2639 /* Init new header now. */
2640 xh->xh_count = xb_xh->xh_count;
2641 xh->xh_num_buckets = cpu_to_le16(1);
2642 xh->xh_name_value_len = cpu_to_le16(size);
2643 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2644
2645 /* copy all the entries. */
2646 target = xh_bh->b_data;
2647 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2648 size = count * sizeof(struct ocfs2_xattr_entry);
2649 memcpy(target + offset, (char *)xb_xh + offset, size);
2650
2651 /* Change the xe offset for all the xe because of the move. */
2652 off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
2653 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2654 for (i = 0; i < count; i++)
2655 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
2656
2657 mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
2658 offset, size, off_change);
2659
2660 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
2661 cmp_xe, swap_xe);
2662}
2663
2664/*
2665 * After we move xattr from block to index btree, we have to
2666 * update ocfs2_xattr_search to the new xe and base.
2667 *
2668 * When the entry is in xattr block, xattr_bh indicates the storage place.
2669 * While if the entry is in index b-tree, "bucket" indicates the
2670 * real place of the xattr.
2671 */
2672static int ocfs2_xattr_update_xattr_search(struct inode *inode,
2673 struct ocfs2_xattr_search *xs,
2674 struct buffer_head *old_bh,
2675 struct buffer_head *new_bh)
2676{
2677 int ret = 0;
2678 char *buf = old_bh->b_data;
2679 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2680 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2681 int i, blocksize = inode->i_sb->s_blocksize;
2682 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2683
2684 xs->bucket.bhs[0] = new_bh;
2685 get_bh(new_bh);
2686 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2687 xs->header = xs->bucket.xh;
2688
2689 xs->base = new_bh->b_data;
2690 xs->end = xs->base + inode->i_sb->s_blocksize;
2691
2692 if (!xs->not_found) {
2693 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
2694 ret = ocfs2_read_blocks(inode,
2695 xs->bucket.bhs[0]->b_blocknr + 1,
2696 blk_per_bucket - 1, &xs->bucket.bhs[1],
2697 0);
2698 if (ret) {
2699 mlog_errno(ret);
2700 return ret;
2701 }
2702
2703 i = xs->here - old_xh->xh_entries;
2704 xs->here = &xs->header->xh_entries[i];
2705 }
2706 }
2707
2708 return ret;
2709}
2710
2711static int ocfs2_xattr_create_index_block(struct inode *inode,
2712 struct ocfs2_xattr_search *xs)
2713{
2714 int ret, credits = OCFS2_SUBALLOC_ALLOC;
2715 u32 bit_off, len;
2716 u64 blkno;
2717 handle_t *handle;
2718 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2720 struct ocfs2_alloc_context *data_ac;
2721 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2722 struct buffer_head *xb_bh = xs->xattr_bh;
2723 struct ocfs2_xattr_block *xb =
2724 (struct ocfs2_xattr_block *)xb_bh->b_data;
2725 struct ocfs2_xattr_tree_root *xr;
2726 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2727 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2728
2729 mlog(0, "create xattr index block for %llu\n",
2730 (unsigned long long)xb_bh->b_blocknr);
2731
2732 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2733
2734 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /*
2741 * XXX:
2742 * We can use this lock for now, and maybe move to a dedicated mutex
2743 * if performance becomes a problem later.
2744 */
2745 down_write(&oi->ip_alloc_sem);
2746
2747 /*
2748 * 3 more credits, one for xattr block update, one for the 1st block
2749 * of the new xattr bucket and one for the value/data.
2750 */
2751 credits += 3;
2752 handle = ocfs2_start_trans(osb, credits);
2753 if (IS_ERR(handle)) {
2754 ret = PTR_ERR(handle);
2755 mlog_errno(ret);
2756 goto out_sem;
2757 }
2758
2759 ret = ocfs2_journal_access(handle, inode, xb_bh,
2760 OCFS2_JOURNAL_ACCESS_WRITE);
2761 if (ret) {
2762 mlog_errno(ret);
2763 goto out_commit;
2764 }
2765
2766 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
2767 if (ret) {
2768 mlog_errno(ret);
2769 goto out_commit;
2770 }
2771
2772 /*
2773 * The bucket may spread in many blocks, and
2774 * we will only touch the 1st block and the last block
2775 * in the whole bucket(one for entry and one for data).
2776 */
2777 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
2778
2779 mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
2780
2781 xh_bh = sb_getblk(inode->i_sb, blkno);
2782 if (!xh_bh) {
2783 ret = -EIO;
2784 mlog_errno(ret);
2785 goto out_commit;
2786 }
2787
2788 ocfs2_set_new_buffer_uptodate(inode, xh_bh);
2789
2790 ret = ocfs2_journal_access(handle, inode, xh_bh,
2791 OCFS2_JOURNAL_ACCESS_CREATE);
2792 if (ret) {
2793 mlog_errno(ret);
2794 goto out_commit;
2795 }
2796
2797 if (bpb > 1) {
2798 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2799 if (!data_bh) {
2800 ret = -EIO;
2801 mlog_errno(ret);
2802 goto out_commit;
2803 }
2804
2805 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2806
2807 ret = ocfs2_journal_access(handle, inode, data_bh,
2808 OCFS2_JOURNAL_ACCESS_CREATE);
2809 if (ret) {
2810 mlog_errno(ret);
2811 goto out_commit;
2812 }
2813 }
2814
2815 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
2816
2817 ocfs2_journal_dirty(handle, xh_bh);
2818 if (data_bh)
2819 ocfs2_journal_dirty(handle, data_bh);
2820
2821 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2822
2823 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2824 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
2825 offsetof(struct ocfs2_xattr_block, xb_attrs));
2826
2827 xr = &xb->xb_attrs.xb_root;
2828 xr->xt_clusters = cpu_to_le32(1);
2829 xr->xt_last_eb_blk = 0;
2830 xr->xt_list.l_tree_depth = 0;
2831 xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
2832 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2833
2834 xr->xt_list.l_recs[0].e_cpos = 0;
2835 xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
2836 xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
2837
2838 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2839
2840 ret = ocfs2_journal_dirty(handle, xb_bh);
2841 if (ret) {
2842 mlog_errno(ret);
2843 goto out_commit;
2844 }
2845
2846out_commit:
2847 ocfs2_commit_trans(osb, handle);
2848
2849out_sem:
2850 up_write(&oi->ip_alloc_sem);
2851
2852out:
2853 if (data_ac)
2854 ocfs2_free_alloc_context(data_ac);
2855
2856 brelse(xh_bh);
2857 brelse(data_bh);
2858
2859 return ret;
2860}
2861
2862static int cmp_xe_offset(const void *a, const void *b)
2863{
2864 const struct ocfs2_xattr_entry *l = a, *r = b;
2865 u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
2866 u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
2867
2868 if (l_name_offset < r_name_offset)
2869 return 1;
2870 if (l_name_offset > r_name_offset)
2871 return -1;
2872 return 0;
2873}
2874
2875/*
2876 * defrag a xattr bucket if we find that the bucket has some
2877 * holes beteen name/value pairs.
2878 * We will move all the name/value pairs to the end of the bucket
2879 * so that we can spare some space for insertion.
2880 */
2881static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2882 struct ocfs2_xattr_bucket *bucket)
2883{
2884 int ret, i;
2885 size_t end, offset, len, value_len;
2886 struct ocfs2_xattr_header *xh;
2887 char *entries, *buf, *bucket_buf = NULL;
2888 u64 blkno = bucket->bhs[0]->b_blocknr;
2889 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2890 u16 xh_free_start;
2891 size_t blocksize = inode->i_sb->s_blocksize;
2892 handle_t *handle;
2893 struct buffer_head **bhs;
2894 struct ocfs2_xattr_entry *xe;
2895
2896 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2897 GFP_NOFS);
2898 if (!bhs)
2899 return -ENOMEM;
2900
2901 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
2902 if (ret)
2903 goto out;
2904
2905 /*
2906 * In order to make the operation more efficient and generic,
2907 * we copy all the blocks into a contiguous memory and do the
2908 * defragment there, so if anything is error, we will not touch
2909 * the real block.
2910 */
2911 bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
2912 if (!bucket_buf) {
2913 ret = -EIO;
2914 goto out;
2915 }
2916
2917 buf = bucket_buf;
2918 for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
2919 memcpy(buf, bhs[i]->b_data, blocksize);
2920
2921 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
2922 if (IS_ERR(handle)) {
2923 ret = PTR_ERR(handle);
2924 handle = NULL;
2925 mlog_errno(ret);
2926 goto out;
2927 }
2928
2929 for (i = 0; i < blk_per_bucket; i++) {
2930 ret = ocfs2_journal_access(handle, inode, bhs[i],
2931 OCFS2_JOURNAL_ACCESS_WRITE);
2932 if (ret < 0) {
2933 mlog_errno(ret);
2934 goto commit;
2935 }
2936 }
2937
2938 xh = (struct ocfs2_xattr_header *)bucket_buf;
2939 entries = (char *)xh->xh_entries;
2940 xh_free_start = le16_to_cpu(xh->xh_free_start);
2941
2942 mlog(0, "adjust xattr bucket in %llu, count = %u, "
2943 "xh_free_start = %u, xh_name_value_len = %u.\n",
2944 blkno, le16_to_cpu(xh->xh_count), xh_free_start,
2945 le16_to_cpu(xh->xh_name_value_len));
2946
2947 /*
2948 * sort all the entries by their offset.
2949 * the largest will be the first, so that we can
2950 * move them to the end one by one.
2951 */
2952 sort(entries, le16_to_cpu(xh->xh_count),
2953 sizeof(struct ocfs2_xattr_entry),
2954 cmp_xe_offset, swap_xe);
2955
2956 /* Move all name/values to the end of the bucket. */
2957 xe = xh->xh_entries;
2958 end = OCFS2_XATTR_BUCKET_SIZE;
2959 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
2960 offset = le16_to_cpu(xe->xe_name_offset);
2961 if (ocfs2_xattr_is_local(xe))
2962 value_len = OCFS2_XATTR_SIZE(
2963 le64_to_cpu(xe->xe_value_size));
2964 else
2965 value_len = OCFS2_XATTR_ROOT_SIZE;
2966 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
2967
2968 /*
2969 * We must make sure that the name/value pair
2970 * exist in the same block. So adjust end to
2971 * the previous block end if needed.
2972 */
2973 if (((end - len) / blocksize !=
2974 (end - 1) / blocksize))
2975 end = end - end % blocksize;
2976
2977 if (end > offset + len) {
2978 memmove(bucket_buf + end - len,
2979 bucket_buf + offset, len);
2980 xe->xe_name_offset = cpu_to_le16(end - len);
2981 }
2982
2983 mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
2984 "bucket %llu\n", (unsigned long long)blkno);
2985
2986 end -= len;
2987 }
2988
2989 mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
2990 "bucket %llu\n", (unsigned long long)blkno);
2991
2992 if (xh_free_start == end)
2993 goto commit;
2994
2995 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2996 xh->xh_free_start = cpu_to_le16(end);
2997
2998 /* sort the entries by their name_hash. */
2999 sort(entries, le16_to_cpu(xh->xh_count),
3000 sizeof(struct ocfs2_xattr_entry),
3001 cmp_xe, swap_xe);
3002
3003 buf = bucket_buf;
3004 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
3005 memcpy(bhs[i]->b_data, buf, blocksize);
3006 ocfs2_journal_dirty(handle, bhs[i]);
3007 }
3008
3009commit:
3010 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3011out:
3012
3013 if (bhs) {
3014 for (i = 0; i < blk_per_bucket; i++)
3015 brelse(bhs[i]);
3016 }
3017 kfree(bhs);
3018
3019 kfree(bucket_buf);
3020 return ret;
3021}
3022
3023/*
3024 * Move half nums of the xattr bucket in the previous cluster to this new
3025 * cluster. We only touch the last cluster of the previous extend record.
3026 *
3027 * first_bh is the first buffer_head of a series of bucket in the same
3028 * extent rec and header_bh is the header of one bucket in this cluster.
3029 * They will be updated if we move the data header_bh contains to the new
3030 * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
3031 */
3032static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
3033 handle_t *handle,
3034 struct buffer_head **first_bh,
3035 struct buffer_head **header_bh,
3036 u64 new_blkno,
3037 u64 prev_blkno,
3038 u32 num_clusters,
3039 u32 *first_hash)
3040{
3041 int i, ret, credits;
3042 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3043 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3044 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3045 int blocksize = inode->i_sb->s_blocksize;
3046 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
3047 struct ocfs2_xattr_header *new_xh;
3048 struct ocfs2_xattr_header *xh =
3049 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3050
3051 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3052 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3053
3054 prev_bh = *first_bh;
3055 get_bh(prev_bh);
3056 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3057
3058 prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
3059
3060 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3061 prev_blkno, new_blkno);
3062
3063 /*
3064 * We need to update the 1st half of the new cluster and
3065 * 1 more for the update of the 1st bucket of the previous
3066 * extent record.
3067 */
3068 credits = bpc / 2 + 1;
3069 ret = ocfs2_extend_trans(handle, credits);
3070 if (ret) {
3071 mlog_errno(ret);
3072 goto out;
3073 }
3074
3075 ret = ocfs2_journal_access(handle, inode, prev_bh,
3076 OCFS2_JOURNAL_ACCESS_WRITE);
3077 if (ret) {
3078 mlog_errno(ret);
3079 goto out;
3080 }
3081
3082 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
3083 old_bh = new_bh = NULL;
3084 new_bh = sb_getblk(inode->i_sb, new_blkno);
3085 if (!new_bh) {
3086 ret = -EIO;
3087 mlog_errno(ret);
3088 goto out;
3089 }
3090
3091 ocfs2_set_new_buffer_uptodate(inode, new_bh);
3092
3093 ret = ocfs2_journal_access(handle, inode, new_bh,
3094 OCFS2_JOURNAL_ACCESS_CREATE);
3095 if (ret < 0) {
3096 mlog_errno(ret);
3097 brelse(new_bh);
3098 goto out;
3099 }
3100
3101 ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
3102 if (ret < 0) {
3103 mlog_errno(ret);
3104 brelse(new_bh);
3105 goto out;
3106 }
3107
3108 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3109
3110 if (i == 0) {
3111 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3112 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3113
3114 if (first_hash)
3115 *first_hash = le32_to_cpu(
3116 new_xh->xh_entries[0].xe_name_hash);
3117 new_first_bh = new_bh;
3118 get_bh(new_first_bh);
3119 }
3120
3121 ocfs2_journal_dirty(handle, new_bh);
3122
3123 if (*header_bh == old_bh) {
3124 brelse(*header_bh);
3125 *header_bh = new_bh;
3126 get_bh(*header_bh);
3127
3128 brelse(*first_bh);
3129 *first_bh = new_first_bh;
3130 get_bh(*first_bh);
3131 }
3132 brelse(new_bh);
3133 brelse(old_bh);
3134 }
3135
3136 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3137
3138 ocfs2_journal_dirty(handle, prev_bh);
3139out:
3140 brelse(prev_bh);
3141 brelse(new_first_bh);
3142 return ret;
3143}
3144
3145static int ocfs2_read_xattr_bucket(struct inode *inode,
3146 u64 blkno,
3147 struct buffer_head **bhs,
3148 int new)
3149{
3150 int ret = 0;
3151 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3152
3153 if (!new)
3154 return ocfs2_read_blocks(inode, blkno,
3155 blk_per_bucket, bhs, 0);
3156
3157 for (i = 0; i < blk_per_bucket; i++) {
3158 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3159 if (bhs[i] == NULL) {
3160 ret = -EIO;
3161 mlog_errno(ret);
3162 break;
3163 }
3164 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3165 }
3166
3167 return ret;
3168}
3169
3170/*
3171 * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
3172 * first_hash will record the 1st hash of the new bucket.
3173 */
3174static int ocfs2_half_xattr_bucket(struct inode *inode,
3175 handle_t *handle,
3176 u64 blk,
3177 u64 new_blk,
3178 u32 *first_hash,
3179 int new_bucket_head)
3180{
3181 int ret, i;
3182 u16 count, start, len, name_value_len, xe_len, name_offset;
3183 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3184 struct buffer_head **s_bhs, **t_bhs = NULL;
3185 struct ocfs2_xattr_header *xh;
3186 struct ocfs2_xattr_entry *xe;
3187 int blocksize = inode->i_sb->s_blocksize;
3188
3189 mlog(0, "move half of xattrs from bucket %llu to %llu\n",
3190 blk, new_blk);
3191
3192 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3193 if (!s_bhs)
3194 return -ENOMEM;
3195
3196 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3197 if (ret) {
3198 mlog_errno(ret);
3199 goto out;
3200 }
3201
3202 ret = ocfs2_journal_access(handle, inode, s_bhs[0],
3203 OCFS2_JOURNAL_ACCESS_WRITE);
3204 if (ret) {
3205 mlog_errno(ret);
3206 goto out;
3207 }
3208
3209 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3210 if (!t_bhs) {
3211 ret = -ENOMEM;
3212 goto out;
3213 }
3214
3215 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
3216 if (ret) {
3217 mlog_errno(ret);
3218 goto out;
3219 }
3220
3221 for (i = 0; i < blk_per_bucket; i++) {
3222 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3223 OCFS2_JOURNAL_ACCESS_CREATE);
3224 if (ret) {
3225 mlog_errno(ret);
3226 goto out;
3227 }
3228 }
3229
3230 /* copy the whole bucket to the new first. */
3231 for (i = 0; i < blk_per_bucket; i++)
3232 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3233
3234 /* update the new bucket. */
3235 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
3236 count = le16_to_cpu(xh->xh_count);
3237 start = count / 2;
3238
3239 /*
3240 * Calculate the total name/value len and xh_free_start for
3241 * the old bucket first.
3242 */
3243 name_offset = OCFS2_XATTR_BUCKET_SIZE;
3244 name_value_len = 0;
3245 for (i = 0; i < start; i++) {
3246 xe = &xh->xh_entries[i];
3247 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3248 if (ocfs2_xattr_is_local(xe))
3249 xe_len +=
3250 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3251 else
3252 xe_len += OCFS2_XATTR_ROOT_SIZE;
3253 name_value_len += xe_len;
3254 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
3255 name_offset = le16_to_cpu(xe->xe_name_offset);
3256 }
3257
3258 /*
3259 * Now begin the modification to the new bucket.
3260 *
3261 * In the new bucket, We just move the xattr entry to the beginning
3262 * and don't touch the name/value. So there will be some holes in the
3263 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
3264 * called.
3265 */
3266 xe = &xh->xh_entries[start];
3267 len = sizeof(struct ocfs2_xattr_entry) * (count - start);
3268 mlog(0, "mv xattr entry len %d from %d to %d\n", len,
3269 (int)((char *)xe - (char *)xh),
3270 (int)((char *)xh->xh_entries - (char *)xh));
3271 memmove((char *)xh->xh_entries, (char *)xe, len);
3272 xe = &xh->xh_entries[count - start];
3273 len = sizeof(struct ocfs2_xattr_entry) * start;
3274 memset((char *)xe, 0, len);
3275
3276 le16_add_cpu(&xh->xh_count, -start);
3277 le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
3278
3279 /* Calculate xh_free_start for the new bucket. */
3280 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3281 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
3282 xe = &xh->xh_entries[i];
3283 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3284 if (ocfs2_xattr_is_local(xe))
3285 xe_len +=
3286 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3287 else
3288 xe_len += OCFS2_XATTR_ROOT_SIZE;
3289 if (le16_to_cpu(xe->xe_name_offset) <
3290 le16_to_cpu(xh->xh_free_start))
3291 xh->xh_free_start = xe->xe_name_offset;
3292 }
3293
3294 /* set xh->xh_num_buckets for the new xh. */
3295 if (new_bucket_head)
3296 xh->xh_num_buckets = cpu_to_le16(1);
3297 else
3298 xh->xh_num_buckets = 0;
3299
3300 for (i = 0; i < blk_per_bucket; i++) {
3301 ocfs2_journal_dirty(handle, t_bhs[i]);
3302 if (ret)
3303 mlog_errno(ret);
3304 }
3305
3306 /* store the first_hash of the new bucket. */
3307 if (first_hash)
3308 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3309
3310 /*
3311 * Now only update the 1st block of the old bucket.
3312 * Please note that the entry has been sorted already above.
3313 */
3314 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
3315 memset(&xh->xh_entries[start], 0,
3316 sizeof(struct ocfs2_xattr_entry) * (count - start));
3317 xh->xh_count = cpu_to_le16(start);
3318 xh->xh_free_start = cpu_to_le16(name_offset);
3319 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3320
3321 ocfs2_journal_dirty(handle, s_bhs[0]);
3322 if (ret)
3323 mlog_errno(ret);
3324
3325out:
3326 if (s_bhs) {
3327 for (i = 0; i < blk_per_bucket; i++)
3328 brelse(s_bhs[i]);
3329 }
3330 kfree(s_bhs);
3331
3332 if (t_bhs) {
3333 for (i = 0; i < blk_per_bucket; i++)
3334 brelse(t_bhs[i]);
3335 }
3336 kfree(t_bhs);
3337
3338 return ret;
3339}
3340
3341/*
3342 * Copy xattr from one bucket to another bucket.
3343 *
3344 * The caller must make sure that the journal transaction
3345 * has enough space for journaling.
3346 */
3347static int ocfs2_cp_xattr_bucket(struct inode *inode,
3348 handle_t *handle,
3349 u64 s_blkno,
3350 u64 t_blkno,
3351 int t_is_new)
3352{
3353 int ret, i;
3354 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3355 int blocksize = inode->i_sb->s_blocksize;
3356 struct buffer_head **s_bhs, **t_bhs = NULL;
3357
3358 BUG_ON(s_blkno == t_blkno);
3359
3360 mlog(0, "cp bucket %llu to %llu, target is %d\n",
3361 s_blkno, t_blkno, t_is_new);
3362
3363 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3364 GFP_NOFS);
3365 if (!s_bhs)
3366 return -ENOMEM;
3367
3368 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
3369 if (ret)
3370 goto out;
3371
3372 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3373 GFP_NOFS);
3374 if (!t_bhs) {
3375 ret = -ENOMEM;
3376 goto out;
3377 }
3378
3379 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
3380 if (ret)
3381 goto out;
3382
3383 for (i = 0; i < blk_per_bucket; i++) {
3384 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3385 OCFS2_JOURNAL_ACCESS_WRITE);
3386 if (ret)
3387 goto out;
3388 }
3389
3390 for (i = 0; i < blk_per_bucket; i++) {
3391 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3392 ocfs2_journal_dirty(handle, t_bhs[i]);
3393 }
3394
3395out:
3396 if (s_bhs) {
3397 for (i = 0; i < blk_per_bucket; i++)
3398 brelse(s_bhs[i]);
3399 }
3400 kfree(s_bhs);
3401
3402 if (t_bhs) {
3403 for (i = 0; i < blk_per_bucket; i++)
3404 brelse(t_bhs[i]);
3405 }
3406 kfree(t_bhs);
3407
3408 return ret;
3409}
3410
3411/*
3412 * Copy one xattr cluster from src_blk to to_blk.
3413 * The to_blk will become the first bucket header of the cluster, so its
3414 * xh_num_buckets will be initialized as the bucket num in the cluster.
3415 */
3416static int ocfs2_cp_xattr_cluster(struct inode *inode,
3417 handle_t *handle,
3418 struct buffer_head *first_bh,
3419 u64 src_blk,
3420 u64 to_blk,
3421 u32 *first_hash)
3422{
3423 int i, ret, credits;
3424 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3425 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3426 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3427 struct buffer_head *bh = NULL;
3428 struct ocfs2_xattr_header *xh;
3429 u64 to_blk_start = to_blk;
3430
3431 mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
3432
3433 /*
3434 * We need to update the new cluster and 1 more for the update of
3435 * the 1st bucket of the previous extent rec.
3436 */
3437 credits = bpc + 1;
3438 ret = ocfs2_extend_trans(handle, credits);
3439 if (ret) {
3440 mlog_errno(ret);
3441 goto out;
3442 }
3443
3444 ret = ocfs2_journal_access(handle, inode, first_bh,
3445 OCFS2_JOURNAL_ACCESS_WRITE);
3446 if (ret) {
3447 mlog_errno(ret);
3448 goto out;
3449 }
3450
3451 for (i = 0; i < num_buckets; i++) {
3452 ret = ocfs2_cp_xattr_bucket(inode, handle,
3453 src_blk, to_blk, 1);
3454 if (ret) {
3455 mlog_errno(ret);
3456 goto out;
3457 }
3458
3459 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3460 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3461 }
3462
3463 /* update the old bucket header. */
3464 xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3465 le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
3466
3467 ocfs2_journal_dirty(handle, first_bh);
3468
3469 /* update the new bucket header. */
3470 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3471 if (ret < 0) {
3472 mlog_errno(ret);
3473 goto out;
3474 }
3475
3476 ret = ocfs2_journal_access(handle, inode, bh,
3477 OCFS2_JOURNAL_ACCESS_WRITE);
3478 if (ret) {
3479 mlog_errno(ret);
3480 goto out;
3481 }
3482
3483 xh = (struct ocfs2_xattr_header *)bh->b_data;
3484 xh->xh_num_buckets = cpu_to_le16(num_buckets);
3485
3486 ocfs2_journal_dirty(handle, bh);
3487
3488 if (first_hash)
3489 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3490out:
3491 brelse(bh);
3492 return ret;
3493}
3494
3495/*
3496 * Move half of the xattrs in this cluster to the new cluster.
3497 * This function should only be called when bucket size == cluster size.
3498 * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
3499 */
3500static int ocfs2_half_xattr_cluster(struct inode *inode,
3501 handle_t *handle,
3502 u64 prev_blk,
3503 u64 new_blk,
3504 u32 *first_hash)
3505{
3506 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3507 int ret, credits = 2 * blk_per_bucket;
3508
3509 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3510
3511 ret = ocfs2_extend_trans(handle, credits);
3512 if (ret) {
3513 mlog_errno(ret);
3514 return ret;
3515 }
3516
3517 /* Move half of the xattr in start_blk to the next bucket. */
3518 return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
3519 new_blk, first_hash, 1);
3520}
3521
3522/*
3523 * Move some xattrs from the old cluster to the new one since they are not
3524 * contiguous in ocfs2 xattr tree.
3525 *
3526 * new_blk starts a new separate cluster, and we will move some xattrs from
3527 * prev_blk to it. v_start will be set as the first name hash value in this
3528 * new cluster so that it can be used as e_cpos during tree insertion and
3529 * don't collide with our original b-tree operations. first_bh and header_bh
3530 * will also be updated since they will be used in ocfs2_extend_xattr_bucket
3531 * to extend the insert bucket.
3532 *
3533 * The problem is how much xattr should we move to the new one and when should
3534 * we update first_bh and header_bh?
3535 * 1. If cluster size > bucket size, that means the previous cluster has more
3536 * than 1 bucket, so just move half nums of bucket into the new cluster and
3537 * update the first_bh and header_bh if the insert bucket has been moved
3538 * to the new cluster.
3539 * 2. If cluster_size == bucket_size:
3540 * a) If the previous extent rec has more than one cluster and the insert
3541 * place isn't in the last cluster, copy the entire last cluster to the
3542 * new one. This time, we don't need to upate the first_bh and header_bh
3543 * since they will not be moved into the new cluster.
3544 * b) Otherwise, move the bottom half of the xattrs in the last cluster into
3545 * the new one. And we set the extend flag to zero if the insert place is
3546 * moved into the new allocated cluster since no extend is needed.
3547 */
3548static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3549 handle_t *handle,
3550 struct buffer_head **first_bh,
3551 struct buffer_head **header_bh,
3552 u64 new_blk,
3553 u64 prev_blk,
3554 u32 prev_clusters,
3555 u32 *v_start,
3556 int *extend)
3557{
3558 int ret = 0;
3559 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3560
3561 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3562 prev_blk, prev_clusters, new_blk);
3563
3564 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
3565 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3566 handle,
3567 first_bh,
3568 header_bh,
3569 new_blk,
3570 prev_blk,
3571 prev_clusters,
3572 v_start);
3573 else {
3574 u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
3575
3576 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
3577 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
3578 last_blk, new_blk,
3579 v_start);
3580 else {
3581 ret = ocfs2_half_xattr_cluster(inode, handle,
3582 last_blk, new_blk,
3583 v_start);
3584
3585 if ((*header_bh)->b_blocknr == last_blk && extend)
3586 *extend = 0;
3587 }
3588 }
3589
3590 return ret;
3591}
3592
3593/*
3594 * Add a new cluster for xattr storage.
3595 *
3596 * If the new cluster is contiguous with the previous one, it will be
3597 * appended to the same extent record, and num_clusters will be updated.
3598 * If not, we will insert a new extent for it and move some xattrs in
3599 * the last cluster into the new allocated one.
3600 * We also need to limit the maximum size of a btree leaf, otherwise we'll
3601 * lose the benefits of hashing because we'll have to search large leaves.
3602 * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
3603 * if it's bigger).
3604 *
3605 * first_bh is the first block of the previous extent rec and header_bh
3606 * indicates the bucket we will insert the new xattrs. They will be updated
3607 * when the header_bh is moved into the new cluster.
3608 */
3609static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3610 struct buffer_head *root_bh,
3611 struct buffer_head **first_bh,
3612 struct buffer_head **header_bh,
3613 u32 *num_clusters,
3614 u32 prev_cpos,
3615 u64 prev_blkno,
3616 int *extend)
3617{
3618 int ret, credits;
3619 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3620 u32 prev_clusters = *num_clusters;
3621 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3622 u64 block;
3623 handle_t *handle = NULL;
3624 struct ocfs2_alloc_context *data_ac = NULL;
3625 struct ocfs2_alloc_context *meta_ac = NULL;
3626 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3627 struct ocfs2_extent_tree et;
3628
3629 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3630 "previous xattr blkno = %llu\n",
3631 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3632 prev_cpos, prev_blkno);
3633
3634 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3635
3636 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
3637 &data_ac, &meta_ac);
3638 if (ret) {
3639 mlog_errno(ret);
3640 goto leave;
3641 }
3642
3643 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3644 clusters_to_add);
3645 handle = ocfs2_start_trans(osb, credits);
3646 if (IS_ERR(handle)) {
3647 ret = PTR_ERR(handle);
3648 handle = NULL;
3649 mlog_errno(ret);
3650 goto leave;
3651 }
3652
3653 ret = ocfs2_journal_access(handle, inode, root_bh,
3654 OCFS2_JOURNAL_ACCESS_WRITE);
3655 if (ret < 0) {
3656 mlog_errno(ret);
3657 goto leave;
3658 }
3659
3660 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
3661 clusters_to_add, &bit_off, &num_bits);
3662 if (ret < 0) {
3663 if (ret != -ENOSPC)
3664 mlog_errno(ret);
3665 goto leave;
3666 }
3667
3668 BUG_ON(num_bits > clusters_to_add);
3669
3670 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
3671 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3672 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3673
3674 if (prev_blkno + prev_clusters * bpc == block &&
3675 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3676 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3677 /*
3678 * If this cluster is contiguous with the old one and
3679 * adding this new cluster, we don't surpass the limit of
3680 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
3681 * initialized and used like other buckets in the previous
3682 * cluster.
3683 * So add it as a contiguous one. The caller will handle
3684 * its init process.
3685 */
3686 v_start = prev_cpos + prev_clusters;
3687 *num_clusters = prev_clusters + num_bits;
3688 mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
3689 num_bits);
3690 } else {
3691 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3692 handle,
3693 first_bh,
3694 header_bh,
3695 block,
3696 prev_blkno,
3697 prev_clusters,
3698 &v_start,
3699 extend);
3700 if (ret) {
3701 mlog_errno(ret);
3702 goto leave;
3703 }
3704 }
3705
3706 if (handle->h_buffer_credits < credits) {
3707 /*
3708 * The journal has been restarted before, and don't
3709 * have enough space for the insertion, so extend it
3710 * here.
3711 */
3712 ret = ocfs2_extend_trans(handle, credits);
3713 if (ret) {
3714 mlog_errno(ret);
3715 goto leave;
3716 }
3717 }
3718 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3719 num_bits, block, v_start);
3720 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3721 num_bits, 0, meta_ac);
3722 if (ret < 0) {
3723 mlog_errno(ret);
3724 goto leave;
3725 }
3726
3727 ret = ocfs2_journal_dirty(handle, root_bh);
3728 if (ret < 0) {
3729 mlog_errno(ret);
3730 goto leave;
3731 }
3732
3733leave:
3734 if (handle)
3735 ocfs2_commit_trans(osb, handle);
3736 if (data_ac)
3737 ocfs2_free_alloc_context(data_ac);
3738 if (meta_ac)
3739 ocfs2_free_alloc_context(meta_ac);
3740
3741 return ret;
3742}
3743
3744/*
3745 * Extend a new xattr bucket and move xattrs to the end one by one until
3746 * We meet with start_bh. Only move half of the xattrs to the bucket after it.
3747 */
3748static int ocfs2_extend_xattr_bucket(struct inode *inode,
3749 struct buffer_head *first_bh,
3750 struct buffer_head *start_bh,
3751 u32 num_clusters)
3752{
3753 int ret, credits;
3754 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3755 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3756 u64 start_blk = start_bh->b_blocknr, end_blk;
3757 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
3758 handle_t *handle;
3759 struct ocfs2_xattr_header *first_xh =
3760 (struct ocfs2_xattr_header *)first_bh->b_data;
3761 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3762
3763 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
3764 "from %llu, len = %u\n", start_blk,
3765 (unsigned long long)first_bh->b_blocknr, num_clusters);
3766
3767 BUG_ON(bucket >= num_buckets);
3768
3769 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
3770
3771 /*
3772 * We will touch all the buckets after the start_bh(include it).
3773 * Add one more bucket and modify the first_bh.
3774 */
3775 credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
3776 handle = ocfs2_start_trans(osb, credits);
3777 if (IS_ERR(handle)) {
3778 ret = PTR_ERR(handle);
3779 handle = NULL;
3780 mlog_errno(ret);
3781 goto out;
3782 }
3783
3784 ret = ocfs2_journal_access(handle, inode, first_bh,
3785 OCFS2_JOURNAL_ACCESS_WRITE);
3786 if (ret) {
3787 mlog_errno(ret);
3788 goto commit;
3789 }
3790
3791 while (end_blk != start_blk) {
3792 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3793 end_blk + blk_per_bucket, 0);
3794 if (ret)
3795 goto commit;
3796 end_blk -= blk_per_bucket;
3797 }
3798
3799 /* Move half of the xattr in start_blk to the next bucket. */
3800 ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
3801 start_blk + blk_per_bucket, NULL, 0);
3802
3803 le16_add_cpu(&first_xh->xh_num_buckets, 1);
3804 ocfs2_journal_dirty(handle, first_bh);
3805
3806commit:
3807 ocfs2_commit_trans(osb, handle);
3808out:
3809 return ret;
3810}
3811
3812/*
3813 * Add new xattr bucket in an extent record and adjust the buckets accordingly.
3814 * xb_bh is the ocfs2_xattr_block.
3815 * We will move all the buckets starting from header_bh to the next place. As
3816 * for this one, half num of its xattrs will be moved to the next one.
3817 *
3818 * We will allocate a new cluster if current cluster is full and adjust
3819 * header_bh and first_bh if the insert place is moved to the new cluster.
3820 */
3821static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3822 struct buffer_head *xb_bh,
3823 struct buffer_head *header_bh)
3824{
3825 struct ocfs2_xattr_header *first_xh = NULL;
3826 struct buffer_head *first_bh = NULL;
3827 struct ocfs2_xattr_block *xb =
3828 (struct ocfs2_xattr_block *)xb_bh->b_data;
3829 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3830 struct ocfs2_extent_list *el = &xb_root->xt_list;
3831 struct ocfs2_xattr_header *xh =
3832 (struct ocfs2_xattr_header *)header_bh->b_data;
3833 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3834 struct super_block *sb = inode->i_sb;
3835 struct ocfs2_super *osb = OCFS2_SB(sb);
3836 int ret, num_buckets, extend = 1;
3837 u64 p_blkno;
3838 u32 e_cpos, num_clusters;
3839
3840 mlog(0, "Add new xattr bucket starting form %llu\n",
3841 (unsigned long long)header_bh->b_blocknr);
3842
3843 /*
3844 * Add refrence for header_bh here because it may be
3845 * changed in ocfs2_add_new_xattr_cluster and we need
3846 * to free it in the end.
3847 */
3848 get_bh(header_bh);
3849
3850 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3851 &num_clusters, el);
3852 if (ret) {
3853 mlog_errno(ret);
3854 goto out;
3855 }
3856
3857 ret = ocfs2_read_block(inode, p_blkno, &first_bh);
3858 if (ret) {
3859 mlog_errno(ret);
3860 goto out;
3861 }
3862
3863 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3864 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3865
3866 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
3867 ret = ocfs2_add_new_xattr_cluster(inode,
3868 xb_bh,
3869 &first_bh,
3870 &header_bh,
3871 &num_clusters,
3872 e_cpos,
3873 p_blkno,
3874 &extend);
3875 if (ret) {
3876 mlog_errno(ret);
3877 goto out;
3878 }
3879 }
3880
3881 if (extend)
3882 ret = ocfs2_extend_xattr_bucket(inode,
3883 first_bh,
3884 header_bh,
3885 num_clusters);
3886 if (ret)
3887 mlog_errno(ret);
3888out:
3889 brelse(first_bh);
3890 brelse(header_bh);
3891 return ret;
3892}
3893
3894static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3895 struct ocfs2_xattr_bucket *bucket,
3896 int offs)
3897{
3898 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3899
3900 offs = offs % inode->i_sb->s_blocksize;
3901 return bucket->bhs[block_off]->b_data + offs;
3902}
3903
3904/*
3905 * Handle the normal xattr set, including replace, delete and new.
3906 *
3907 * Note: "local" indicates the real data's locality. So we can't
3908 * just its bucket locality by its length.
3909 */
3910static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3911 struct ocfs2_xattr_info *xi,
3912 struct ocfs2_xattr_search *xs,
3913 u32 name_hash,
3914 int local)
3915{
3916 struct ocfs2_xattr_entry *last, *xe;
3917 int name_len = strlen(xi->name);
3918 struct ocfs2_xattr_header *xh = xs->header;
3919 u16 count = le16_to_cpu(xh->xh_count), start;
3920 size_t blocksize = inode->i_sb->s_blocksize;
3921 char *val;
3922 size_t offs, size, new_size;
3923
3924 last = &xh->xh_entries[count];
3925 if (!xs->not_found) {
3926 xe = xs->here;
3927 offs = le16_to_cpu(xe->xe_name_offset);
3928 if (ocfs2_xattr_is_local(xe))
3929 size = OCFS2_XATTR_SIZE(name_len) +
3930 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3931 else
3932 size = OCFS2_XATTR_SIZE(name_len) +
3933 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
3934
3935 /*
3936 * If the new value will be stored outside, xi->value has been
3937 * initalized as an empty ocfs2_xattr_value_root, and the same
3938 * goes with xi->value_len, so we can set new_size safely here.
3939 * See ocfs2_xattr_set_in_bucket.
3940 */
3941 new_size = OCFS2_XATTR_SIZE(name_len) +
3942 OCFS2_XATTR_SIZE(xi->value_len);
3943
3944 le16_add_cpu(&xh->xh_name_value_len, -size);
3945 if (xi->value) {
3946 if (new_size > size)
3947 goto set_new_name_value;
3948
3949 /* Now replace the old value with new one. */
3950 if (local)
3951 xe->xe_value_size = cpu_to_le64(xi->value_len);
3952 else
3953 xe->xe_value_size = 0;
3954
3955 val = ocfs2_xattr_bucket_get_val(inode,
3956 &xs->bucket, offs);
3957 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3958 size - OCFS2_XATTR_SIZE(name_len));
3959 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
3960 memcpy(val + OCFS2_XATTR_SIZE(name_len),
3961 xi->value, xi->value_len);
3962
3963 le16_add_cpu(&xh->xh_name_value_len, new_size);
3964 ocfs2_xattr_set_local(xe, local);
3965 return;
3966 } else {
3967 /*
3968 * Remove the old entry if there is more than one.
3969 * We don't remove the last entry so that we can
3970 * use it to indicate the hash value of the empty
3971 * bucket.
3972 */
3973 last -= 1;
3974 le16_add_cpu(&xh->xh_count, -1);
3975 if (xh->xh_count) {
3976 memmove(xe, xe + 1,
3977 (void *)last - (void *)xe);
3978 memset(last, 0,
3979 sizeof(struct ocfs2_xattr_entry));
3980 } else
3981 xh->xh_free_start =
3982 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3983
3984 return;
3985 }
3986 } else {
3987 /* find a new entry for insert. */
3988 int low = 0, high = count - 1, tmp;
3989 struct ocfs2_xattr_entry *tmp_xe;
3990
3991 while (low <= high && count) {
3992 tmp = (low + high) / 2;
3993 tmp_xe = &xh->xh_entries[tmp];
3994
3995 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
3996 low = tmp + 1;
3997 else if (name_hash <
3998 le32_to_cpu(tmp_xe->xe_name_hash))
3999 high = tmp - 1;
4000 else {
4001 low = tmp;
4002 break;
4003 }
4004 }
4005
4006 xe = &xh->xh_entries[low];
4007 if (low != count)
4008 memmove(xe + 1, xe, (void *)last - (void *)xe);
4009
4010 le16_add_cpu(&xh->xh_count, 1);
4011 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4012 xe->xe_name_hash = cpu_to_le32(name_hash);
4013 xe->xe_name_len = name_len;
4014 ocfs2_xattr_set_type(xe, xi->name_index);
4015 }
4016
4017set_new_name_value:
4018 /* Insert the new name+value. */
4019 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4020
4021 /*
4022 * We must make sure that the name/value pair
4023 * exists in the same block.
4024 */
4025 offs = le16_to_cpu(xh->xh_free_start);
4026 start = offs - size;
4027
4028 if (start >> inode->i_sb->s_blocksize_bits !=
4029 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4030 offs = offs - offs % blocksize;
4031 xh->xh_free_start = cpu_to_le16(offs);
4032 }
4033
4034 val = ocfs2_xattr_bucket_get_val(inode,
4035 &xs->bucket, offs - size);
4036 xe->xe_name_offset = cpu_to_le16(offs - size);
4037
4038 memset(val, 0, size);
4039 memcpy(val, xi->name, name_len);
4040 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4041
4042 xe->xe_value_size = cpu_to_le64(xi->value_len);
4043 ocfs2_xattr_set_local(xe, local);
4044 xs->here = xe;
4045 le16_add_cpu(&xh->xh_free_start, -size);
4046 le16_add_cpu(&xh->xh_name_value_len, size);
4047
4048 return;
4049}
4050
4051static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4052 handle_t *handle,
4053 struct ocfs2_xattr_search *xs,
4054 struct buffer_head **bhs,
4055 u16 bh_num)
4056{
4057 int ret = 0, off, block_off;
4058 struct ocfs2_xattr_entry *xe = xs->here;
4059
4060 /*
4061 * First calculate all the blocks we should journal_access
4062 * and journal_dirty. The first block should always be touched.
4063 */
4064 ret = ocfs2_journal_dirty(handle, bhs[0]);
4065 if (ret)
4066 mlog_errno(ret);
4067
4068 /* calc the data. */
4069 off = le16_to_cpu(xe->xe_name_offset);
4070 block_off = off >> inode->i_sb->s_blocksize_bits;
4071 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4072 if (ret)
4073 mlog_errno(ret);
4074
4075 return ret;
4076}
4077
4078/*
4079 * Set the xattr entry in the specified bucket.
4080 * The bucket is indicated by xs->bucket and it should have the enough
4081 * space for the xattr insertion.
4082 */
4083static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4084 struct ocfs2_xattr_info *xi,
4085 struct ocfs2_xattr_search *xs,
4086 u32 name_hash,
4087 int local)
4088{
4089 int i, ret;
4090 handle_t *handle = NULL;
4091 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4093
4094 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4095 (unsigned long)xi->value_len, xi->name_index,
4096 (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
4097
4098 if (!xs->bucket.bhs[1]) {
4099 ret = ocfs2_read_blocks(inode,
4100 xs->bucket.bhs[0]->b_blocknr + 1,
4101 blk_per_bucket - 1, &xs->bucket.bhs[1],
4102 0);
4103 if (ret) {
4104 mlog_errno(ret);
4105 goto out;
4106 }
4107 }
4108
4109 handle = ocfs2_start_trans(osb, blk_per_bucket);
4110 if (IS_ERR(handle)) {
4111 ret = PTR_ERR(handle);
4112 handle = NULL;
4113 mlog_errno(ret);
4114 goto out;
4115 }
4116
4117 for (i = 0; i < blk_per_bucket; i++) {
4118 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4119 OCFS2_JOURNAL_ACCESS_WRITE);
4120 if (ret < 0) {
4121 mlog_errno(ret);
4122 goto out;
4123 }
4124 }
4125
4126 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4127
4128 /*Only dirty the blocks we have touched in set xattr. */
4129 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4130 xs->bucket.bhs, blk_per_bucket);
4131 if (ret)
4132 mlog_errno(ret);
4133out:
4134 ocfs2_commit_trans(osb, handle);
4135
4136 return ret;
4137}
4138
4139static int ocfs2_xattr_value_update_size(struct inode *inode,
4140 struct buffer_head *xe_bh,
4141 struct ocfs2_xattr_entry *xe,
4142 u64 new_size)
4143{
4144 int ret;
4145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4146 handle_t *handle = NULL;
4147
4148 handle = ocfs2_start_trans(osb, 1);
4149 if (handle == NULL) {
4150 ret = -ENOMEM;
4151 mlog_errno(ret);
4152 goto out;
4153 }
4154
4155 ret = ocfs2_journal_access(handle, inode, xe_bh,
4156 OCFS2_JOURNAL_ACCESS_WRITE);
4157 if (ret < 0) {
4158 mlog_errno(ret);
4159 goto out_commit;
4160 }
4161
4162 xe->xe_value_size = cpu_to_le64(new_size);
4163
4164 ret = ocfs2_journal_dirty(handle, xe_bh);
4165 if (ret < 0)
4166 mlog_errno(ret);
4167
4168out_commit:
4169 ocfs2_commit_trans(osb, handle);
4170out:
4171 return ret;
4172}
4173
4174/*
4175 * Truncate the specified xe_off entry in xattr bucket.
4176 * bucket is indicated by header_bh and len is the new length.
4177 * Both the ocfs2_xattr_value_root and the entry will be updated here.
4178 *
4179 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
4180 */
4181static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4182 struct buffer_head *header_bh,
4183 int xe_off,
4184 int len)
4185{
4186 int ret, offset;
4187 u64 value_blk;
4188 struct buffer_head *value_bh = NULL;
4189 struct ocfs2_xattr_value_root *xv;
4190 struct ocfs2_xattr_entry *xe;
4191 struct ocfs2_xattr_header *xh =
4192 (struct ocfs2_xattr_header *)header_bh->b_data;
4193 size_t blocksize = inode->i_sb->s_blocksize;
4194
4195 xe = &xh->xh_entries[xe_off];
4196
4197 BUG_ON(!xe || ocfs2_xattr_is_local(xe));
4198
4199 offset = le16_to_cpu(xe->xe_name_offset) +
4200 OCFS2_XATTR_SIZE(xe->xe_name_len);
4201
4202 value_blk = offset / blocksize;
4203
4204 /* We don't allow ocfs2_xattr_value to be stored in different block. */
4205 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4206 value_blk += header_bh->b_blocknr;
4207
4208 ret = ocfs2_read_block(inode, value_blk, &value_bh);
4209 if (ret) {
4210 mlog_errno(ret);
4211 goto out;
4212 }
4213
4214 xv = (struct ocfs2_xattr_value_root *)
4215 (value_bh->b_data + offset % blocksize);
4216
4217 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4218 xe_off, (unsigned long long)header_bh->b_blocknr, len);
4219 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4220 if (ret) {
4221 mlog_errno(ret);
4222 goto out;
4223 }
4224
4225 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
4226 if (ret) {
4227 mlog_errno(ret);
4228 goto out;
4229 }
4230
4231out:
4232 brelse(value_bh);
4233 return ret;
4234}
4235
4236static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4237 struct ocfs2_xattr_search *xs,
4238 int len)
4239{
4240 int ret, offset;
4241 struct ocfs2_xattr_entry *xe = xs->here;
4242 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4243
4244 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4245
4246 offset = xe - xh->xh_entries;
4247 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
4248 offset, len);
4249 if (ret)
4250 mlog_errno(ret);
4251
4252 return ret;
4253}
4254
4255static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4256 struct ocfs2_xattr_search *xs,
4257 char *val,
4258 int value_len)
4259{
4260 int offset;
4261 struct ocfs2_xattr_value_root *xv;
4262 struct ocfs2_xattr_entry *xe = xs->here;
4263
4264 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4265
4266 offset = le16_to_cpu(xe->xe_name_offset) +
4267 OCFS2_XATTR_SIZE(xe->xe_name_len);
4268
4269 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4270
4271 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
4272}
4273
4274static int ocfs2_rm_xattr_cluster(struct inode *inode,
4275 struct buffer_head *root_bh,
4276 u64 blkno,
4277 u32 cpos,
4278 u32 len)
4279{
4280 int ret;
4281 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4282 struct inode *tl_inode = osb->osb_tl_inode;
4283 handle_t *handle;
4284 struct ocfs2_xattr_block *xb =
4285 (struct ocfs2_xattr_block *)root_bh->b_data;
4286 struct ocfs2_alloc_context *meta_ac = NULL;
4287 struct ocfs2_cached_dealloc_ctxt dealloc;
4288 struct ocfs2_extent_tree et;
4289
4290 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
4291
4292 ocfs2_init_dealloc_ctxt(&dealloc);
4293
4294 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4295 cpos, len, (unsigned long long)blkno);
4296
4297 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
4298
4299 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4300 if (ret) {
4301 mlog_errno(ret);
4302 return ret;
4303 }
4304
4305 mutex_lock(&tl_inode->i_mutex);
4306
4307 if (ocfs2_truncate_log_needs_flush(osb)) {
4308 ret = __ocfs2_flush_truncate_log(osb);
4309 if (ret < 0) {
4310 mlog_errno(ret);
4311 goto out;
4312 }
4313 }
4314
4315 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
4316 if (handle == NULL) {
4317 ret = -ENOMEM;
4318 mlog_errno(ret);
4319 goto out;
4320 }
4321
4322 ret = ocfs2_journal_access(handle, inode, root_bh,
4323 OCFS2_JOURNAL_ACCESS_WRITE);
4324 if (ret) {
4325 mlog_errno(ret);
4326 goto out_commit;
4327 }
4328
4329 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
4330 &dealloc);
4331 if (ret) {
4332 mlog_errno(ret);
4333 goto out_commit;
4334 }
4335
4336 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
4337
4338 ret = ocfs2_journal_dirty(handle, root_bh);
4339 if (ret) {
4340 mlog_errno(ret);
4341 goto out_commit;
4342 }
4343
4344 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
4345 if (ret)
4346 mlog_errno(ret);
4347
4348out_commit:
4349 ocfs2_commit_trans(osb, handle);
4350out:
4351 ocfs2_schedule_truncate_log_flush(osb, 1);
4352
4353 mutex_unlock(&tl_inode->i_mutex);
4354
4355 if (meta_ac)
4356 ocfs2_free_alloc_context(meta_ac);
4357
4358 ocfs2_run_deallocs(osb, &dealloc);
4359
4360 return ret;
4361}
4362
4363static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4364 struct ocfs2_xattr_search *xs)
4365{
4366 handle_t *handle = NULL;
4367 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4368 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4369 le16_to_cpu(xh->xh_count) - 1];
4370 int ret = 0;
4371
4372 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
4373 if (IS_ERR(handle)) {
4374 ret = PTR_ERR(handle);
4375 mlog_errno(ret);
4376 return;
4377 }
4378
4379 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4380 OCFS2_JOURNAL_ACCESS_WRITE);
4381 if (ret) {
4382 mlog_errno(ret);
4383 goto out_commit;
4384 }
4385
4386 /* Remove the old entry. */
4387 memmove(xs->here, xs->here + 1,
4388 (void *)last - (void *)xs->here);
4389 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4390 le16_add_cpu(&xh->xh_count, -1);
4391
4392 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
4393 if (ret < 0)
4394 mlog_errno(ret);
4395out_commit:
4396 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4397}
4398
4399/*
4400 * Set the xattr name/value in the bucket specified in xs.
4401 *
4402 * As the new value in xi may be stored in the bucket or in an outside cluster,
4403 * we divide the whole process into 3 steps:
4404 * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
4405 * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
4406 * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
4407 * 4. If the clusters for the new outside value can't be allocated, we need
4408 * to free the xattr we allocated in set.
4409 */
4410static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4411 struct ocfs2_xattr_info *xi,
4412 struct ocfs2_xattr_search *xs)
4413{
4414 int ret, local = 1;
4415 size_t value_len;
4416 char *val = (char *)xi->value;
4417 struct ocfs2_xattr_entry *xe = xs->here;
4418 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
4419 strlen(xi->name));
4420
4421 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
4422 /*
4423 * We need to truncate the xattr storage first.
4424 *
4425 * If both the old and new value are stored to
4426 * outside block, we only need to truncate
4427 * the storage and then set the value outside.
4428 *
4429 * If the new value should be stored within block,
4430 * we should free all the outside block first and
4431 * the modification to the xattr block will be done
4432 * by following steps.
4433 */
4434 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4435 value_len = xi->value_len;
4436 else
4437 value_len = 0;
4438
4439 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4440 value_len);
4441 if (ret)
4442 goto out;
4443
4444 if (value_len)
4445 goto set_value_outside;
4446 }
4447
4448 value_len = xi->value_len;
4449 /* So we have to handle the inside block change now. */
4450 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
4451 /*
4452 * If the new value will be stored outside of block,
4453 * initalize a new empty value root and insert it first.
4454 */
4455 local = 0;
4456 xi->value = &def_xv;
4457 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4458 }
4459
4460 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
4461 if (ret) {
4462 mlog_errno(ret);
4463 goto out;
4464 }
4465
4466 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
4467 goto out;
4468
4469 /* allocate the space now for the outside block storage. */
4470 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4471 value_len);
4472 if (ret) {
4473 mlog_errno(ret);
4474
4475 if (xs->not_found) {
4476 /*
4477 * We can't allocate enough clusters for outside
4478 * storage and we have allocated xattr already,
4479 * so need to remove it.
4480 */
4481 ocfs2_xattr_bucket_remove_xs(inode, xs);
4482 }
4483 goto out;
4484 }
4485
4486set_value_outside:
4487 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
4488out:
4489 return ret;
4490}
4491
4492/* check whether the xattr bucket is filled up with the same hash value. */
4493static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4494 struct ocfs2_xattr_bucket *bucket)
4495{
4496 struct ocfs2_xattr_header *xh = bucket->xh;
4497
4498 if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
4499 xh->xh_entries[0].xe_name_hash) {
4500 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
4501 "hash = %u\n",
4502 (unsigned long long)bucket->bhs[0]->b_blocknr,
4503 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4504 return -ENOSPC;
4505 }
4506
4507 return 0;
4508}
4509
4510static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4511 struct ocfs2_xattr_info *xi,
4512 struct ocfs2_xattr_search *xs)
4513{
4514 struct ocfs2_xattr_header *xh;
4515 struct ocfs2_xattr_entry *xe;
4516 u16 count, header_size, xh_free_start;
4517 int i, free, max_free, need, old;
4518 size_t value_size = 0, name_len = strlen(xi->name);
4519 size_t blocksize = inode->i_sb->s_blocksize;
4520 int ret, allocation = 0;
4521 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4522
4523 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4524
4525try_again:
4526 xh = xs->header;
4527 count = le16_to_cpu(xh->xh_count);
4528 xh_free_start = le16_to_cpu(xh->xh_free_start);
4529 header_size = sizeof(struct ocfs2_xattr_header) +
4530 count * sizeof(struct ocfs2_xattr_entry);
4531 max_free = OCFS2_XATTR_BUCKET_SIZE -
4532 le16_to_cpu(xh->xh_name_value_len) - header_size;
4533
4534 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4535 "of %u which exceed block size\n",
4536 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4537 header_size);
4538
4539 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4540 value_size = OCFS2_XATTR_ROOT_SIZE;
4541 else if (xi->value)
4542 value_size = OCFS2_XATTR_SIZE(xi->value_len);
4543
4544 if (xs->not_found)
4545 need = sizeof(struct ocfs2_xattr_entry) +
4546 OCFS2_XATTR_SIZE(name_len) + value_size;
4547 else {
4548 need = value_size + OCFS2_XATTR_SIZE(name_len);
4549
4550 /*
4551 * We only replace the old value if the new length is smaller
4552 * than the old one. Otherwise we will allocate new space in the
4553 * bucket to store it.
4554 */
4555 xe = xs->here;
4556 if (ocfs2_xattr_is_local(xe))
4557 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4558 else
4559 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4560
4561 if (old >= value_size)
4562 need = 0;
4563 }
4564
4565 free = xh_free_start - header_size;
4566 /*
4567 * We need to make sure the new name/value pair
4568 * can exist in the same block.
4569 */
4570 if (xh_free_start % blocksize < need)
4571 free -= xh_free_start % blocksize;
4572
4573 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4574 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4575 " %u\n", xs->not_found,
4576 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4577 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4578 le16_to_cpu(xh->xh_name_value_len));
4579
4580 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4581 if (need <= max_free &&
4582 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4583 /*
4584 * We can create the space by defragment. Since only the
4585 * name/value will be moved, the xe shouldn't be changed
4586 * in xs.
4587 */
4588 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
4589 if (ret) {
4590 mlog_errno(ret);
4591 goto out;
4592 }
4593
4594 xh_free_start = le16_to_cpu(xh->xh_free_start);
4595 free = xh_free_start - header_size;
4596 if (xh_free_start % blocksize < need)
4597 free -= xh_free_start % blocksize;
4598
4599 if (free >= need)
4600 goto xattr_set;
4601
4602 mlog(0, "Can't get enough space for xattr insert by "
4603 "defragment. Need %u bytes, but we have %d, so "
4604 "allocate new bucket for it.\n", need, free);
4605 }
4606
4607 /*
4608 * We have to add new buckets or clusters and one
4609 * allocation should leave us enough space for insert.
4610 */
4611 BUG_ON(allocation);
4612
4613 /*
4614 * We do not allow for overlapping ranges between buckets. And
4615 * the maximum number of collisions we will allow for then is
4616 * one bucket's worth, so check it here whether we need to
4617 * add a new bucket for the insert.
4618 */
4619 ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
4620 if (ret) {
4621 mlog_errno(ret);
4622 goto out;
4623 }
4624
4625 ret = ocfs2_add_new_xattr_bucket(inode,
4626 xs->xattr_bh,
4627 xs->bucket.bhs[0]);
4628 if (ret) {
4629 mlog_errno(ret);
4630 goto out;
4631 }
4632
4633 for (i = 0; i < blk_per_bucket; i++)
4634 brelse(xs->bucket.bhs[i]);
4635
4636 memset(&xs->bucket, 0, sizeof(xs->bucket));
4637
4638 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4639 xi->name_index,
4640 xi->name, xs);
4641 if (ret && ret != -ENODATA)
4642 goto out;
4643 xs->not_found = ret;
4644 allocation = 1;
4645 goto try_again;
4646 }
4647
4648xattr_set:
4649 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
4650out:
4651 mlog_exit(ret);
4652 return ret;
4653}
4654
4655static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4656 struct ocfs2_xattr_bucket *bucket,
4657 void *para)
4658{
4659 int ret = 0;
4660 struct ocfs2_xattr_header *xh = bucket->xh;
4661 u16 i;
4662 struct ocfs2_xattr_entry *xe;
4663
4664 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4665 xe = &xh->xh_entries[i];
4666 if (ocfs2_xattr_is_local(xe))
4667 continue;
4668
4669 ret = ocfs2_xattr_bucket_value_truncate(inode,
4670 bucket->bhs[0],
4671 i, 0);
4672 if (ret) {
4673 mlog_errno(ret);
4674 break;
4675 }
4676 }
4677
4678 return ret;
4679}
4680
4681static int ocfs2_delete_xattr_index_block(struct inode *inode,
4682 struct buffer_head *xb_bh)
4683{
4684 struct ocfs2_xattr_block *xb =
4685 (struct ocfs2_xattr_block *)xb_bh->b_data;
4686 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
4687 int ret = 0;
4688 u32 name_hash = UINT_MAX, e_cpos, num_clusters;
4689 u64 p_blkno;
4690
4691 if (le16_to_cpu(el->l_next_free_rec) == 0)
4692 return 0;
4693
4694 while (name_hash > 0) {
4695 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
4696 &e_cpos, &num_clusters, el);
4697 if (ret) {
4698 mlog_errno(ret);
4699 goto out;
4700 }
4701
4702 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
4703 ocfs2_delete_xattr_in_bucket,
4704 NULL);
4705 if (ret) {
4706 mlog_errno(ret);
4707 goto out;
4708 }
4709
4710 ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
4711 p_blkno, e_cpos, num_clusters);
4712 if (ret) {
4713 mlog_errno(ret);
4714 break;
4715 }
4716
4717 if (e_cpos == 0)
4718 break;
4719
4720 name_hash = e_cpos - 1;
4721 }
4722
4723out:
4724 return ret;
4725}
4726
4727/*
4728 * 'trusted' attributes support
4729 */
4730
#define XATTR_TRUSTED_PREFIX "trusted."

/*
 * Emit "trusted.<name>" (NUL-terminated) into @list.
 *
 * @inode:     unused.
 * @list:      output buffer, or NULL to only query the required size.
 * @list_size: capacity of @list in bytes.
 * @name:      attribute name without prefix; need not be NUL-terminated.
 * @name_len:  number of bytes of @name to use.
 *
 * Returns the number of bytes the full name needs including the trailing
 * NUL.  Nothing is written when @list is NULL or too small.
 */
static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
				       size_t list_size, const char *name,
				       size_t name_len)
{
	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
	const size_t total_len = prefix_len + name_len + 1;
	char *out = list;

	if (!out || total_len > list_size)
		return total_len;

	memcpy(out, XATTR_TRUSTED_PREFIX, prefix_len);
	out += prefix_len;
	memcpy(out, name, name_len);
	out[name_len] = '\0';

	return total_len;
}
4747
4748static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
4749 void *buffer, size_t size)
4750{
4751 if (strcmp(name, "") == 0)
4752 return -EINVAL;
4753 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
4754 buffer, size);
4755}
4756
4757static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
4758 const void *value, size_t size, int flags)
4759{
4760 if (strcmp(name, "") == 0)
4761 return -EINVAL;
4762
4763 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
4764 size, flags);
4765}
4766
4767struct xattr_handler ocfs2_xattr_trusted_handler = {
4768 .prefix = XATTR_TRUSTED_PREFIX,
4769 .list = ocfs2_xattr_trusted_list,
4770 .get = ocfs2_xattr_trusted_get,
4771 .set = ocfs2_xattr_trusted_set,
4772};
4773
4774
4775/*
4776 * 'user' attributes support
4777 */
4778
4779#define XATTR_USER_PREFIX "user."
4780
4781static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
4782 size_t list_size, const char *name,
4783 size_t name_len)
4784{
4785 const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
4786 const size_t total_len = prefix_len + name_len + 1;
4787 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4788
4789 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4790 return 0;
4791
4792 if (list && total_len <= list_size) {
4793 memcpy(list, XATTR_USER_PREFIX, prefix_len);
4794 memcpy(list + prefix_len, name, name_len);
4795 list[prefix_len + name_len] = '\0';
4796 }
4797 return total_len;
4798}
4799
4800static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
4801 void *buffer, size_t size)
4802{
4803 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4804
4805 if (strcmp(name, "") == 0)
4806 return -EINVAL;
4807 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4808 return -EOPNOTSUPP;
4809 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
4810 buffer, size);
4811}
4812
4813static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
4814 const void *value, size_t size, int flags)
4815{
4816 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4817
4818 if (strcmp(name, "") == 0)
4819 return -EINVAL;
4820 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4821 return -EOPNOTSUPP;
4822
4823 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
4824 size, flags);
4825}
4826
4827struct xattr_handler ocfs2_xattr_user_handler = {
4828 .prefix = XATTR_USER_PREFIX,
4829 .list = ocfs2_xattr_user_list,
4830 .get = ocfs2_xattr_user_get,
4831 .set = ocfs2_xattr_user_set,
4832};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 000000000000..c25c7c62a059
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_XATTR_H
27#define OCFS2_XATTR_H
28
29#include <linux/init.h>
30#include <linux/xattr.h>
31
/*
 * Name-index values that select the namespace of an extended attribute.
 * OCFS2_XATTR_MAX is one past the last defined index.
 */
enum ocfs2_xattr_type {
	OCFS2_XATTR_INDEX_USER			= 1,
	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS	= 2,
	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT	= 3,
	OCFS2_XATTR_INDEX_TRUSTED		= 4,
	OCFS2_XATTR_INDEX_SECURITY		= 5,
	OCFS2_XATTR_MAX				= 6
};
40
41extern struct xattr_handler ocfs2_xattr_user_handler;
42extern struct xattr_handler ocfs2_xattr_trusted_handler;
43
44extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
45extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
46extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
47 size_t, int);
48extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
49extern struct xattr_handler *ocfs2_xattr_handlers[];
50
51static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
52{
53 return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
54}
55
56static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
57{
58 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
59}
60
61static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
62{
63 u16 len = sb->s_blocksize -
64 offsetof(struct ocfs2_xattr_header, xh_entries);
65
66 return len / sizeof(struct ocfs2_xattr_entry);
67}
68#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d29047b1b9b0..cbf047a847c5 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -346,7 +346,7 @@ enum {
346 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask 346 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
347}; 347};
348 348
349static match_table_t tokens = { 349static const match_table_t tokens = {
350 {Opt_uid, "uid=%u"}, 350 {Opt_uid, "uid=%u"},
351 {Opt_gid, "gid=%u"}, 351 {Opt_gid, "gid=%u"},
352 {Opt_umask, "umask=%o"}, 352 {Opt_umask, "umask=%o"},
diff --git a/fs/open.c b/fs/open.c
index 07da9359481c..5596049863bf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1141,8 +1141,7 @@ EXPORT_SYMBOL(sys_close);
1141asmlinkage long sys_vhangup(void) 1141asmlinkage long sys_vhangup(void)
1142{ 1142{
1143 if (capable(CAP_SYS_TTY_CONFIG)) { 1143 if (capable(CAP_SYS_TTY_CONFIG)) {
1144 /* XXX: this needs locking */ 1144 tty_vhangup_self();
1145 tty_vhangup(current->signal->tty);
1146 return 0; 1145 return 0;
1147 } 1146 }
1148 return -EPERM; 1147 return -EPERM;
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 3d3e16631472..a97b477ac0fc 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
275 id = data[0x1fc] & 15; 275 id = data[0x1fc] & 15;
276 put_dev_sector(sect); 276 put_dev_sector(sect);
277 277
278#ifdef CONFIG_BLK_DEV_MFM
279 if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) {
280 extern void xd_set_geometry(struct block_device *,
281 unsigned char, unsigned char, unsigned int);
282 xd_set_geometry(bdev, dr->secspertrack, heads, 1);
283 invalidate_bh_lrus();
284 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
285 }
286#endif
287
288 /* 278 /*
289 * Work out start of non-adfs partition. 279 * Work out start of non-adfs partition.
290 */ 280 */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201db..cfb0c80690aa 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
120 * a pointer to that same buffer (for convenience). 120 * a pointer to that same buffer (for convenience).
121 */ 121 */
122 122
123char *disk_name(struct gendisk *hd, int part, char *buf) 123char *disk_name(struct gendisk *hd, int partno, char *buf)
124{ 124{
125 if (!part) 125 if (!partno)
126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); 126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) 127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); 128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
129 else 129 else
130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); 130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
131 131
132 return buf; 132 return buf;
133} 133}
134 134
135const char *bdevname(struct block_device *bdev, char *buf) 135const char *bdevname(struct block_device *bdev, char *buf)
136{ 136{
137 int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; 137 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
138 return disk_name(bdev->bd_disk, part, buf);
139} 138}
140 139
141EXPORT_SYMBOL(bdevname); 140EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
169 if (isdigit(state->name[strlen(state->name)-1])) 168 if (isdigit(state->name[strlen(state->name)-1]))
170 sprintf(state->name, "p"); 169 sprintf(state->name, "p");
171 170
172 state->limit = hd->minors; 171 state->limit = disk_max_parts(hd);
173 i = res = err = 0; 172 i = res = err = 0;
174 while (!res && check_part[i]) { 173 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 174 memset(&state->parts, 0, sizeof(state->parts));
@@ -196,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
196 return ERR_PTR(res); 195 return ERR_PTR(res);
197} 196}
198 197
198static ssize_t part_partition_show(struct device *dev,
199 struct device_attribute *attr, char *buf)
200{
201 struct hd_struct *p = dev_to_part(dev);
202
203 return sprintf(buf, "%d\n", p->partno);
204}
205
199static ssize_t part_start_show(struct device *dev, 206static ssize_t part_start_show(struct device *dev,
200 struct device_attribute *attr, char *buf) 207 struct device_attribute *attr, char *buf)
201{ 208{
@@ -204,21 +211,22 @@ static ssize_t part_start_show(struct device *dev,
204 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); 211 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
205} 212}
206 213
207static ssize_t part_size_show(struct device *dev, 214ssize_t part_size_show(struct device *dev,
208 struct device_attribute *attr, char *buf) 215 struct device_attribute *attr, char *buf)
209{ 216{
210 struct hd_struct *p = dev_to_part(dev); 217 struct hd_struct *p = dev_to_part(dev);
211 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 218 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
212} 219}
213 220
214static ssize_t part_stat_show(struct device *dev, 221ssize_t part_stat_show(struct device *dev,
215 struct device_attribute *attr, char *buf) 222 struct device_attribute *attr, char *buf)
216{ 223{
217 struct hd_struct *p = dev_to_part(dev); 224 struct hd_struct *p = dev_to_part(dev);
225 int cpu;
218 226
219 preempt_disable(); 227 cpu = part_stat_lock();
220 part_round_stats(p); 228 part_round_stats(cpu, p);
221 preempt_enable(); 229 part_stat_unlock();
222 return sprintf(buf, 230 return sprintf(buf,
223 "%8lu %8lu %8llu %8u " 231 "%8lu %8lu %8llu %8u "
224 "%8lu %8lu %8llu %8u " 232 "%8lu %8lu %8llu %8u "
@@ -238,17 +246,17 @@ static ssize_t part_stat_show(struct device *dev,
238} 246}
239 247
240#ifdef CONFIG_FAIL_MAKE_REQUEST 248#ifdef CONFIG_FAIL_MAKE_REQUEST
241static ssize_t part_fail_show(struct device *dev, 249ssize_t part_fail_show(struct device *dev,
242 struct device_attribute *attr, char *buf) 250 struct device_attribute *attr, char *buf)
243{ 251{
244 struct hd_struct *p = dev_to_part(dev); 252 struct hd_struct *p = dev_to_part(dev);
245 253
246 return sprintf(buf, "%d\n", p->make_it_fail); 254 return sprintf(buf, "%d\n", p->make_it_fail);
247} 255}
248 256
249static ssize_t part_fail_store(struct device *dev, 257ssize_t part_fail_store(struct device *dev,
250 struct device_attribute *attr, 258 struct device_attribute *attr,
251 const char *buf, size_t count) 259 const char *buf, size_t count)
252{ 260{
253 struct hd_struct *p = dev_to_part(dev); 261 struct hd_struct *p = dev_to_part(dev);
254 int i; 262 int i;
@@ -260,6 +268,7 @@ static ssize_t part_fail_store(struct device *dev,
260} 268}
261#endif 269#endif
262 270
271static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
263static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); 272static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
264static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 273static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
265static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 274static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
@@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail =
269#endif 278#endif
270 279
271static struct attribute *part_attrs[] = { 280static struct attribute *part_attrs[] = {
281 &dev_attr_partition.attr,
272 &dev_attr_start.attr, 282 &dev_attr_start.attr,
273 &dev_attr_size.attr, 283 &dev_attr_size.attr,
274 &dev_attr_stat.attr, 284 &dev_attr_stat.attr,
@@ -300,40 +310,34 @@ struct device_type part_type = {
300 .release = part_release, 310 .release = part_release,
301}; 311};
302 312
303static inline void partition_sysfs_add_subdir(struct hd_struct *p) 313static void delete_partition_rcu_cb(struct rcu_head *head)
304{ 314{
305 struct kobject *k; 315 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
306 316
307 k = kobject_get(&p->dev.kobj); 317 part->start_sect = 0;
308 p->holder_dir = kobject_create_and_add("holders", k); 318 part->nr_sects = 0;
309 kobject_put(k); 319 part_stat_set_all(part, 0);
320 put_device(part_to_dev(part));
310} 321}
311 322
312static inline void disk_sysfs_add_subdirs(struct gendisk *disk) 323void delete_partition(struct gendisk *disk, int partno)
313{ 324{
314 struct kobject *k; 325 struct disk_part_tbl *ptbl = disk->part_tbl;
326 struct hd_struct *part;
315 327
316 k = kobject_get(&disk->dev.kobj); 328 if (partno >= ptbl->len)
317 disk->holder_dir = kobject_create_and_add("holders", k);
318 disk->slave_dir = kobject_create_and_add("slaves", k);
319 kobject_put(k);
320}
321
322void delete_partition(struct gendisk *disk, int part)
323{
324 struct hd_struct *p = disk->part[part-1];
325
326 if (!p)
327 return; 329 return;
328 if (!p->nr_sects) 330
331 part = ptbl->part[partno];
332 if (!part)
329 return; 333 return;
330 disk->part[part-1] = NULL; 334
331 p->start_sect = 0; 335 blk_free_devt(part_devt(part));
332 p->nr_sects = 0; 336 rcu_assign_pointer(ptbl->part[partno], NULL);
333 part_stat_set_all(p, 0); 337 kobject_put(part->holder_dir);
334 kobject_put(p->holder_dir); 338 device_del(part_to_dev(part));
335 device_del(&p->dev); 339
336 put_device(&p->dev); 340 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
337} 341}
338 342
339static ssize_t whole_disk_show(struct device *dev, 343static ssize_t whole_disk_show(struct device *dev,
@@ -344,102 +348,132 @@ static ssize_t whole_disk_show(struct device *dev,
344static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, 348static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
345 whole_disk_show, NULL); 349 whole_disk_show, NULL);
346 350
347int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) 351int add_partition(struct gendisk *disk, int partno,
352 sector_t start, sector_t len, int flags)
348{ 353{
349 struct hd_struct *p; 354 struct hd_struct *p;
355 dev_t devt = MKDEV(0, 0);
356 struct device *ddev = disk_to_dev(disk);
357 struct device *pdev;
358 struct disk_part_tbl *ptbl;
359 const char *dname;
350 int err; 360 int err;
351 361
362 err = disk_expand_part_tbl(disk, partno);
363 if (err)
364 return err;
365 ptbl = disk->part_tbl;
366
367 if (ptbl->part[partno])
368 return -EBUSY;
369
352 p = kzalloc(sizeof(*p), GFP_KERNEL); 370 p = kzalloc(sizeof(*p), GFP_KERNEL);
353 if (!p) 371 if (!p)
354 return -ENOMEM; 372 return -ENOMEM;
355 373
356 if (!init_part_stats(p)) { 374 if (!init_part_stats(p)) {
357 err = -ENOMEM; 375 err = -ENOMEM;
358 goto out0; 376 goto out_free;
359 } 377 }
378 pdev = part_to_dev(p);
379
360 p->start_sect = start; 380 p->start_sect = start;
361 p->nr_sects = len; 381 p->nr_sects = len;
362 p->partno = part; 382 p->partno = partno;
363 p->policy = disk->policy; 383 p->policy = get_disk_ro(disk);
364 384
365 if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) 385 dname = dev_name(ddev);
366 snprintf(p->dev.bus_id, BUS_ID_SIZE, 386 if (isdigit(dname[strlen(dname) - 1]))
367 "%sp%d", disk->dev.bus_id, part); 387 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
368 else 388 else
369 snprintf(p->dev.bus_id, BUS_ID_SIZE, 389 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
370 "%s%d", disk->dev.bus_id, part); 390
391 device_initialize(pdev);
392 pdev->class = &block_class;
393 pdev->type = &part_type;
394 pdev->parent = ddev;
371 395
372 device_initialize(&p->dev); 396 err = blk_alloc_devt(p, &devt);
373 p->dev.devt = MKDEV(disk->major, disk->first_minor + part); 397 if (err)
374 p->dev.class = &block_class; 398 goto out_free;
375 p->dev.type = &part_type; 399 pdev->devt = devt;
376 p->dev.parent = &disk->dev;
377 disk->part[part-1] = p;
378 400
379 /* delay uevent until 'holders' subdir is created */ 401 /* delay uevent until 'holders' subdir is created */
380 p->dev.uevent_suppress = 1; 402 pdev->uevent_suppress = 1;
381 err = device_add(&p->dev); 403 err = device_add(pdev);
382 if (err) 404 if (err)
383 goto out1; 405 goto out_put;
384 partition_sysfs_add_subdir(p); 406
385 p->dev.uevent_suppress = 0; 407 err = -ENOMEM;
408 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
409 if (!p->holder_dir)
410 goto out_del;
411
412 pdev->uevent_suppress = 0;
386 if (flags & ADDPART_FLAG_WHOLEDISK) { 413 if (flags & ADDPART_FLAG_WHOLEDISK) {
387 err = device_create_file(&p->dev, &dev_attr_whole_disk); 414 err = device_create_file(pdev, &dev_attr_whole_disk);
388 if (err) 415 if (err)
389 goto out2; 416 goto out_del;
390 } 417 }
391 418
419 /* everything is up and running, commence */
420 INIT_RCU_HEAD(&p->rcu_head);
421 rcu_assign_pointer(ptbl->part[partno], p);
422
392 /* suppress uevent if the disk supresses it */ 423 /* suppress uevent if the disk supresses it */
393 if (!disk->dev.uevent_suppress) 424 if (!ddev->uevent_suppress)
394 kobject_uevent(&p->dev.kobj, KOBJ_ADD); 425 kobject_uevent(&pdev->kobj, KOBJ_ADD);
395 426
396 return 0; 427 return 0;
397 428
398out2: 429out_free:
399 device_del(&p->dev);
400out1:
401 put_device(&p->dev);
402 free_part_stats(p);
403out0:
404 kfree(p); 430 kfree(p);
405 return err; 431 return err;
432out_del:
433 kobject_put(p->holder_dir);
434 device_del(pdev);
435out_put:
436 put_device(pdev);
437 blk_free_devt(devt);
438 return err;
406} 439}
407 440
408/* Not exported, helper to add_disk(). */ 441/* Not exported, helper to add_disk(). */
409void register_disk(struct gendisk *disk) 442void register_disk(struct gendisk *disk)
410{ 443{
444 struct device *ddev = disk_to_dev(disk);
411 struct block_device *bdev; 445 struct block_device *bdev;
446 struct disk_part_iter piter;
447 struct hd_struct *part;
412 char *s; 448 char *s;
413 int i;
414 struct hd_struct *p;
415 int err; 449 int err;
416 450
417 disk->dev.parent = disk->driverfs_dev; 451 ddev->parent = disk->driverfs_dev;
418 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
419 452
420 strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); 453 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
421 /* ewww... some of these buggers have / in the name... */ 454 /* ewww... some of these buggers have / in the name... */
422 s = strchr(disk->dev.bus_id, '/'); 455 s = strchr(ddev->bus_id, '/');
423 if (s) 456 if (s)
424 *s = '!'; 457 *s = '!';
425 458
426 /* delay uevents, until we scanned partition table */ 459 /* delay uevents, until we scanned partition table */
427 disk->dev.uevent_suppress = 1; 460 ddev->uevent_suppress = 1;
428 461
429 if (device_add(&disk->dev)) 462 if (device_add(ddev))
430 return; 463 return;
431#ifndef CONFIG_SYSFS_DEPRECATED 464#ifndef CONFIG_SYSFS_DEPRECATED
432 err = sysfs_create_link(block_depr, &disk->dev.kobj, 465 err = sysfs_create_link(block_depr, &ddev->kobj,
433 kobject_name(&disk->dev.kobj)); 466 kobject_name(&ddev->kobj));
434 if (err) { 467 if (err) {
435 device_del(&disk->dev); 468 device_del(ddev);
436 return; 469 return;
437 } 470 }
438#endif 471#endif
439 disk_sysfs_add_subdirs(disk); 472 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
473 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
440 474
441 /* No minors to use for partitions */ 475 /* No minors to use for partitions */
442 if (disk->minors == 1) 476 if (!disk_partitionable(disk))
443 goto exit; 477 goto exit;
444 478
445 /* No such device (e.g., media were just removed) */ 479 /* No such device (e.g., media were just removed) */
@@ -458,51 +492,80 @@ void register_disk(struct gendisk *disk)
458 492
459exit: 493exit:
460 /* announce disk after possible partitions are created */ 494 /* announce disk after possible partitions are created */
461 disk->dev.uevent_suppress = 0; 495 ddev->uevent_suppress = 0;
462 kobject_uevent(&disk->dev.kobj, KOBJ_ADD); 496 kobject_uevent(&ddev->kobj, KOBJ_ADD);
463 497
464 /* announce possible partitions */ 498 /* announce possible partitions */
465 for (i = 1; i < disk->minors; i++) { 499 disk_part_iter_init(&piter, disk, 0);
466 p = disk->part[i-1]; 500 while ((part = disk_part_iter_next(&piter)))
467 if (!p || !p->nr_sects) 501 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
468 continue; 502 disk_part_iter_exit(&piter);
469 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
470 }
471} 503}
472 504
473int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 505int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
474{ 506{
507 struct disk_part_iter piter;
508 struct hd_struct *part;
475 struct parsed_partitions *state; 509 struct parsed_partitions *state;
476 int p, res; 510 int p, highest, res;
477 511
478 if (bdev->bd_part_count) 512 if (bdev->bd_part_count)
479 return -EBUSY; 513 return -EBUSY;
480 res = invalidate_partition(disk, 0); 514 res = invalidate_partition(disk, 0);
481 if (res) 515 if (res)
482 return res; 516 return res;
483 bdev->bd_invalidated = 0; 517
484 for (p = 1; p < disk->minors; p++) 518 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
485 delete_partition(disk, p); 519 while ((part = disk_part_iter_next(&piter)))
520 delete_partition(disk, part->partno);
521 disk_part_iter_exit(&piter);
522
486 if (disk->fops->revalidate_disk) 523 if (disk->fops->revalidate_disk)
487 disk->fops->revalidate_disk(disk); 524 disk->fops->revalidate_disk(disk);
525 check_disk_size_change(disk, bdev);
526 bdev->bd_invalidated = 0;
488 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 527 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
489 return 0; 528 return 0;
490 if (IS_ERR(state)) /* I/O error reading the partition table */ 529 if (IS_ERR(state)) /* I/O error reading the partition table */
491 return -EIO; 530 return -EIO;
492 531
493 /* tell userspace that the media / partition table may have changed */ 532 /* tell userspace that the media / partition table may have changed */
494 kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); 533 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
534
535 /* Detect the highest partition number and preallocate
536 * disk->part_tbl. This is an optimization and not strictly
537 * necessary.
538 */
539 for (p = 1, highest = 0; p < state->limit; p++)
540 if (state->parts[p].size)
541 highest = p;
495 542
543 disk_expand_part_tbl(disk, highest);
544
545 /* add partitions */
496 for (p = 1; p < state->limit; p++) { 546 for (p = 1; p < state->limit; p++) {
497 sector_t size = state->parts[p].size; 547 sector_t size = state->parts[p].size;
498 sector_t from = state->parts[p].from; 548 sector_t from = state->parts[p].from;
499 if (!size) 549 if (!size)
500 continue; 550 continue;
501 if (from + size > get_capacity(disk)) { 551 if (from >= get_capacity(disk)) {
502 printk(KERN_ERR " %s: p%d exceeds device capacity\n", 552 printk(KERN_WARNING
503 disk->disk_name, p); 553 "%s: p%d ignored, start %llu is behind the end of the disk\n",
554 disk->disk_name, p, (unsigned long long) from);
504 continue; 555 continue;
505 } 556 }
557 if (from + size > get_capacity(disk)) {
558 /*
559 * we can not ignore partitions of broken tables
560 * created by for example camera firmware, but we
561 * limit them to the end of the disk to avoid
562 * creating invalid block devices
563 */
564 printk(KERN_WARNING
565 "%s: p%d size %llu limited to end of disk\n",
566 disk->disk_name, p, (unsigned long long) size);
567 size = get_capacity(disk) - from;
568 }
506 res = add_partition(disk, p, from, size, state->parts[p].flags); 569 res = add_partition(disk, p, from, size, state->parts[p].flags);
507 if (res) { 570 if (res) {
508 printk(KERN_ERR " %s: p%d could not be added: %d\n", 571 printk(KERN_ERR " %s: p%d could not be added: %d\n",
@@ -541,25 +604,31 @@ EXPORT_SYMBOL(read_dev_sector);
541 604
542void del_gendisk(struct gendisk *disk) 605void del_gendisk(struct gendisk *disk)
543{ 606{
544 int p; 607 struct disk_part_iter piter;
608 struct hd_struct *part;
545 609
546 /* invalidate stuff */ 610 /* invalidate stuff */
547 for (p = disk->minors - 1; p > 0; p--) { 611 disk_part_iter_init(&piter, disk,
548 invalidate_partition(disk, p); 612 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
549 delete_partition(disk, p); 613 while ((part = disk_part_iter_next(&piter))) {
614 invalidate_partition(disk, part->partno);
615 delete_partition(disk, part->partno);
550 } 616 }
617 disk_part_iter_exit(&piter);
618
551 invalidate_partition(disk, 0); 619 invalidate_partition(disk, 0);
552 disk->capacity = 0; 620 blk_free_devt(disk_to_dev(disk)->devt);
621 set_capacity(disk, 0);
553 disk->flags &= ~GENHD_FL_UP; 622 disk->flags &= ~GENHD_FL_UP;
554 unlink_gendisk(disk); 623 unlink_gendisk(disk);
555 disk_stat_set_all(disk, 0); 624 part_stat_set_all(&disk->part0, 0);
556 disk->stamp = 0; 625 disk->part0.stamp = 0;
557 626
558 kobject_put(disk->holder_dir); 627 kobject_put(disk->part0.holder_dir);
559 kobject_put(disk->slave_dir); 628 kobject_put(disk->slave_dir);
560 disk->driverfs_dev = NULL; 629 disk->driverfs_dev = NULL;
561#ifndef CONFIG_SYSFS_DEPRECATED 630#ifndef CONFIG_SYSFS_DEPRECATED
562 sysfs_remove_link(block_depr, disk->dev.bus_id); 631 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
563#endif 632#endif
564 device_del(&disk->dev); 633 device_del(disk_to_dev(disk));
565} 634}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
5 * add_gd_partition adds a partitions details to the devices partition 5 * add_gd_partition adds a partitions details to the devices partition
6 * description. 6 * description.
7 */ 7 */
8enum { MAX_PART = 256 };
9
10struct parsed_partitions { 8struct parsed_partitions {
11 char name[BDEVNAME_SIZE]; 9 char name[BDEVNAME_SIZE];
12 struct { 10 struct {
13 sector_t from; 11 sector_t from;
14 sector_t size; 12 sector_t size;
15 int flags; 13 int flags;
16 } parts[MAX_PART]; 14 } parts[DISK_MAX_PARTS];
17 int next; 15 int next;
18 int limit; 16 int limit;
19}; 17};
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 73cd7a418f06..50f8f0600f06 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -57,3 +57,13 @@ config PROC_SYSCTL
57 As it is generally a good thing, you should say Y here unless 57 As it is generally a good thing, you should say Y here unless
58 building a kernel for install/rescue disks or your system is very 58 building a kernel for install/rescue disks or your system is very
59 limited in memory. 59 limited in memory.
60
61config PROC_PAGE_MONITOR
62 default y
63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED
65 help
66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
68 /proc/kpagecount, and /proc/kpageflags. Disabling these
69 interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..bb9f4b05703d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -86,11 +86,6 @@
86#include <asm/processor.h> 86#include <asm/processor.h>
87#include "internal.h" 87#include "internal.h"
88 88
89/* Gcc optimizes away "strlen(x)" for constant x */
90#define ADDBUF(buffer, string) \
91do { memcpy(buffer, string, strlen(string)); \
92 buffer += strlen(string); } while (0)
93
94static inline void task_name(struct seq_file *m, struct task_struct *p) 89static inline void task_name(struct seq_file *m, struct task_struct *p)
95{ 90{
96 int i; 91 int i;
@@ -261,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
261 sigemptyset(&ignored); 256 sigemptyset(&ignored);
262 sigemptyset(&caught); 257 sigemptyset(&caught);
263 258
264 rcu_read_lock();
265 if (lock_task_sighand(p, &flags)) { 259 if (lock_task_sighand(p, &flags)) {
266 pending = p->pending.signal; 260 pending = p->pending.signal;
267 shpending = p->signal->shared_pending.signal; 261 shpending = p->signal->shared_pending.signal;
@@ -272,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
272 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 266 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
273 unlock_task_sighand(p, &flags); 267 unlock_task_sighand(p, &flags);
274 } 268 }
275 rcu_read_unlock();
276 269
277 seq_printf(m, "Threads:\t%d\n", num_threads); 270 seq_printf(m, "Threads:\t%d\n", num_threads);
278 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); 271 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -337,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
337 return 0; 330 return 0;
338} 331}
339 332
340/*
341 * Use precise platform statistics if available:
342 */
343#ifdef CONFIG_VIRT_CPU_ACCOUNTING
344static cputime_t task_utime(struct task_struct *p)
345{
346 return p->utime;
347}
348
349static cputime_t task_stime(struct task_struct *p)
350{
351 return p->stime;
352}
353#else
354static cputime_t task_utime(struct task_struct *p)
355{
356 clock_t utime = cputime_to_clock_t(p->utime),
357 total = utime + cputime_to_clock_t(p->stime);
358 u64 temp;
359
360 /*
361 * Use CFS's precise accounting:
362 */
363 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
364
365 if (total) {
366 temp *= utime;
367 do_div(temp, total);
368 }
369 utime = (clock_t)temp;
370
371 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
372 return p->prev_utime;
373}
374
375static cputime_t task_stime(struct task_struct *p)
376{
377 clock_t stime;
378
379 /*
380 * Use CFS's precise accounting. (we subtract utime from
381 * the total, to make sure the total observed by userspace
382 * grows monotonically - apps rely on that):
383 */
384 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
385 cputime_to_clock_t(task_utime(p));
386
387 if (stime >= 0)
388 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
389
390 return p->prev_stime;
391}
392#endif
393
394static cputime_t task_gtime(struct task_struct *p)
395{
396 return p->gtime;
397}
398
399static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 333static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
400 struct pid *pid, struct task_struct *task, int whole) 334 struct pid *pid, struct task_struct *task, int whole)
401{ 335{
@@ -454,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
454 388
455 /* add up live thread stats at the group level */ 389 /* add up live thread stats at the group level */
456 if (whole) { 390 if (whole) {
391 struct task_cputime cputime;
457 struct task_struct *t = task; 392 struct task_struct *t = task;
458 do { 393 do {
459 min_flt += t->min_flt; 394 min_flt += t->min_flt;
460 maj_flt += t->maj_flt; 395 maj_flt += t->maj_flt;
461 utime = cputime_add(utime, task_utime(t));
462 stime = cputime_add(stime, task_stime(t));
463 gtime = cputime_add(gtime, task_gtime(t)); 396 gtime = cputime_add(gtime, task_gtime(t));
464 t = next_thread(t); 397 t = next_thread(t);
465 } while (t != task); 398 } while (t != task);
466 399
467 min_flt += sig->min_flt; 400 min_flt += sig->min_flt;
468 maj_flt += sig->maj_flt; 401 maj_flt += sig->maj_flt;
469 utime = cputime_add(utime, sig->utime); 402 thread_group_cputime(task, &cputime);
470 stime = cputime_add(stime, sig->stime); 403 utime = cputime.utime;
404 stime = cputime.stime;
471 gtime = cputime_add(gtime, sig->gtime); 405 gtime = cputime_add(gtime, sig->gtime);
472 } 406 }
473 407
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a28840b11b89..b5918ae8ca79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -148,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
148 return count; 148 return count;
149} 149}
150 150
151int maps_protect;
152EXPORT_SYMBOL(maps_protect);
153
154static struct fs_struct *get_fs_struct(struct task_struct *task) 151static struct fs_struct *get_fs_struct(struct task_struct *task)
155{ 152{
156 struct fs_struct *fs; 153 struct fs_struct *fs;
@@ -164,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
164 161
165static int get_nr_threads(struct task_struct *tsk) 162static int get_nr_threads(struct task_struct *tsk)
166{ 163{
167 /* Must be called with the rcu_read_lock held */
168 unsigned long flags; 164 unsigned long flags;
169 int count = 0; 165 int count = 0;
170 166
@@ -471,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
471 467
472 struct rlimit rlim[RLIM_NLIMITS]; 468 struct rlimit rlim[RLIM_NLIMITS];
473 469
474 rcu_read_lock(); 470 if (!lock_task_sighand(task, &flags))
475 if (!lock_task_sighand(task,&flags)) {
476 rcu_read_unlock();
477 return 0; 471 return 0;
478 }
479 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); 472 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
480 unlock_task_sighand(task, &flags); 473 unlock_task_sighand(task, &flags);
481 rcu_read_unlock();
482 474
483 /* 475 /*
484 * print the file header 476 * print the file header
@@ -2443,6 +2435,13 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2443} 2435}
2444#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2436#endif /* CONFIG_TASK_IO_ACCOUNTING */
2445 2437
2438static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2439 struct pid *pid, struct task_struct *task)
2440{
2441 seq_printf(m, "%08x\n", task->personality);
2442 return 0;
2443}
2444
2446/* 2445/*
2447 * Thread groups 2446 * Thread groups
2448 */ 2447 */
@@ -2459,6 +2458,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2459 REG("environ", S_IRUSR, environ), 2458 REG("environ", S_IRUSR, environ),
2460 INF("auxv", S_IRUSR, pid_auxv), 2459 INF("auxv", S_IRUSR, pid_auxv),
2461 ONE("status", S_IRUGO, pid_status), 2460 ONE("status", S_IRUGO, pid_status),
2461 ONE("personality", S_IRUSR, pid_personality),
2462 INF("limits", S_IRUSR, pid_limits), 2462 INF("limits", S_IRUSR, pid_limits),
2463#ifdef CONFIG_SCHED_DEBUG 2463#ifdef CONFIG_SCHED_DEBUG
2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -2794,6 +2794,7 @@ static const struct pid_entry tid_base_stuff[] = {
2794 REG("environ", S_IRUSR, environ), 2794 REG("environ", S_IRUSR, environ),
2795 INF("auxv", S_IRUSR, pid_auxv), 2795 INF("auxv", S_IRUSR, pid_auxv),
2796 ONE("status", S_IRUGO, pid_status), 2796 ONE("status", S_IRUGO, pid_status),
2797 ONE("personality", S_IRUSR, pid_personality),
2797 INF("limits", S_IRUSR, pid_limits), 2798 INF("limits", S_IRUSR, pid_limits),
2798#ifdef CONFIG_SCHED_DEBUG 2799#ifdef CONFIG_SCHED_DEBUG
2799 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2800 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -3088,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
3088 generic_fillattr(inode, stat); 3089 generic_fillattr(inode, stat);
3089 3090
3090 if (p) { 3091 if (p) {
3091 rcu_read_lock();
3092 stat->nlink += get_nr_threads(p); 3092 stat->nlink += get_nr_threads(p);
3093 rcu_read_unlock();
3094 put_task_struct(p); 3093 put_task_struct(p);
3095 } 3094 }
3096 3095
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 4fb81e9c94e3..7821589a17d5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -330,6 +330,7 @@ retry:
330 spin_lock(&proc_inum_lock); 330 spin_lock(&proc_inum_lock);
331 ida_remove(&proc_inum_ida, i); 331 ida_remove(&proc_inum_ida, i);
332 spin_unlock(&proc_inum_lock); 332 spin_unlock(&proc_inum_lock);
333 return 0;
333 } 334 }
334 return PROC_DYNAMIC_FIRST + i; 335 return PROC_DYNAMIC_FIRST + i;
335} 336}
@@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
546 547
547 for (tmp = dir->subdir; tmp; tmp = tmp->next) 548 for (tmp = dir->subdir; tmp; tmp = tmp->next)
548 if (strcmp(tmp->name, dp->name) == 0) { 549 if (strcmp(tmp->name, dp->name) == 0) {
549 printk(KERN_WARNING "proc_dir_entry '%s' already " 550 printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
550 "registered\n", dp->name); 551 dir->name, dp->name);
551 dump_stack(); 552 dump_stack();
552 break; 553 break;
553 } 554 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8bb03f056c28..c6b4fa7e3b49 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -342,7 +342,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
342 if (!pde->proc_fops) { 342 if (!pde->proc_fops) {
343 spin_unlock(&pde->pde_unload_lock); 343 spin_unlock(&pde->pde_unload_lock);
344 kfree(pdeo); 344 kfree(pdeo);
345 return rv; 345 return -EINVAL;
346 } 346 }
347 pde->pde_users++; 347 pde->pde_users++;
348 open = pde->proc_fops->open; 348 open = pde->proc_fops->open;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 442202314d53..3bfb7b8747b3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -45,8 +45,6 @@ do { \
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); 45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 46#endif
47 47
48extern int maps_protect;
49
50extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
51 struct pid *pid, struct task_struct *task); 49 struct pid *pid, struct task_struct *task);
52extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, 50extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded969862960..59ea42e1ef03 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
24#include <linux/tty.h> 24#include <linux/tty.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/quicklist.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/ioport.h> 29#include <linux/ioport.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
@@ -44,7 +45,6 @@
44#include <linux/blkdev.h> 45#include <linux/blkdev.h>
45#include <linux/hugetlb.h> 46#include <linux/hugetlb.h>
46#include <linux/jiffies.h> 47#include <linux/jiffies.h>
47#include <linux/sysrq.h>
48#include <linux/vmalloc.h> 48#include <linux/vmalloc.h>
49#include <linux/crash_dump.h> 49#include <linux/crash_dump.h>
50#include <linux/pid_namespace.h> 50#include <linux/pid_namespace.h>
@@ -67,7 +67,6 @@
67extern int get_hardware_list(char *); 67extern int get_hardware_list(char *);
68extern int get_stram_list(char *); 68extern int get_stram_list(char *);
69extern int get_exec_domain_list(char *); 69extern int get_exec_domain_list(char *);
70extern int get_dma_list(char *);
71 70
72static int proc_calc_metrics(char *page, char **start, off_t off, 71static int proc_calc_metrics(char *page, char **start, off_t off,
73 int count, int *eof, int len) 72 int count, int *eof, int len)
@@ -182,6 +181,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
182 "SReclaimable: %8lu kB\n" 181 "SReclaimable: %8lu kB\n"
183 "SUnreclaim: %8lu kB\n" 182 "SUnreclaim: %8lu kB\n"
184 "PageTables: %8lu kB\n" 183 "PageTables: %8lu kB\n"
184#ifdef CONFIG_QUICKLIST
185 "Quicklists: %8lu kB\n"
186#endif
185 "NFS_Unstable: %8lu kB\n" 187 "NFS_Unstable: %8lu kB\n"
186 "Bounce: %8lu kB\n" 188 "Bounce: %8lu kB\n"
187 "WritebackTmp: %8lu kB\n" 189 "WritebackTmp: %8lu kB\n"
@@ -214,6 +216,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
214 K(global_page_state(NR_SLAB_RECLAIMABLE)), 216 K(global_page_state(NR_SLAB_RECLAIMABLE)),
215 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 217 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
216 K(global_page_state(NR_PAGETABLE)), 218 K(global_page_state(NR_PAGETABLE)),
219#ifdef CONFIG_QUICKLIST
220 K(quicklist_total_size()),
221#endif
217 K(global_page_state(NR_UNSTABLE_NFS)), 222 K(global_page_state(NR_UNSTABLE_NFS)),
218 K(global_page_state(NR_BOUNCE)), 223 K(global_page_state(NR_BOUNCE)),
219 K(global_page_state(NR_WRITEBACK_TEMP)), 224 K(global_page_state(NR_WRITEBACK_TEMP)),
@@ -677,6 +682,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
677 return proc_calc_metrics(page, start, off, count, eof, len); 682 return proc_calc_metrics(page, start, off, count, eof, len);
678} 683}
679 684
685#ifdef CONFIG_FILE_LOCKING
680static int locks_open(struct inode *inode, struct file *filp) 686static int locks_open(struct inode *inode, struct file *filp)
681{ 687{
682 return seq_open(filp, &locks_seq_operations); 688 return seq_open(filp, &locks_seq_operations);
@@ -688,6 +694,7 @@ static const struct file_operations proc_locks_operations = {
688 .llseek = seq_lseek, 694 .llseek = seq_lseek,
689 .release = seq_release, 695 .release = seq_release,
690}; 696};
697#endif /* CONFIG_FILE_LOCKING */
691 698
692static int execdomains_read_proc(char *page, char **start, off_t off, 699static int execdomains_read_proc(char *page, char **start, off_t off,
693 int count, int *eof, void *data) 700 int count, int *eof, void *data)
@@ -696,28 +703,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off,
696 return proc_calc_metrics(page, start, off, count, eof, len); 703 return proc_calc_metrics(page, start, off, count, eof, len);
697} 704}
698 705
699#ifdef CONFIG_MAGIC_SYSRQ
700/*
701 * writing 'C' to /proc/sysrq-trigger is like sysrq-C
702 */
703static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
704 size_t count, loff_t *ppos)
705{
706 if (count) {
707 char c;
708
709 if (get_user(c, buf))
710 return -EFAULT;
711 __handle_sysrq(c, NULL, 0);
712 }
713 return count;
714}
715
716static const struct file_operations proc_sysrq_trigger_operations = {
717 .write = write_sysrq_trigger,
718};
719#endif
720
721#ifdef CONFIG_PROC_PAGE_MONITOR 706#ifdef CONFIG_PROC_PAGE_MONITOR
722#define KPMSIZE sizeof(u64) 707#define KPMSIZE sizeof(u64)
723#define KPMMASK (KPMSIZE - 1) 708#define KPMMASK (KPMSIZE - 1)
@@ -881,7 +866,9 @@ void __init proc_misc_init(void)
881#ifdef CONFIG_PRINTK 866#ifdef CONFIG_PRINTK
882 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); 867 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
883#endif 868#endif
869#ifdef CONFIG_FILE_LOCKING
884 proc_create("locks", 0, NULL, &proc_locks_operations); 870 proc_create("locks", 0, NULL, &proc_locks_operations);
871#endif
885 proc_create("devices", 0, NULL, &proc_devinfo_operations); 872 proc_create("devices", 0, NULL, &proc_devinfo_operations);
886 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); 873 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
887#ifdef CONFIG_BLOCK 874#ifdef CONFIG_BLOCK
@@ -924,7 +911,4 @@ void __init proc_misc_init(void)
924#ifdef CONFIG_PROC_VMCORE 911#ifdef CONFIG_PROC_VMCORE
925 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); 912 proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
926#endif 913#endif
927#ifdef CONFIG_MAGIC_SYSRQ
928 proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations);
929#endif
930} 914}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f9a8b892718f..945a81043ba2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -66,7 +66,7 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
66 return NULL; 66 return NULL;
67} 67}
68 68
69struct ctl_table_header *grab_header(struct inode *inode) 69static struct ctl_table_header *grab_header(struct inode *inode)
70{ 70{
71 if (PROC_I(inode)->sysctl) 71 if (PROC_I(inode)->sysctl)
72 return sysctl_head_grab(PROC_I(inode)->sysctl); 72 return sysctl_head_grab(PROC_I(inode)->sysctl);
@@ -395,10 +395,10 @@ static struct dentry_operations proc_sys_dentry_operations = {
395 .d_compare = proc_sys_compare, 395 .d_compare = proc_sys_compare,
396}; 396};
397 397
398static struct proc_dir_entry *proc_sys_root;
399
400int proc_sys_init(void) 398int proc_sys_init(void)
401{ 399{
400 struct proc_dir_entry *proc_sys_root;
401
402 proc_sys_root = proc_mkdir("sys", NULL); 402 proc_sys_root = proc_mkdir("sys", NULL);
403 proc_sys_root->proc_iops = &proc_sys_dir_operations; 403 proc_sys_root->proc_iops = &proc_sys_dir_operations;
404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 73d1891ee625..4806830ea2a1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,9 +210,6 @@ static int show_map(struct seq_file *m, void *v)
210 dev_t dev = 0; 210 dev_t dev = 0;
211 int len; 211 int len;
212 212
213 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
214 return -EACCES;
215
216 if (file) { 213 if (file) {
217 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 214 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
218 dev = inode->i_sb->s_dev; 215 dev = inode->i_sb->s_dev;
@@ -742,22 +739,11 @@ const struct file_operations proc_pagemap_operations = {
742#ifdef CONFIG_NUMA 739#ifdef CONFIG_NUMA
743extern int show_numa_map(struct seq_file *m, void *v); 740extern int show_numa_map(struct seq_file *m, void *v);
744 741
745static int show_numa_map_checked(struct seq_file *m, void *v)
746{
747 struct proc_maps_private *priv = m->private;
748 struct task_struct *task = priv->task;
749
750 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
751 return -EACCES;
752
753 return show_numa_map(m, v);
754}
755
756static const struct seq_operations proc_pid_numa_maps_op = { 742static const struct seq_operations proc_pid_numa_maps_op = {
757 .start = m_start, 743 .start = m_start,
758 .next = m_next, 744 .next = m_next,
759 .stop = m_stop, 745 .stop = m_stop,
760 .show = show_numa_map_checked 746 .show = show_numa_map,
761}; 747};
762 748
763static int numa_maps_open(struct inode *inode, struct file *file) 749static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d84e7121df8..219bd79ea894 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
110static int show_map(struct seq_file *m, void *_vml) 110static int show_map(struct seq_file *m, void *_vml)
111{ 111{
112 struct vm_list_struct *vml = _vml; 112 struct vm_list_struct *vml = _vml;
113 struct proc_maps_private *priv = m->private;
114 struct task_struct *task = priv->task;
115
116 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
117 return -EACCES;
118 113
119 return nommu_vma_show(m, vml->vma); 114 return nommu_vma_show(m, vml->vma);
120} 115}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9ac0f5e064e0..841368b87a29 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
165 return acc; 165 return acc;
166} 166}
167 167
168static int open_vmcore(struct inode *inode, struct file *filp)
169{
170 return 0;
171}
172
173const struct file_operations proc_vmcore_operations = { 168const struct file_operations proc_vmcore_operations = {
174 .read = read_vmcore, 169 .read = read_vmcore,
175 .open = open_vmcore,
176}; 170};
177 171
178static struct vmcore* __init get_new_element(void) 172static struct vmcore* __init get_new_element(void)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..5145cb9125af 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
58 * size 0 on the assumption that it's going to be used for an mmap of shared 58 * size 0 on the assumption that it's going to be used for an mmap of shared
59 * memory 59 * memory
60 */ 60 */
61static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 62{
63 struct pagevec lru_pvec; 63 struct pagevec lru_pvec;
64 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop, limit;
diff --git a/fs/readdir.c b/fs/readdir.c
index 4e026e5407fb..93a7559bbfd8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset
80 if (buf->result) 80 if (buf->result)
81 return -EINVAL; 81 return -EINVAL;
82 d_ino = ino; 82 d_ino = ino;
83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
84 buf->result = -EOVERFLOW;
84 return -EOVERFLOW; 85 return -EOVERFLOW;
86 }
85 buf->result++; 87 buf->result++;
86 dirent = buf->dirent; 88 dirent = buf->dirent;
87 if (!access_ok(VERIFY_WRITE, dirent, 89 if (!access_ok(VERIFY_WRITE, dirent,
@@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
155 if (reclen > buf->count) 157 if (reclen > buf->count)
156 return -EINVAL; 158 return -EINVAL;
157 d_ino = ino; 159 d_ino = ino;
158 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 160 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
161 buf->error = -EOVERFLOW;
159 return -EOVERFLOW; 162 return -EOVERFLOW;
163 }
160 dirent = buf->previous; 164 dirent = buf->previous;
161 if (dirent) { 165 if (dirent) {
162 if (__put_user(offset, &dirent->d_off)) 166 if (__put_user(offset, &dirent->d_off))
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index b9dbeeca7049..37173fa07d15 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -8,8 +8,6 @@
8 8
9/* proc info support a la one created by Sizif@Botik.RU for PGC */ 9/* proc info support a la one created by Sizif@Botik.RU for PGC */
10 10
11/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
12
13#include <linux/module.h> 11#include <linux/module.h>
14#include <linux/time.h> 12#include <linux/time.h>
15#include <linux/seq_file.h> 13#include <linux/seq_file.h>
@@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start,
621#endif 619#endif
622 620
623/* 621/*
624 * $Log: procfs.c,v $
625 * Revision 1.1.8.2 2001/07/15 17:08:42 god 622 * Revision 1.1.8.2 2001/07/15 17:08:42 god
626 * . use get_super() in procfs.c 623 * . use get_super() in procfs.c
627 * . remove remove_save_link() from reiserfs_do_truncate() 624 * . remove remove_save_link() from reiserfs_do_truncate()
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index bb3cb5b7cdb2..ad92461cbfc3 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
155 xadir = open_xa_dir(inode, flags); 155 xadir = open_xa_dir(inode, flags);
156 if (IS_ERR(xadir)) { 156 if (IS_ERR(xadir)) {
157 return ERR_CAST(xadir); 157 return ERR_CAST(xadir);
158 } else if (xadir && !xadir->d_inode) { 158 } else if (!xadir->d_inode) {
159 dput(xadir); 159 dput(xadir);
160 return ERR_PTR(-ENODATA); 160 return ERR_PTR(-ENODATA);
161 } 161 }
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5d54205e486b..bd20f7f5a933 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
108 goto Done; 108 goto Done;
109 } 109 }
110 /* we need at least one record in buffer */ 110 /* we need at least one record in buffer */
111 pos = m->index;
112 p = m->op->start(m, &pos);
111 while (1) { 113 while (1) {
112 pos = m->index;
113 p = m->op->start(m, &pos);
114 err = PTR_ERR(p); 114 err = PTR_ERR(p);
115 if (!p || IS_ERR(p)) 115 if (!p || IS_ERR(p))
116 break; 116 break;
@@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
119 break; 119 break;
120 if (unlikely(err)) 120 if (unlikely(err))
121 m->count = 0; 121 m->count = 0;
122 if (unlikely(!m->count)) {
123 p = m->op->next(m, p, &pos);
124 m->index = pos;
125 continue;
126 }
122 if (m->count < m->size) 127 if (m->count < m->size)
123 goto Fill; 128 goto Fill;
124 m->op->stop(m, p); 129 m->op->stop(m, p);
@@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
128 goto Enomem; 133 goto Enomem;
129 m->count = 0; 134 m->count = 0;
130 m->version = 0; 135 m->version = 0;
136 pos = m->index;
137 p = m->op->start(m, &pos);
131 } 138 }
132 m->op->stop(m, p); 139 m->op->stop(m, p);
133 m->count = 0; 140 m->count = 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09c..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
898 if (unlikely(!(out->f_mode & FMODE_WRITE))) 898 if (unlikely(!(out->f_mode & FMODE_WRITE)))
899 return -EBADF; 899 return -EBADF;
900 900
901 if (unlikely(out->f_flags & O_APPEND))
902 return -EINVAL;
903
901 ret = rw_verify_area(WRITE, out, ppos, len); 904 ret = rw_verify_area(WRITE, out, ppos, len);
902 if (unlikely(ret < 0)) 905 if (unlikely(ret < 0))
903 return ret; 906 return ret;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 006fc64227dd..66f6e58a7e4b 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
61 int size = dentry->d_inode->i_size; 61 int size = dentry->d_inode->i_size;
62 loff_t offs = *off; 62 loff_t offs = *off;
63 int count = min_t(size_t, bytes, PAGE_SIZE); 63 int count = min_t(size_t, bytes, PAGE_SIZE);
64 char *temp;
64 65
65 if (size) { 66 if (size) {
66 if (offs > size) 67 if (offs > size)
@@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
69 count = size - offs; 70 count = size - offs;
70 } 71 }
71 72
73 temp = kmalloc(count, GFP_KERNEL);
74 if (!temp)
75 return -ENOMEM;
76
72 mutex_lock(&bb->mutex); 77 mutex_lock(&bb->mutex);
73 78
74 count = fill_read(dentry, bb->buffer, offs, count); 79 count = fill_read(dentry, bb->buffer, offs, count);
75 if (count < 0) 80 if (count < 0) {
76 goto out_unlock; 81 mutex_unlock(&bb->mutex);
82 goto out_free;
83 }
77 84
78 if (copy_to_user(userbuf, bb->buffer, count)) { 85 memcpy(temp, bb->buffer, count);
86
87 mutex_unlock(&bb->mutex);
88
89 if (copy_to_user(userbuf, temp, count)) {
79 count = -EFAULT; 90 count = -EFAULT;
80 goto out_unlock; 91 goto out_free;
81 } 92 }
82 93
83 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); 94 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
84 95
85 *off = offs + count; 96 *off = offs + count;
86 97
87 out_unlock: 98 out_free:
88 mutex_unlock(&bb->mutex); 99 kfree(temp);
89 return count; 100 return count;
90} 101}
91 102
@@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
118 int size = dentry->d_inode->i_size; 129 int size = dentry->d_inode->i_size;
119 loff_t offs = *off; 130 loff_t offs = *off;
120 int count = min_t(size_t, bytes, PAGE_SIZE); 131 int count = min_t(size_t, bytes, PAGE_SIZE);
132 char *temp;
121 133
122 if (size) { 134 if (size) {
123 if (offs > size) 135 if (offs > size)
@@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf,
126 count = size - offs; 138 count = size - offs;
127 } 139 }
128 140
129 mutex_lock(&bb->mutex); 141 temp = kmalloc(count, GFP_KERNEL);
142 if (!temp)
143 return -ENOMEM;
130 144
131 if (copy_from_user(bb->buffer, userbuf, count)) { 145 if (copy_from_user(temp, userbuf, count)) {
132 count = -EFAULT; 146 count = -EFAULT;
133 goto out_unlock; 147 goto out_free;
134 } 148 }
135 149
150 mutex_lock(&bb->mutex);
151
152 memcpy(bb->buffer, temp, count);
153
136 count = flush_write(dentry, bb->buffer, offs, count); 154 count = flush_write(dentry, bb->buffer, offs, count);
155 mutex_unlock(&bb->mutex);
156
137 if (count > 0) 157 if (count > 0)
138 *off = offs + count; 158 *off = offs + count;
139 159
140 out_unlock: 160out_free:
141 mutex_unlock(&bb->mutex); 161 kfree(temp);
142 return count; 162 return count;
143} 163}
144 164
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index aedaeba82ae5..3a05a596e3b4 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
370 memset(acxt, 0, sizeof(*acxt)); 370 memset(acxt, 0, sizeof(*acxt));
371 acxt->parent_sd = parent_sd; 371 acxt->parent_sd = parent_sd;
372 372
373 /* Lookup parent inode. inode initialization and I_NEW 373 /* Lookup parent inode. inode initialization is protected by
374 * clearing are protected by sysfs_mutex. By grabbing it and 374 * sysfs_mutex, so inode existence can be determined by
375 * looking up with _nowait variant, inode state can be 375 * looking up inode while holding sysfs_mutex.
376 * determined reliably.
377 */ 376 */
378 mutex_lock(&sysfs_mutex); 377 mutex_lock(&sysfs_mutex);
379 378
380 inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, 379 inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
381 parent_sd); 380 parent_sd);
381 if (inode) {
382 WARN_ON(inode->i_state & I_NEW);
382 383
383 if (inode && !(inode->i_state & I_NEW)) {
384 /* parent inode available */ 384 /* parent inode available */
385 acxt->parent_inode = inode; 385 acxt->parent_inode = inode;
386 386
@@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
393 mutex_lock(&inode->i_mutex); 393 mutex_lock(&inode->i_mutex);
394 mutex_lock(&sysfs_mutex); 394 mutex_lock(&sysfs_mutex);
395 } 395 }
396 } else 396 }
397 iput(inode);
398} 397}
399 398
400/** 399/**
@@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
636 635
637 return sd; 636 return sd;
638} 637}
638EXPORT_SYMBOL_GPL(sysfs_get_dirent);
639 639
640static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, 640static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
641 const char *name, struct sysfs_dirent **p_sd) 641 const char *name, struct sysfs_dirent **p_sd)
@@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
829 if (!new_dentry) 829 if (!new_dentry)
830 goto out_unlock; 830 goto out_unlock;
831 831
832 /* rename kobject and sysfs_dirent */ 832 /* rename sysfs_dirent */
833 error = -ENOMEM; 833 error = -ENOMEM;
834 new_name = dup_name = kstrdup(new_name, GFP_KERNEL); 834 new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
835 if (!new_name) 835 if (!new_name)
836 goto out_unlock; 836 goto out_unlock;
837 837
838 error = kobject_set_name(kobj, "%s", new_name);
839 if (error)
840 goto out_unlock;
841
842 dup_name = sd->s_name; 838 dup_name = sd->s_name;
843 sd->s_name = new_name; 839 sd->s_name = new_name;
844 840
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c9e4e5091da1..1f4a3f877262 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -19,10 +19,18 @@
19#include <linux/poll.h> 19#include <linux/poll.h>
20#include <linux/list.h> 20#include <linux/list.h>
21#include <linux/mutex.h> 21#include <linux/mutex.h>
22#include <linux/limits.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include "sysfs.h" 25#include "sysfs.h"
25 26
27/* used in crash dumps to help with debugging */
28static char last_sysfs_file[PATH_MAX];
29void sysfs_printk_last_file(void)
30{
31 printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
32}
33
26/* 34/*
27 * There's one sysfs_buffer for each open file and one 35 * There's one sysfs_buffer for each open file and one
28 * sysfs_open_dirent for each sysfs_dirent with one or more open 36 * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
328 struct sysfs_buffer *buffer; 336 struct sysfs_buffer *buffer;
329 struct sysfs_ops *ops; 337 struct sysfs_ops *ops;
330 int error = -EACCES; 338 int error = -EACCES;
339 char *p;
340
341 p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
342 if (p)
343 memmove(last_sysfs_file, p, strlen(p) + 1);
331 344
332 /* need attr_sd for attr and ops, its parent for kobj */ 345 /* need attr_sd for attr and ops, its parent for kobj */
333 if (!sysfs_get_active_two(attr_sd)) 346 if (!sysfs_get_active_two(attr_sd))
@@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
440 return POLLERR|POLLPRI; 453 return POLLERR|POLLPRI;
441} 454}
442 455
443void sysfs_notify(struct kobject *k, char *dir, char *attr) 456void sysfs_notify_dirent(struct sysfs_dirent *sd)
457{
458 struct sysfs_open_dirent *od;
459
460 spin_lock(&sysfs_open_dirent_lock);
461
462 od = sd->s_attr.open;
463 if (od) {
464 atomic_inc(&od->event);
465 wake_up_interruptible(&od->poll);
466 }
467
468 spin_unlock(&sysfs_open_dirent_lock);
469}
470EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
471
472void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
444{ 473{
445 struct sysfs_dirent *sd = k->sd; 474 struct sysfs_dirent *sd = k->sd;
446 475
@@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
450 sd = sysfs_find_dirent(sd, dir); 479 sd = sysfs_find_dirent(sd, dir);
451 if (sd && attr) 480 if (sd && attr)
452 sd = sysfs_find_dirent(sd, attr); 481 sd = sysfs_find_dirent(sd, attr);
453 if (sd) { 482 if (sd)
454 struct sysfs_open_dirent *od; 483 sysfs_notify_dirent(sd);
455
456 spin_lock(&sysfs_open_dirent_lock);
457
458 od = sd->s_attr.open;
459 if (od) {
460 atomic_inc(&od->event);
461 wake_up_interruptible(&od->poll);
462 }
463
464 spin_unlock(&sysfs_open_dirent_lock);
465 }
466 484
467 mutex_unlock(&sysfs_mutex); 485 mutex_unlock(&sysfs_mutex);
468} 486}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 14f0023984d7..ab343e371d64 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -16,6 +16,7 @@
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h>
19 20
20#include "sysfs.h" 21#include "sysfs.h"
21 22
@@ -115,3 +116,17 @@ out_err:
115 sysfs_dir_cachep = NULL; 116 sysfs_dir_cachep = NULL;
116 goto out; 117 goto out;
117} 118}
119
120#undef sysfs_get
121struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
122{
123 return __sysfs_get(sd);
124}
125EXPORT_SYMBOL_GPL(sysfs_get);
126
127#undef sysfs_put
128void sysfs_put(struct sysfs_dirent *sd)
129{
130 __sysfs_put(sd);
131}
132EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index a5db496f71c7..93c6d6b27c4d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
124 struct sysfs_dirent **p_sd); 124 struct sysfs_dirent **p_sd);
125void sysfs_remove_subdir(struct sysfs_dirent *sd); 125void sysfs_remove_subdir(struct sysfs_dirent *sd);
126 126
127static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) 127static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
128{ 128{
129 if (sd) { 129 if (sd) {
130 WARN_ON(!atomic_read(&sd->s_count)); 130 WARN_ON(!atomic_read(&sd->s_count));
@@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
132 } 132 }
133 return sd; 133 return sd;
134} 134}
135#define sysfs_get(sd) __sysfs_get(sd)
135 136
136static inline void sysfs_put(struct sysfs_dirent *sd) 137static inline void __sysfs_put(struct sysfs_dirent *sd)
137{ 138{
138 if (sd && atomic_dec_and_test(&sd->s_count)) 139 if (sd && atomic_dec_and_test(&sd->s_count))
139 release_sysfs_dirent(sd); 140 release_sysfs_dirent(sd);
140} 141}
142#define sysfs_put(sd) __sysfs_put(sd)
141 143
142/* 144/*
143 * inode.c 145 * inode.c
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 154098157473..73db464cd08b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
302 int subtract_lebs; 302 int subtract_lebs;
303 long long available; 303 long long available;
304 304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used; 305 available = c->main_bytes - c->lst.total_used;
318 306
319 /* 307 /*
@@ -714,34 +702,106 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
714} 702}
715 703
716/** 704/**
717 * ubifs_budg_get_free_space - return amount of free space. 705 * ubifs_reported_space - calculate reported free space.
706 * @c: the UBIFS file-system description object
707 * @free: amount of free space
708 *
709 * This function calculates amount of free space which will be reported to
710 * user-space. User-space applications tend to expect that if the file-system
711 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
712 * are able to write a file of size N. UBIFS attaches node headers to each data
713 * node and it has to write indexing nodes as well. This introduces additional
714 * overhead, and UBIFS has to report slightly less free space to meet the
715 * above expectation.
716 *
717 * This function assumes free space is made up of uncompressed data nodes and
718 * full index nodes (one per data node, tripled because we always allow enough
719 * space to write the index thrice).
720 *
721 * Note, the calculation is pessimistic, which means that most of the time
722 * UBIFS reports less space than it actually has.
723 */
724long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
725{
726 int divisor, factor, f;
727
728 /*
729 * Reported space size is @free * X, where X is UBIFS block size
730 * divided by UBIFS block size + all overhead one data block
731 * introduces. The overhead is the node header + indexing overhead.
732 *
733 * Indexing overhead calculations are based on the following formula:
734 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
735 * of data nodes, f - fanout. Because the effective UBIFS fanout is half
736 * the maximum fanout, we assume that each data node
737 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
738 * Note, the multiplier 3 is because UBIFS reserves three times as much
739 * space for the index.
740 */
741 f = c->fanout > 3 ? c->fanout >> 1 : 2;
742 factor = UBIFS_BLOCK_SIZE;
743 divisor = UBIFS_MAX_DATA_NODE_SZ;
744 divisor += (c->max_idx_node_sz * 3) / (f - 1);
745 free *= factor;
746 do_div(free, divisor);
747 return free;
748}
749
750/**
751 * ubifs_get_free_space - return amount of free space.
718 * @c: UBIFS file-system description object 752 * @c: UBIFS file-system description object
719 * 753 *
720 * This function returns amount of free space on the file-system. 754 * This function calculates amount of free space to report to user-space.
755 *
756 * Because UBIFS may introduce substantial overhead (the index, node headers,
757 * alignment, wastage at the end of eraseblocks, etc), it cannot report real
758 * amount of free flash space it has (well, because not all dirty space is
759 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
760 * it would break user expectations about what free space is. Users seem to be
761 * accustomed to assume that if the file-system reports N bytes of free space,
762 * they would be able to fit a file of N bytes to the FS. This almost works for
763 * traditional file-systems, because they have way less overhead than UBIFS.
764 * So, to keep users happy, UBIFS tries to take the overhead into account.
721 */ 765 */
722long long ubifs_budg_get_free_space(struct ubifs_info *c) 766long long ubifs_get_free_space(struct ubifs_info *c)
723{ 767{
724 int min_idx_lebs, rsvd_idx_lebs; 768 int min_idx_lebs, rsvd_idx_lebs, lebs;
725 long long available, outstanding, free; 769 long long available, outstanding, free;
726 770
727 /* Do exactly the same calculations as in 'do_budget_space()' */
728 spin_lock(&c->space_lock); 771 spin_lock(&c->space_lock);
729 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 772 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
773 outstanding = c->budg_data_growth + c->budg_dd_growth;
730 774
731 if (min_idx_lebs > c->lst.idx_lebs) 775 /*
732 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 776 * Force the amount available to the total size reported if the used
733 else 777 * space is zero.
734 rsvd_idx_lebs = 0; 778 */
735 779 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
736 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
737 - c->lst.taken_empty_lebs) {
738 spin_unlock(&c->space_lock); 780 spin_unlock(&c->space_lock);
739 return 0; 781 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
740 } 782 }
741 783
742 available = ubifs_calc_available(c, min_idx_lebs); 784 available = ubifs_calc_available(c, min_idx_lebs);
743 outstanding = c->budg_data_growth + c->budg_dd_growth; 785
744 c->min_idx_lebs = min_idx_lebs; 786 /*
787 * When reporting free space to user-space, UBIFS guarantees that it is
788 * possible to write a file of free space size. This means that for
789 * empty LEBs we may use more precise calculations than
790 * 'ubifs_calc_available()' is using. Namely, we know that in empty
791 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
792 * Thus, amend the available space.
793 *
794 * Note, the calculations below are similar to what we have in
795 * 'do_budget_space()', so refer there for comments.
796 */
797 if (min_idx_lebs > c->lst.idx_lebs)
798 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
799 else
800 rsvd_idx_lebs = 0;
801 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
802 c->lst.taken_empty_lebs;
803 lebs -= rsvd_idx_lebs;
804 available += lebs * (c->dark_wm - c->leb_overhead);
745 spin_unlock(&c->space_lock); 805 spin_unlock(&c->space_lock);
746 806
747 if (available > outstanding) 807 if (available > outstanding)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b9cb77473758..d7f7645779f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++) 539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n", 540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i])); 541 (unsigned long long)le64_to_cpu(orph->inos[i]));
542 break; 542 break;
543 } 543 }
544 default: 544 default:
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c96f1fb7016..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
426 426
427 while (1) { 427 while (1) {
428 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 428 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
429 dent->name, le64_to_cpu(dent->inum), 429 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
430 key_hash_flash(c, &dent->key)); 430 key_hash_flash(c, &dent->key));
431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); 431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
432 432
@@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
587 if (err) { 587 if (err) {
588 if (err != -ENOSPC) 588 if (err != -ENOSPC)
589 return err; 589 return err;
590 err = 0;
591 budgeted = 0; 590 budgeted = 0;
592 } 591 }
593 592
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4071d1cae29f..3d698e2022b1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
793 int err; 793 int err;
794 struct ubifs_budget_req req; 794 struct ubifs_budget_req req;
795 loff_t old_size = inode->i_size, new_size = attr->ia_size; 795 loff_t old_size = inode->i_size, new_size = attr->ia_size;
796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1); 796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
797 struct ubifs_inode *ui = ubifs_inode(inode); 797 struct ubifs_inode *ui = ubifs_inode(inode);
798 798
799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); 799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
811 /* A funny way to budget for truncation node */ 811 /* A funny way to budget for truncation node */
812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; 812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
813 err = ubifs_budget_space(c, &req); 813 err = ubifs_budget_space(c, &req);
814 if (err) 814 if (err) {
815 return err; 815 /*
816 * Treat truncations to zero as deletion and always allow them,
817 * just like we do for '->unlink()'.
818 */
819 if (new_size || err != -ENOSPC)
820 return err;
821 budgeted = 0;
822 }
816 823
817 err = vmtruncate(inode, new_size); 824 err = vmtruncate(inode, new_size);
818 if (err) 825 if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
869 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 876 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
870 mutex_unlock(&ui->ui_mutex); 877 mutex_unlock(&ui->ui_mutex);
871out_budg: 878out_budg:
872 ubifs_release_budget(c, &req); 879 if (budgeted)
880 ubifs_release_budget(c, &req);
881 else {
882 c->nospace = c->nospace_rp = 0;
883 smp_wmb();
884 }
873 return err; 885 return err;
874} 886}
875 887
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index adee7b5ddeab..47814cde2407 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty 211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria. 212 * or do not have an LEB which satisfies the @min_space criteria.
213 * 213 *
214 * Note: 214 * Note, LEBs which have less than dead watermark of free + dirty space are
215 * o LEBs which have less than dead watermark of dirty space are never picked 215 * never picked by this function.
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 * 216 *
223 * The additional @pick_free argument controls if this function has to return a 217 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must to set it to %1, 218 * free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
231 * 225 *
232 * In addition @pick_free is set to %2 by the recovery process in order to 226 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned. 227 * recover gc_lnum in which case an index LEB must not be returned.
228 *
229 * This function returns zero and the LEB properties of found dirty LEB in case
230 * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
231 * case of other failures. The returned LEB is marked as "taken".
234 */ 232 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 233int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free) 234 int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
245 int lebs, rsvd_idx_lebs = 0; 243 int lebs, rsvd_idx_lebs = 0;
246 244
247 spin_lock(&c->space_lock); 245 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs; 246 lebs = c->lst.empty_lebs + c->idx_gc_cnt;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs; 247 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250 248
251 /* 249 /*
@@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
317 lp = idx_lp; 315 lp = idx_lp;
318 316
319 if (lp) { 317 if (lp) {
320 ubifs_assert(lp->dirty >= c->dead_wm); 318 ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
321 goto found; 319 goto found;
322 } 320 }
323 321
@@ -509,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
509 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
510 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
511 c->lst.taken_empty_lebs; 509 c->lst.taken_empty_lebs;
512 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
513 if (rsvd_idx_lebs < lebs) 510 if (rsvd_idx_lebs < lebs)
514 /* 511 /*
515 * OK to allocate an empty LEB, but we still don't want to go 512 * OK to allocate an empty LEB, but we still don't want to go
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..02aba36fe3d4 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
334 334
335 err = move_nodes(c, sleb); 335 err = move_nodes(c, sleb);
336 if (err) 336 if (err)
337 goto out; 337 goto out_inc_seq;
338 338
339 err = gc_sync_wbufs(c); 339 err = gc_sync_wbufs(c);
340 if (err) 340 if (err)
341 goto out; 341 goto out_inc_seq;
342 342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); 343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err) 344 if (err)
345 goto out; 345 goto out_inc_seq;
346
347 /* Allow for races with TNC */
348 c->gced_lnum = lnum;
349 smp_wmb();
350 c->gc_seq += 1;
351 smp_wmb();
346 352
347 if (c->gc_lnum == -1) { 353 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum; 354 c->gc_lnum = lnum;
@@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
363out: 369out:
364 ubifs_scan_destroy(sleb); 370 ubifs_scan_destroy(sleb);
365 return err; 371 return err;
372
373out_inc_seq:
374 /* We may have moved at least some nodes so allow for races with TNC */
375 c->gced_lnum = lnum;
376 smp_wmb();
377 c->gc_seq += 1;
378 smp_wmb();
379 goto out;
366} 380}
367 381
368/** 382/**
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 87dabf9fe742..4c12a9215d7f 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -284,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
284} 284}
285 285
286/** 286/**
287 * ubifs_reported_space - calculate reported free space.
288 * @c: the UBIFS file-system description object
289 * @free: amount of free space
290 *
291 * This function calculates amount of free space which will be reported to
292 * user-space. User-space application tend to expect that if the file-system
293 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
294 * are able to write a file of size N. UBIFS attaches node headers to each data
295 * node and it has to write indexind nodes as well. This introduces additional
296 * overhead, and UBIFS it has to report sligtly less free space to meet the
297 * above expectetion.
298 *
299 * This function assumes free space is made up of uncompressed data nodes and
300 * full index nodes (one per data node, doubled because we always allow enough
301 * space to write the index twice).
302 *
303 * Note, the calculation is pessimistic, which means that most of the time
304 * UBIFS reports less space than it actually has.
305 */
306static inline long long ubifs_reported_space(const struct ubifs_info *c,
307 uint64_t free)
308{
309 int divisor, factor;
310
311 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3);
312 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
313 do_div(free, divisor);
314
315 return free * factor;
316}
317
318/**
319 * ubifs_current_time - round current time to time granularity. 287 * ubifs_current_time - round current time to time granularity.
320 * @inode: inode 288 * @inode: inode
321 */ 289 */
@@ -325,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
325 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 293 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
326} 294}
327 295
296/**
297 * ubifs_tnc_lookup - look up a file-system node.
298 * @c: UBIFS file-system description object
299 * @key: node key to lookup
300 * @node: the node is returned here
301 *
302 * This function looks up and reads the node with key @key. The caller has to make
303 * sure the @node buffer is large enough to fit the node. Returns zero in case
304 * of success, %-ENOENT if the node was not found, and a negative error code in
305 * case of failure.
306 */
307static inline int ubifs_tnc_lookup(struct ubifs_info *c,
308 const union ubifs_key *key, void *node)
309{
310 return ubifs_tnc_locate(c, key, node, NULL, NULL);
311}
312
328#endif /* __UBIFS_MISC_H__ */ 313#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f71e6b8822c4..9a9220333b3b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -370,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
370{ 370{
371 struct ubifs_info *c = dentry->d_sb->s_fs_info; 371 struct ubifs_info *c = dentry->d_sb->s_fs_info;
372 unsigned long long free; 372 unsigned long long free;
373 __le32 *uuid = (__le32 *)c->uuid;
373 374
374 free = ubifs_budg_get_free_space(c); 375 free = ubifs_get_free_space(c);
375 dbg_gen("free space %lld bytes (%lld blocks)", 376 dbg_gen("free space %lld bytes (%lld blocks)",
376 free, free >> UBIFS_BLOCK_SHIFT); 377 free, free >> UBIFS_BLOCK_SHIFT);
377 378
@@ -386,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
386 buf->f_files = 0; 387 buf->f_files = 0;
387 buf->f_ffree = 0; 388 buf->f_ffree = 0;
388 buf->f_namelen = UBIFS_MAX_NLEN; 389 buf->f_namelen = UBIFS_MAX_NLEN;
389 390 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
391 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
390 return 0; 392 return 0;
391} 393}
392 394
@@ -530,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)
530 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 532 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
531 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 533 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
532 534
535 /*
536 * Calculate how many bytes would be wasted at the end of LEB if it was
537 * fully filled with data nodes of maximum size. This is used in
538 * calculations when reporting free space.
539 */
540 c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
533 return 0; 541 return 0;
534} 542}
535 543
@@ -647,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)
647 * internally because it does not make much sense for UBIFS, but it is 655 * internally because it does not make much sense for UBIFS, but it is
648 * necessary to report something for the 'statfs()' call. 656 * necessary to report something for the 'statfs()' call.
649 * 657 *
650 * Subtract the LEB reserved for GC and the LEB which is reserved for 658 * Subtract the LEB reserved for GC, the LEB which is reserved for
651 * deletions. 659 * deletions, and assume only one journal head is available.
652 *
653 * Review 'ubifs_calc_available()' if changing this calculation.
654 */ 660 */
655 tmp64 = c->main_lebs - 2; 661 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
656 tmp64 *= (uint64_t)c->leb_size - c->dark_wm; 662 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
657 tmp64 = ubifs_reported_space(c, tmp64); 663 tmp64 = ubifs_reported_space(c, tmp64);
658 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 664 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
659 665
@@ -842,7 +848,7 @@ enum {
842 Opt_err, 848 Opt_err,
843}; 849};
844 850
845static match_table_t tokens = { 851static const match_table_t tokens = {
846 {Opt_fast_unmount, "fast_unmount"}, 852 {Opt_fast_unmount, "fast_unmount"},
847 {Opt_norm_unmount, "norm_unmount"}, 853 {Opt_norm_unmount, "norm_unmount"},
848 {Opt_err, NULL}, 854 {Opt_err, NULL},
@@ -1018,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
1018 goto out_dereg; 1024 goto out_dereg;
1019 } 1025 }
1020 1026
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1021 if (!mounted_read_only) { 1028 if (!mounted_read_only) {
1022 err = alloc_wbufs(c); 1029 err = alloc_wbufs(c);
1023 if (err) 1030 if (err)
1024 goto out_cbuf; 1031 goto out_cbuf;
1025 1032
1026 /* Create background thread */ 1033 /* Create background thread */
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
1028 c->vi.vol_id);
1029 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1034 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1030 if (!c->bgt) 1035 if (!c->bgt)
1031 c->bgt = ERR_PTR(-EINVAL); 1036 c->bgt = ERR_PTR(-EINVAL);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..7634c5970887 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506 if (keys_cmp(c, key, &node_key) != 0) 506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0; 507 ret = 0;
508 } 508 }
509 if (ret == 0) 509 if (ret == 0 && c->replaying)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret; 512 return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1382} 1382}
1383 1383
1384/** 1384/**
1385 * ubifs_tnc_lookup - look up a file-system node. 1385 * maybe_leb_gced - determine if a LEB may have been garbage collected.
1386 * @c: UBIFS file-system description object 1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup 1387 * @lnum: LEB number
1388 * @node: the node is returned here 1388 * @gc_seq1: garbage collection sequence number
1389 * 1389 *
1390 * This function look up and reads node with key @key. The caller has to make 1390 * This function determines if @lnum may have been garbage collected since
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case 1391 * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
1392 * of success, %-ENOENT if the node was not found, and a negative error code in 1392 * %0 is returned.
1393 * case of failure.
1394 */ 1393 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, 1394static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1396 void *node)
1397{ 1395{
1398 int found, n, err; 1396 int gc_seq2, gced_lnum;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401 1397
1402 mutex_lock(&c->tnc_mutex); 1398 gced_lnum = c->gced_lnum;
1403 found = ubifs_lookup_level0(c, key, &znode, &n); 1399 smp_rmb();
1404 if (!found) { 1400 gc_seq2 = c->gc_seq;
1405 err = -ENOENT; 1401 /* Same seq means no GC */
1406 goto out; 1402 if (gc_seq1 == gc_seq2)
1407 } else if (found < 0) { 1403 return 0;
1408 err = found; 1404 /* Different by more than 1 means we don't know */
1409 goto out; 1405 if (gc_seq1 + 1 != gc_seq2)
1410 } 1406 return 1;
1411 zt = &znode->zbranch[n]; 1407 /*
1412 if (is_hash_key(c, key)) { 1408 * We have seen the sequence number has increased by 1. Now we need to
1413 /* 1409 * be sure we read the right LEB number, so read it again.
1414 * In this case the leaf node cache gets used, so we pass the 1410 */
1415 * address of the zbranch and keep the mutex locked 1411 smp_rmb();
1416 */ 1412 if (gced_lnum != c->gced_lnum)
1417 err = tnc_read_node_nm(c, zt, node); 1413 return 1;
1418 goto out; 1414 /* Finally we can check lnum */
1419 } 1415 if (gced_lnum == lnum)
1420 zbr = znode->zbranch[n]; 1416 return 1;
1421 mutex_unlock(&c->tnc_mutex); 1417 return 0;
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429} 1418}
1430 1419
1431/** 1420/**
@@ -1436,16 +1425,19 @@ out:
1436 * @lnum: LEB number is returned here 1425 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here 1426 * @offs: offset is returned here
1438 * 1427 *
1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node 1428 * This function looks up and reads the node with key @key. The caller has to make
1440 * location also. See 'ubifs_tnc_lookup()'. 1429 * sure the @node buffer is large enough to fit the node. Returns zero in case
1430 * of success, %-ENOENT if the node was not found, and a negative error code in
1431 * case of failure. The node location can be returned in @lnum and @offs.
1441 */ 1432 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1433int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs) 1434 void *node, int *lnum, int *offs)
1444{ 1435{
1445 int found, n, err; 1436 int found, n, err, safely = 0, gc_seq1;
1446 struct ubifs_znode *znode; 1437 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt; 1438 struct ubifs_zbranch zbr, *zt;
1448 1439
1440again:
1449 mutex_lock(&c->tnc_mutex); 1441 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n); 1442 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) { 1443 if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1456 goto out; 1448 goto out;
1457 } 1449 }
1458 zt = &znode->zbranch[n]; 1450 zt = &znode->zbranch[n];
1451 if (lnum) {
1452 *lnum = zt->lnum;
1453 *offs = zt->offs;
1454 }
1459 if (is_hash_key(c, key)) { 1455 if (is_hash_key(c, key)) {
1460 /* 1456 /*
1461 * In this case the leaf node cache gets used, so we pass the 1457 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked 1458 * address of the zbranch and keep the mutex locked
1463 */ 1459 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node); 1460 err = tnc_read_node_nm(c, zt, node);
1467 goto out; 1461 goto out;
1468 } 1462 }
1463 if (safely) {
1464 err = ubifs_tnc_read_node(c, zt, node);
1465 goto out;
1466 }
1467 /* Drop the TNC mutex prematurely and race with garbage collection */
1469 zbr = znode->zbranch[n]; 1468 zbr = znode->zbranch[n];
1469 gc_seq1 = c->gc_seq;
1470 mutex_unlock(&c->tnc_mutex); 1470 mutex_unlock(&c->tnc_mutex);
1471 1471
1472 *lnum = zbr.lnum; 1472 if (ubifs_get_wbuf(c, zbr.lnum)) {
1473 *offs = zbr.offs; 1473 /* We do not GC journal heads */
1474 err = ubifs_tnc_read_node(c, &zbr, node);
1475 return err;
1476 }
1474 1477
1475 err = ubifs_tnc_read_node(c, &zbr, node); 1478 err = fallible_read_node(c, key, &zbr, node);
1476 return err; 1479 if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
1480 /*
1481 * The node may have been GC'ed out from under us so try again
1482 * while keeping the TNC mutex locked.
1483 */
1484 safely = 1;
1485 goto again;
1486 }
1487 return 0;
1477 1488
1478out: 1489out:
1479 mutex_unlock(&c->tnc_mutex); 1490 mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1498{ 1509{
1499 int found, n, err; 1510 int found, n, err;
1500 struct ubifs_znode *znode; 1511 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502 1512
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1513 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex); 1514 mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1522 goto out_unlock; 1532 goto out_unlock;
1523 } 1533 }
1524 1534
1525 zbr = znode->zbranch[n]; 1535 err = tnc_read_node_nm(c, &znode->zbranch[n], node);
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530 1536
1531out_unlock: 1537out_unlock:
1532 mutex_unlock(&c->tnc_mutex); 1538 mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index bd2121f3426e..a9ecbd9af20d 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
87#define UBIFS_SK_LEN 8 87#define UBIFS_SK_LEN 8
88 88
89/* Minimum index tree fanout */ 89/* Minimum index tree fanout */
90#define UBIFS_MIN_FANOUT 2 90#define UBIFS_MIN_FANOUT 3
91 91
92/* Maximum number of levels in UBIFS indexing B-tree */ 92/* Maximum number of levels in UBIFS indexing B-tree */
93#define UBIFS_MAX_LEVELS 512 93#define UBIFS_MAX_LEVELS 512
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d7f706f7a302..17c620b93eec 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -995,6 +995,9 @@ struct ubifs_mount_opts {
995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
996 * @max_inode_sz: maximum possible inode size in bytes 996 * @max_inode_sz: maximum possible inode size in bytes
997 * @max_znode_sz: size of znode in bytes 997 * @max_znode_sz: size of znode in bytes
998 *
999 * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
1000 * data nodes of maximum size - used in free space reporting
998 * @dead_wm: LEB dead space watermark 1001 * @dead_wm: LEB dead space watermark
999 * @dark_wm: LEB dark space watermark 1002 * @dark_wm: LEB dark space watermark
1000 * @block_cnt: count of 4KiB blocks on the FS 1003 * @block_cnt: count of 4KiB blocks on the FS
@@ -1028,6 +1031,8 @@ struct ubifs_mount_opts {
1028 * @sbuf: a buffer of LEB size used by GC and replay for scanning 1031 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1029 * @idx_gc: list of index LEBs that have been garbage collected 1032 * @idx_gc: list of index LEBs that have been garbage collected
1030 * @idx_gc_cnt: number of elements on the idx_gc list 1033 * @idx_gc_cnt: number of elements on the idx_gc list
1034 * @gc_seq: incremented for every non-index LEB garbage collected
1035 * @gced_lnum: last non-index LEB that was garbage collected
1031 * 1036 *
1032 * @infos_list: links all 'ubifs_info' objects 1037 * @infos_list: links all 'ubifs_info' objects
1033 * @umount_mutex: serializes shrinker and un-mount 1038 * @umount_mutex: serializes shrinker and un-mount
@@ -1224,6 +1229,8 @@ struct ubifs_info {
1224 int max_idx_node_sz; 1229 int max_idx_node_sz;
1225 long long max_inode_sz; 1230 long long max_inode_sz;
1226 int max_znode_sz; 1231 int max_znode_sz;
1232
1233 int leb_overhead;
1227 int dead_wm; 1234 int dead_wm;
1228 int dark_wm; 1235 int dark_wm;
1229 int block_cnt; 1236 int block_cnt;
@@ -1257,6 +1264,8 @@ struct ubifs_info {
1257 void *sbuf; 1264 void *sbuf;
1258 struct list_head idx_gc; 1265 struct list_head idx_gc;
1259 int idx_gc_cnt; 1266 int idx_gc_cnt;
1267 volatile int gc_seq;
1268 volatile int gced_lnum;
1260 1269
1261 struct list_head infos_list; 1270 struct list_head infos_list;
1262 struct mutex umount_mutex; 1271 struct mutex umount_mutex;
@@ -1434,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1434 struct ubifs_budget_req *req); 1443 struct ubifs_budget_req *req);
1435void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1444void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1436 struct ubifs_budget_req *req); 1445 struct ubifs_budget_req *req);
1437long long ubifs_budg_get_free_space(struct ubifs_info *c); 1446long long ubifs_get_free_space(struct ubifs_info *c);
1438int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1447int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1439void ubifs_convert_page_budget(struct ubifs_info *c); 1448void ubifs_convert_page_budget(struct ubifs_info *c);
1449long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
1440long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1450long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1441 1451
1442/* find.c */ 1452/* find.c */
@@ -1451,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1451/* tnc.c */ 1461/* tnc.c */
1452int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, 1462int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1453 struct ubifs_znode **zn, int *n); 1463 struct ubifs_znode **zn, int *n);
1454int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1455 void *node);
1456int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, 1464int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1457 void *node, const struct qstr *nm); 1465 void *node, const struct qstr *nm);
1458int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1466int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = udf_fsync_file,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek,
214}; 215};
215 216
216const struct inode_operations udf_file_inode_operations = { 217const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
76 *err = -ENOSPC; 76 *err = -ENOSPC;
77 77
78 iinfo = UDF_I(inode); 78 iinfo = UDF_I(inode);
79 iinfo->i_unique = 0; 79 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
80 iinfo->i_lenExtents = 0; 80 iinfo->i_efe = 1;
81 iinfo->i_next_alloc_block = 0; 81 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
82 iinfo->i_next_alloc_goal = 0; 82 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
83 iinfo->i_strat4096 = 0; 83 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
84 sizeof(struct extendedFileEntry),
85 GFP_KERNEL);
86 } else {
87 iinfo->i_efe = 0;
88 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
89 sizeof(struct fileEntry),
90 GFP_KERNEL);
91 }
92 if (!iinfo->i_ext.i_data) {
93 iput(inode);
94 *err = -ENOMEM;
95 return NULL;
96 }
84 97
85 block = udf_new_block(dir->i_sb, NULL, 98 block = udf_new_block(dir->i_sb, NULL,
86 dinfo->i_location.partitionReferenceNum, 99 dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
111 lvhd->uniqueID = cpu_to_le64(uniqueID); 124 lvhd->uniqueID = cpu_to_le64(uniqueID);
112 mark_buffer_dirty(sbi->s_lvid_bh); 125 mark_buffer_dirty(sbi->s_lvid_bh);
113 } 126 }
127 mutex_unlock(&sbi->s_alloc_mutex);
114 inode->i_mode = mode; 128 inode->i_mode = mode;
115 inode->i_uid = current->fsuid; 129 inode->i_uid = current->fsuid;
116 if (dir->i_mode & S_ISGID) { 130 if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
129 iinfo->i_lenEAttr = 0; 143 iinfo->i_lenEAttr = 0;
130 iinfo->i_lenAlloc = 0; 144 iinfo->i_lenAlloc = 0;
131 iinfo->i_use = 0; 145 iinfo->i_use = 0;
132 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
133 iinfo->i_efe = 1;
134 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
135 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
136 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
137 sizeof(struct extendedFileEntry),
138 GFP_KERNEL);
139 } else {
140 iinfo->i_efe = 0;
141 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
142 sizeof(struct fileEntry),
143 GFP_KERNEL);
144 }
145 if (!iinfo->i_ext.i_data) {
146 iput(inode);
147 *err = -ENOMEM;
148 mutex_unlock(&sbi->s_alloc_mutex);
149 return NULL;
150 }
151 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
152 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 147 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
153 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
158 iinfo->i_crtime = current_fs_time(inode->i_sb); 153 iinfo->i_crtime = current_fs_time(inode->i_sb);
159 insert_inode_hash(inode); 154 insert_inode_hash(inode);
160 mark_inode_dirty(inode); 155 mark_inode_dirty(inode);
161 mutex_unlock(&sbi->s_alloc_mutex);
162 156
163 if (DQUOT_ALLOC_INODE(inode)) { 157 if (DQUOT_ALLOC_INODE(inode)) {
164 DQUOT_DROP(inode); 158 DQUOT_DROP(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5698bbf83bbf..e25e7010627b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -369,7 +369,7 @@ enum {
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
370}; 370};
371 371
372static match_table_t tokens = { 372static const match_table_t tokens = {
373 {Opt_novrs, "novrs"}, 373 {Opt_novrs, "novrs"},
374 {Opt_nostrict, "nostrict"}, 374 {Opt_nostrict, "nostrict"},
375 {Opt_bs, "bs=%u"}, 375 {Opt_bs, "bs=%u"},
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3141969b456d..e65212dfb60e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -309,7 +309,7 @@ enum {
309 Opt_err 309 Opt_err
310}; 310};
311 311
312static match_table_t tokens = { 312static const match_table_t tokens = {
313 {Opt_type_old, "ufstype=old"}, 313 {Opt_type_old, "ufstype=old"},
314 {Opt_type_sunx86, "ufstype=sunx86"}, 314 {Opt_type_sunx86, "ufstype=sunx86"},
315 {Opt_type_sun, "ufstype=sun"}, 315 {Opt_type_sun, "ufstype=sun"},
@@ -1233,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
1233{ 1233{
1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); 1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; 1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
1236 struct match_token *tp = tokens; 1236 const struct match_token *tp = tokens;
1237 1237
1238 while (tp->token != Opt_onerror_panic && tp->token != mval) 1238 while (tp->token != Opt_onerror_panic && tp->token != mval)
1239 ++tp; 1239 ++tp;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index f42f80a3b1fa..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1338,6 +1338,10 @@ __xfs_get_blocks(
1338 offset = (xfs_off_t)iblock << inode->i_blkbits; 1338 offset = (xfs_off_t)iblock << inode->i_blkbits;
1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1340 size = bh_result->b_size; 1340 size = bh_result->b_size;
1341
1342 if (!create && direct && offset >= i_size_read(inode))
1343 return 0;
1344
1341 error = xfs_iomap(XFS_I(inode), offset, size, 1345 error = xfs_iomap(XFS_I(inode), offset, size,
1342 create ? flags : BMAPI_READ, &iomap, &niomap); 1346 create ? flags : BMAPI_READ, &iomap, &niomap);
1343 if (error) 1347 if (error)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 986061ae1b9b..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1001,12 +1001,13 @@ xfs_buf_iodone_work(
1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1002 * ordered flag and reissue them. Because we can't tell the higher 1002 * ordered flag and reissue them. Because we can't tell the higher
1003 * layers directly that they should not issue ordered I/O anymore, they 1003 * layers directly that they should not issue ordered I/O anymore, they
1004 * need to check if the ordered flag was cleared during I/O completion. 1004 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1005 */ 1005 */
1006 if ((bp->b_error == EOPNOTSUPP) && 1006 if ((bp->b_error == EOPNOTSUPP) &&
1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1009 bp->b_flags &= ~XBF_ORDERED; 1009 bp->b_flags &= ~XBF_ORDERED;
1010 bp->b_flags |= _XFS_BARRIER_FAILED;
1010 xfs_buf_iorequest(bp); 1011 xfs_buf_iorequest(bp);
1011 } else if (bp->b_iodone) 1012 } else if (bp->b_iodone)
1012 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index fe0109956656..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88
89 /*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95 _XFS_BARRIER_FAILED = (1 << 23),
88} xfs_buf_flags_t; 96} xfs_buf_flags_t;
89 97
90typedef enum { 98typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5f60363b9343..5311c1acdd40 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = {
475const struct file_operations xfs_dir_file_operations = { 475const struct file_operations xfs_dir_file_operations = {
476 .read = generic_read_dir, 476 .read = generic_read_dir,
477 .readdir = xfs_file_readdir, 477 .readdir = xfs_file_readdir,
478 .llseek = generic_file_llseek,
478 .unlocked_ioctl = xfs_file_ioctl, 479 .unlocked_ioctl = xfs_file_ioctl,
479#ifdef CONFIG_COMPAT 480#ifdef CONFIG_COMPAT
480 .compat_ioctl = xfs_file_compat_ioctl, 481 .compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 91bcd979242c..095d271f3434 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -355,7 +355,7 @@ xfs_vn_ci_lookup(
355 /* else case-insensitive match... */ 355 /* else case-insensitive match... */
356 dname.name = ci_name.name; 356 dname.name = ci_name.name;
357 dname.len = ci_name.len; 357 dname.len = ci_name.len;
358 dentry = d_add_ci(VFS_I(ip), dentry, &dname); 358 dentry = d_add_ci(dentry, VFS_I(ip), &dname);
359 kmem_free(ci_name.name); 359 kmem_free(ci_name.name);
360 return dentry; 360 return dentry;
361} 361}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 73c65f19e549..e39013619b26 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -158,7 +158,7 @@ enum {
158 Opt_barrier, Opt_nobarrier, Opt_err 158 Opt_barrier, Opt_nobarrier, Opt_err
159}; 159};
160 160
161static match_table_t tokens = { 161static const match_table_t tokens = {
162 {Opt_barrier, "barrier"}, 162 {Opt_barrier, "barrier"},
163 {Opt_nobarrier, "nobarrier"}, 163 {Opt_nobarrier, "nobarrier"},
164 {Opt_err, NULL} 164 {Opt_err, NULL}
@@ -1302,9 +1302,29 @@ xfs_fs_remount(
1302 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1303 break; 1303 break;
1304 default: 1304 default:
1305 /*
1306 * Logically we would return an error here to prevent
1307 * users from believing they might have changed
1308 * mount options using remount which can't be changed.
1309 *
1310 * But unfortunately mount(8) adds all options from
1311 * mtab and fstab to the mount arguments in some cases
1312 * so we can't blindly reject options, but have to
1313 * check for each specified option if it actually
1314 * differs from the currently set option and only
1315 * reject it if that's the case.
1316 *
1317 * Until that is implemented we return success for
1318 * every remount request, and silently ignore all
1319 * options that we can't actually change.
1320 */
1321#if 0
1305 printk(KERN_INFO 1322 printk(KERN_INFO
1306 "XFS: mount option \"%s\" not supported for remount\n", p); 1323 "XFS: mount option \"%s\" not supported for remount\n", p);
1307 return -EINVAL; 1324 return -EINVAL;
1325#else
1326 break;
1327#endif
1308 } 1328 }
1309 } 1329 }
1310 1330
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 608c30c3f76b..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,6 +732,7 @@ xfs_buf_item_init(
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_buf = bp; 734 bip->bli_buf = bp;
735 xfs_buf_hold(bp);
735 bip->bli_format.blf_type = XFS_LI_BUF; 736 bip->bli_format.blf_type = XFS_LI_BUF;
736 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 737 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
737 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 738 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
867 return (bip->bli_flags & XFS_BLI_DIRTY); 868 return (bip->bli_flags & XFS_BLI_DIRTY);
868} 869}
869 870
871STATIC void
872xfs_buf_item_free(
873 xfs_buf_log_item_t *bip)
874{
875#ifdef XFS_TRANS_DEBUG
876 kmem_free(bip->bli_orig);
877 kmem_free(bip->bli_logged);
878#endif /* XFS_TRANS_DEBUG */
879
880#ifdef XFS_BLI_TRACE
881 ktrace_free(bip->bli_trace);
882#endif
883 kmem_zone_free(xfs_buf_item_zone, bip);
884}
885
870/* 886/*
871 * This is called when the buf log item is no longer needed. It should 887 * This is called when the buf log item is no longer needed. It should
872 * free the buf log item associated with the given buffer and clear 888 * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
887 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
888 XFS_BUF_CLR_IODONE_FUNC(bp); 904 XFS_BUF_CLR_IODONE_FUNC(bp);
889 } 905 }
890 906 xfs_buf_rele(bp);
891#ifdef XFS_TRANS_DEBUG 907 xfs_buf_item_free(bip);
892 kmem_free(bip->bli_orig);
893 bip->bli_orig = NULL;
894 kmem_free(bip->bli_logged);
895 bip->bli_logged = NULL;
896#endif /* XFS_TRANS_DEBUG */
897
898#ifdef XFS_BLI_TRACE
899 ktrace_free(bip->bli_trace);
900#endif
901 kmem_zone_free(xfs_buf_item_zone, bip);
902} 908}
903 909
904 910
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
1120 1126
1121 ASSERT(bip->bli_buf == bp); 1127 ASSERT(bip->bli_buf == bp);
1122 1128
1129 xfs_buf_rele(bp);
1123 mp = bip->bli_item.li_mountp; 1130 mp = bip->bli_item.li_mountp;
1124 1131
1125 /* 1132 /*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
1136 * xfs_trans_delete_ail() drops the AIL lock. 1143 * xfs_trans_delete_ail() drops the AIL lock.
1137 */ 1144 */
1138 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1139 1146 xfs_buf_item_free(bip);
1140#ifdef XFS_TRANS_DEBUG
1141 kmem_free(bip->bli_orig);
1142 bip->bli_orig = NULL;
1143 kmem_free(bip->bli_logged);
1144 bip->bli_logged = NULL;
1145#endif /* XFS_TRANS_DEBUG */
1146
1147#ifdef XFS_BLI_TRACE
1148 ktrace_free(bip->bli_trace);
1149#endif
1150 kmem_zone_free(xfs_buf_item_zone, bip);
1151} 1147}
1152 1148
1153#if defined(XFS_BLI_TRACE) 1149#if defined(XFS_BLI_TRACE)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 760f4c5b5160..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -149,7 +149,14 @@ xfs_swap_extents(
149 149
150 sbp = &sxp->sx_stat; 150 sbp = &sxp->sx_stat;
151 151
152 xfs_lock_two_inodes(ip, tip, lock_flags); 152 /*
153 * we have to do two separate lock calls here to keep lockdep
154 * happy. If we try to get all the locks in one call, lock will
155 * report false positives when we drop the ILOCK and regain them
156 * below.
157 */
158 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
159 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
153 locked = 1; 160 locked = 1;
154 161
155 /* Verify that both files have the same format */ 162 /* Verify that both files have the same format */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index cdc2d3464a1a..2813cdd72375 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DMAPI_H__ 18#ifndef __XFS_DMAPI_H__
19#define __XFS_DMAPI_H__ 19#define __XFS_DMAPI_H__
20 20
21#include <linux/version.h>
22/* Values used to define the on-disk version of dm_attrname_t. All 21/* Values used to define the on-disk version of dm_attrname_t. All
23 * on-disk attribute names start with the 8-byte string "SGI_DMI_". 22 * on-disk attribute names start with the 8-byte string "SGI_DMI_".
24 * 23 *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 00e80df9dd9d..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct(
4118 ASSERT(nextents <= XFS_LINEAR_EXTS); 4118 ASSERT(nextents <= XFS_LINEAR_EXTS);
4119 size = nextents * sizeof(xfs_bmbt_rec_t); 4119 size = nextents * sizeof(xfs_bmbt_rec_t);
4120 4120
4121 xfs_iext_irec_compact_full(ifp); 4121 xfs_iext_irec_compact_pages(ifp);
4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
4123 4123
4124 ep = ifp->if_u1.if_ext_irec->er_extbuf; 4124 ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4449,8 +4449,7 @@ xfs_iext_irec_remove(
4449 * compaction policy is as follows: 4449 * compaction policy is as follows:
4450 * 4450 *
4451 * Full Compaction: Extents fit into a single page (or inline buffer) 4451 * Full Compaction: Extents fit into a single page (or inline buffer)
4452 * Full Compaction: Extents occupy less than 10% of allocated space 4452 * Partial Compaction: Extents occupy less than 50% of allocated space
4453 * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
4454 * No Compaction: Extents occupy at least 50% of allocated space 4453 * No Compaction: Extents occupy at least 50% of allocated space
4455 */ 4454 */
4456void 4455void
@@ -4471,8 +4470,6 @@ xfs_iext_irec_compact(
4471 xfs_iext_direct_to_inline(ifp, nextents); 4470 xfs_iext_direct_to_inline(ifp, nextents);
4472 } else if (nextents <= XFS_LINEAR_EXTS) { 4471 } else if (nextents <= XFS_LINEAR_EXTS) {
4473 xfs_iext_indirect_to_direct(ifp); 4472 xfs_iext_indirect_to_direct(ifp);
4474 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
4475 xfs_iext_irec_compact_full(ifp);
4476 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4473 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4477 xfs_iext_irec_compact_pages(ifp); 4474 xfs_iext_irec_compact_pages(ifp);
4478 } 4475 }
@@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages(
4496 erp_next = erp + 1; 4493 erp_next = erp + 1;
4497 if (erp_next->er_extcount <= 4494 if (erp_next->er_extcount <=
4498 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4495 (XFS_LINEAR_EXTS - erp->er_extcount)) {
4499 memmove(&erp->er_extbuf[erp->er_extcount], 4496 memcpy(&erp->er_extbuf[erp->er_extcount],
4500 erp_next->er_extbuf, erp_next->er_extcount * 4497 erp_next->er_extbuf, erp_next->er_extcount *
4501 sizeof(xfs_bmbt_rec_t)); 4498 sizeof(xfs_bmbt_rec_t));
4502 erp->er_extcount += erp_next->er_extcount; 4499 erp->er_extcount += erp_next->er_extcount;
@@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages(
4516} 4513}
4517 4514
4518/* 4515/*
4519 * Fully compact the extent records managed by the indirection array.
4520 */
4521void
4522xfs_iext_irec_compact_full(
4523 xfs_ifork_t *ifp) /* inode fork pointer */
4524{
4525 xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */
4526 xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */
4527 int erp_idx = 0; /* extent irec index */
4528 int ext_avail; /* empty entries in ex list */
4529 int ext_diff; /* number of exts to add */
4530 int nlists; /* number of irec's (ex lists) */
4531
4532 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4533
4534 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4535 erp = ifp->if_u1.if_ext_irec;
4536 ep = &erp->er_extbuf[erp->er_extcount];
4537 erp_next = erp + 1;
4538 ep_next = erp_next->er_extbuf;
4539
4540 while (erp_idx < nlists - 1) {
4541 /*
4542 * Check how many extent records are available in this irec.
4543 * If there is none skip the whole exercise.
4544 */
4545 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
4546 if (ext_avail) {
4547
4548 /*
4549 * Copy over as many as possible extent records into
4550 * the previous page.
4551 */
4552 ext_diff = MIN(ext_avail, erp_next->er_extcount);
4553 memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
4554 erp->er_extcount += ext_diff;
4555 erp_next->er_extcount -= ext_diff;
4556
4557 /*
4558 * If the next irec is empty now we can simply
4559 * remove it.
4560 */
4561 if (erp_next->er_extcount == 0) {
4562 /*
4563 * Free page before removing extent record
4564 * so er_extoffs don't get modified in
4565 * xfs_iext_irec_remove.
4566 */
4567 kmem_free(erp_next->er_extbuf);
4568 erp_next->er_extbuf = NULL;
4569 xfs_iext_irec_remove(ifp, erp_idx + 1);
4570 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4571 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4572
4573 /*
4574 * If the next irec is not empty move up the content
4575 * that has not been copied to the previous page to
4576 * the beggining of this one.
4577 */
4578 } else {
4579 memmove(erp_next->er_extbuf, &ep_next[ext_diff],
4580 erp_next->er_extcount *
4581 sizeof(xfs_bmbt_rec_t));
4582 ep_next = erp_next->er_extbuf;
4583 memset(&ep_next[erp_next->er_extcount], 0,
4584 (XFS_LINEAR_EXTS -
4585 erp_next->er_extcount) *
4586 sizeof(xfs_bmbt_rec_t));
4587 }
4588 }
4589
4590 if (erp->er_extcount == XFS_LINEAR_EXTS) {
4591 erp_idx++;
4592 if (erp_idx < nlists)
4593 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4594 else
4595 break;
4596 }
4597 ep = &erp->er_extbuf[erp->er_extcount];
4598 erp_next = erp + 1;
4599 ep_next = erp_next->er_extbuf;
4600 }
4601}
4602
4603/*
4604 * This is called to update the er_extoff field in the indirection 4516 * This is called to update the er_extoff field in the indirection
4605 * array when extents have been added or removed from one of the 4517 * array when extents have been added or removed from one of the
4606 * extent lists. erp_idx contains the irec index to begin updating 4518 * extent lists. erp_idx contains the irec index to begin updating
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ccba14eb9dbe..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
124STATIC int xlog_iclogs_empty(xlog_t *log); 124STATIC int xlog_iclogs_empty(xlog_t *log);
125 125
126#if defined(XFS_LOG_TRACE) 126#if defined(XFS_LOG_TRACE)
127
128#define XLOG_TRACE_LOGGRANT_SIZE 2048
129#define XLOG_TRACE_ICLOG_SIZE 256
130
131void
132xlog_trace_loggrant_alloc(xlog_t *log)
133{
134 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
135}
136
137void
138xlog_trace_loggrant_dealloc(xlog_t *log)
139{
140 ktrace_free(log->l_grant_trace);
141}
142
127void 143void
128xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 144xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
129{ 145{
130 unsigned long cnts; 146 unsigned long cnts;
131 147
132 if (!log->l_grant_trace) {
133 log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
134 if (!log->l_grant_trace)
135 return;
136 }
137 /* ticket counts are 1 byte each */ 148 /* ticket counts are 1 byte each */
138 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 149 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
139 150
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
157} 168}
158 169
159void 170void
171xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
172{
173 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
174}
175
176void
177xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
178{
179 ktrace_free(iclog->ic_trace);
180}
181
182void
160xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 183xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
161{ 184{
162 if (!iclog->ic_trace)
163 iclog->ic_trace = ktrace_alloc(256, KM_NOFS);
164 ktrace_enter(iclog->ic_trace, 185 ktrace_enter(iclog->ic_trace,
165 (void *)((unsigned long)state), 186 (void *)((unsigned long)state),
166 (void *)((unsigned long)current_pid()), 187 (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
170 (void *)NULL, (void *)NULL); 191 (void *)NULL, (void *)NULL);
171} 192}
172#else 193#else
194
195#define xlog_trace_loggrant_alloc(log)
196#define xlog_trace_loggrant_dealloc(log)
173#define xlog_trace_loggrant(log,tic,string) 197#define xlog_trace_loggrant(log,tic,string)
198
199#define xlog_trace_iclog_alloc(iclog)
200#define xlog_trace_iclog_dealloc(iclog)
174#define xlog_trace_iclog(iclog,state) 201#define xlog_trace_iclog(iclog,state)
202
175#endif /* XFS_LOG_TRACE */ 203#endif /* XFS_LOG_TRACE */
176 204
177 205
@@ -1005,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
1005 l = iclog->ic_log; 1033 l = iclog->ic_log;
1006 1034
1007 /* 1035 /*
1008 * If the ordered flag has been removed by a lower 1036 * If the _XFS_BARRIER_FAILED flag was set by a lower
1009 * layer, it means the underlyin device no longer supports 1037 * layer, it means the underlying device no longer supports
1010 * barrier I/O. Warn loudly and turn off barriers. 1038 * barrier I/O. Warn loudly and turn off barriers.
1011 */ 1039 */
1012 if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { 1040 if (bp->b_flags & _XFS_BARRIER_FAILED) {
1041 bp->b_flags &= ~_XFS_BARRIER_FAILED;
1013 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1042 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
1014 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1043 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1015 "xlog_iodone: Barriers are no longer supported" 1044 "xlog_iodone: Barriers are no longer supported"
@@ -1231,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1231 spin_lock_init(&log->l_grant_lock); 1260 spin_lock_init(&log->l_grant_lock);
1232 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1261 sv_init(&log->l_flush_wait, 0, "flush_wait");
1233 1262
1263 xlog_trace_loggrant_alloc(log);
1234 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1264 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1235 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1265 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1236 1266
@@ -1285,6 +1315,8 @@ xlog_alloc_log(xfs_mount_t *mp,
1285 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); 1315 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1286 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); 1316 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1287 1317
1318 xlog_trace_iclog_alloc(iclog);
1319
1288 iclogp = &iclog->ic_next; 1320 iclogp = &iclog->ic_next;
1289 } 1321 }
1290 *iclogp = log->l_iclog; /* complete ring */ 1322 *iclogp = log->l_iclog; /* complete ring */
@@ -1565,11 +1597,7 @@ xlog_dealloc_log(xlog_t *log)
1565 sv_destroy(&iclog->ic_force_wait); 1597 sv_destroy(&iclog->ic_force_wait);
1566 sv_destroy(&iclog->ic_write_wait); 1598 sv_destroy(&iclog->ic_write_wait);
1567 xfs_buf_free(iclog->ic_bp); 1599 xfs_buf_free(iclog->ic_bp);
1568#ifdef XFS_LOG_TRACE 1600 xlog_trace_iclog_dealloc(iclog);
1569 if (iclog->ic_trace != NULL) {
1570 ktrace_free(iclog->ic_trace);
1571 }
1572#endif
1573 next_iclog = iclog->ic_next; 1601 next_iclog = iclog->ic_next;
1574 kmem_free(iclog); 1602 kmem_free(iclog);
1575 iclog = next_iclog; 1603 iclog = next_iclog;
@@ -1578,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log)
1578 spinlock_destroy(&log->l_grant_lock); 1606 spinlock_destroy(&log->l_grant_lock);
1579 1607
1580 xfs_buf_free(log->l_xbuf); 1608 xfs_buf_free(log->l_xbuf);
1581#ifdef XFS_LOG_TRACE 1609 xlog_trace_loggrant_dealloc(log);
1582 if (log->l_trace != NULL) {
1583 ktrace_free(log->l_trace);
1584 }
1585 if (log->l_grant_trace != NULL) {
1586 ktrace_free(log->l_grant_trace);
1587 }
1588#endif
1589 log->l_mp->m_log = NULL; 1610 log->l_mp->m_log = NULL;
1590 kmem_free(log); 1611 kmem_free(log);
1591} /* xlog_dealloc_log */ 1612} /* xlog_dealloc_log */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c8a5b22ee3e3..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -448,7 +448,6 @@ typedef struct log {
448 int l_grant_write_bytes; 448 int l_grant_write_bytes;
449 449
450#ifdef XFS_LOG_TRACE 450#ifdef XFS_LOG_TRACE
451 struct ktrace *l_trace;
452 struct ktrace *l_grant_trace; 451 struct ktrace *l_grant_trace;
453#endif 452#endif
454 453
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index aa238c8fbd7a..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1838,6 +1838,12 @@ again:
1838#endif 1838#endif
1839} 1839}
1840 1840
1841/*
1842 * xfs_lock_two_inodes() can only be used to lock one type of lock
1843 * at a time - the iolock or the ilock, but not both at once. If
1844 * we lock both at once, lockdep will report false positives saying
1845 * we have violated locking orders.
1846 */
1841void 1847void
1842xfs_lock_two_inodes( 1848xfs_lock_two_inodes(
1843 xfs_inode_t *ip0, 1849 xfs_inode_t *ip0,
@@ -1848,6 +1854,8 @@ xfs_lock_two_inodes(
1848 int attempts = 0; 1854 int attempts = 0;
1849 xfs_log_item_t *lp; 1855 xfs_log_item_t *lp;
1850 1856
1857 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1858 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1851 ASSERT(ip0->i_ino != ip1->i_ino); 1859 ASSERT(ip0->i_ino != ip1->i_ino);
1852 1860
1853 if (ip0->i_ino > ip1->i_ino) { 1861 if (ip0->i_ino > ip1->i_ino) {
@@ -3152,6 +3160,13 @@ error1: /* Just cancel transaction */
3152/* 3160/*
3153 * Zero file bytes between startoff and endoff inclusive. 3161 * Zero file bytes between startoff and endoff inclusive.
3154 * The iolock is held exclusive and no blocks are buffered. 3162 * The iolock is held exclusive and no blocks are buffered.
3163 *
3164 * This function is used by xfs_free_file_space() to zero
3165 * partial blocks when the range to free is not block aligned.
3166 * When unreserving space with boundaries that are not block
3167 * aligned we round up the start and round down the end
3168 * boundaries and then use this function to zero the parts of
3169 * the blocks that got dropped during the rounding.
3155 */ 3170 */
3156STATIC int 3171STATIC int
3157xfs_zero_remaining_bytes( 3172xfs_zero_remaining_bytes(
@@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes(
3168 int nimap; 3183 int nimap;
3169 int error = 0; 3184 int error = 0;
3170 3185
3186 /*
3187 * Avoid doing I/O beyond eof - it's not necessary
3188 * since nothing can read beyond eof. The space will
3189 * be zeroed when the file is extended anyway.
3190 */
3191 if (startoff >= ip->i_size)
3192 return 0;
3193
3194 if (endoff > ip->i_size)
3195 endoff = ip->i_size;
3196
3171 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3172 XFS_IS_REALTIME_INODE(ip) ? 3198 XFS_IS_REALTIME_INODE(ip) ?
3173 mp->m_rtdev_targp : mp->m_ddev_targp); 3199 mp->m_rtdev_targp : mp->m_ddev_targp);