author    David Woodhouse <dwmw2@infradead.org>  2006-10-21 11:46:04 -0400
committer David Woodhouse <dwmw2@infradead.org>  2006-10-21 11:46:04 -0400
commit    513b046c96cc2fbce730a3474f6f7ff0c4fdd05c (patch)
tree      e8006368b6f643067486f92405a404757807d6da /fs
parent    82810b7b6cc7a74c68881a13b0eb66c7a6370fcc (diff)
parent    c7a3bd177f248d01ee18a01d22048c80e071c331 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 131
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/afs/dir.c | 8
-rw-r--r--  fs/autofs/autofs_i.h | 1
-rw-r--r--  fs/autofs/dirhash.c | 1
-rw-r--r--  fs/autofs/init.c | 2
-rw-r--r--  fs/autofs/inode.c | 4
-rw-r--r--  fs/autofs4/autofs_i.h | 3
-rw-r--r--  fs/autofs4/init.c | 2
-rw-r--r--  fs/autofs4/inode.c | 22
-rw-r--r--  fs/autofs4/waitq.c | 1
-rw-r--r--  fs/befs/befs.h | 6
-rw-r--r--  fs/befs/befs_fs_types.h | 112
-rw-r--r--  fs/befs/btree.c | 29
-rw-r--r--  fs/befs/datastream.c | 11
-rw-r--r--  fs/befs/debug.c | 12
-rw-r--r--  fs/befs/endian.h | 57
-rw-r--r--  fs/befs/inode.c | 1
-rw-r--r--  fs/befs/linuxvfs.c | 1
-rw-r--r--  fs/befs/super.c | 1
-rw-r--r--  fs/binfmt_elf.c | 10
-rw-r--r--  fs/binfmt_som.c | 18
-rw-r--r--  fs/bio.c | 9
-rw-r--r--  fs/buffer.c | 39
-rw-r--r--  fs/cifs/cifsacl.h | 4
-rw-r--r--  fs/cifs/cifsencrypt.h | 2
-rw-r--r--  fs/cifs/cifsfs.c | 27
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 15
-rw-r--r--  fs/cifs/cifspdu.h | 12
-rw-r--r--  fs/cifs/cifsproto.h | 12
-rw-r--r--  fs/cifs/cifssmb.c | 102
-rw-r--r--  fs/cifs/connect.c | 35
-rw-r--r--  fs/cifs/inode.c | 12
-rw-r--r--  fs/cifs/link.c | 6
-rw-r--r--  fs/cifs/md5.c | 8
-rw-r--r--  fs/cifs/md5.h | 8
-rw-r--r--  fs/cifs/misc.c | 44
-rw-r--r--  fs/cifs/netmisc.c | 58
-rw-r--r--  fs/cifs/readdir.c | 27
-rw-r--r--  fs/cifs/sess.c | 23
-rw-r--r--  fs/cifs/smbdes.c | 6
-rw-r--r--  fs/cifs/smbencrypt.c | 11
-rw-r--r--  fs/compat.c | 2
-rw-r--r--  fs/compat_ioctl.c | 10
-rw-r--r--  fs/configfs/file.c | 14
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/dcache.c | 139
-rw-r--r--  fs/dlm/Kconfig | 20
-rw-r--r--  fs/dlm/Makefile | 19
-rw-r--r--  fs/dlm/ast.c | 173
-rw-r--r--  fs/dlm/ast.h | 26
-rw-r--r--  fs/dlm/config.c | 789
-rw-r--r--  fs/dlm/config.h | 42
-rw-r--r--  fs/dlm/debug_fs.c | 387
-rw-r--r--  fs/dlm/dir.c | 423
-rw-r--r--  fs/dlm/dir.h | 30
-rw-r--r--  fs/dlm/dlm_internal.h | 543
-rw-r--r--  fs/dlm/lock.c | 3871
-rw-r--r--  fs/dlm/lock.h | 62
-rw-r--r--  fs/dlm/lockspace.c | 717
-rw-r--r--  fs/dlm/lockspace.h | 25
-rw-r--r--  fs/dlm/lowcomms.c | 1239
-rw-r--r--  fs/dlm/lowcomms.h | 26
-rw-r--r--  fs/dlm/lvb_table.h | 18
-rw-r--r--  fs/dlm/main.c | 97
-rw-r--r--  fs/dlm/member.c | 327
-rw-r--r--  fs/dlm/member.h | 24
-rw-r--r--  fs/dlm/memory.c | 116
-rw-r--r--  fs/dlm/memory.h | 29
-rw-r--r--  fs/dlm/midcomms.c | 140
-rw-r--r--  fs/dlm/midcomms.h | 21
-rw-r--r--  fs/dlm/rcom.c | 472
-rw-r--r--  fs/dlm/rcom.h | 24
-rw-r--r--  fs/dlm/recover.c | 765
-rw-r--r--  fs/dlm/recover.h | 34
-rw-r--r--  fs/dlm/recoverd.c | 290
-rw-r--r--  fs/dlm/recoverd.h | 24
-rw-r--r--  fs/dlm/requestqueue.c | 184
-rw-r--r--  fs/dlm/requestqueue.h | 22
-rw-r--r--  fs/dlm/user.c | 788
-rw-r--r--  fs/dlm/user.h | 16
-rw-r--r--  fs/dlm/util.c | 161
-rw-r--r--  fs/dlm/util.h | 22
-rw-r--r--  fs/ecryptfs/Makefile | 7
-rw-r--r--  fs/ecryptfs/crypto.c | 1659
-rw-r--r--  fs/ecryptfs/debug.c | 123
-rw-r--r--  fs/ecryptfs/dentry.c | 87
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 482
-rw-r--r--  fs/ecryptfs/file.c | 440
-rw-r--r--  fs/ecryptfs/inode.c | 1079
-rw-r--r--  fs/ecryptfs/keystore.c | 1061
-rw-r--r--  fs/ecryptfs/main.c | 828
-rw-r--r--  fs/ecryptfs/mmap.c | 788
-rw-r--r--  fs/ecryptfs/super.c | 198
-rw-r--r--  fs/eventpoll.c | 56
-rw-r--r--  fs/ext2/super.c | 16
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 12
-rw-r--r--  fs/ext4/acl.c | 551
-rw-r--r--  fs/ext4/acl.h | 81
-rw-r--r--  fs/ext4/balloc.c | 1833
-rw-r--r--  fs/ext4/bitmap.c | 32
-rw-r--r--  fs/ext4/dir.c | 518
-rw-r--r--  fs/ext4/extents.c | 2152
-rw-r--r--  fs/ext4/file.c | 139
-rw-r--r--  fs/ext4/fsync.c | 88
-rw-r--r--  fs/ext4/hash.c | 152
-rw-r--r--  fs/ext4/ialloc.c | 772
-rw-r--r--  fs/ext4/inode.c | 3233
-rw-r--r--  fs/ext4/ioctl.c | 306
-rw-r--r--  fs/ext4/namei.c | 2395
-rw-r--r--  fs/ext4/namei.h | 8
-rw-r--r--  fs/ext4/resize.c | 1045
-rw-r--r--  fs/ext4/super.c | 2829
-rw-r--r--  fs/ext4/symlink.c | 54
-rw-r--r--  fs/ext4/xattr.c | 1317
-rw-r--r--  fs/ext4/xattr.h | 145
-rw-r--r--  fs/ext4/xattr_security.c | 77
-rw-r--r--  fs/ext4/xattr_trusted.c | 62
-rw-r--r--  fs/ext4/xattr_user.c | 64
-rw-r--r--  fs/fat/file.c | 3
-rw-r--r--  fs/fat/inode.c | 4
-rw-r--r--  fs/fuse/dir.c | 107
-rw-r--r--  fs/fuse/file.c | 12
-rw-r--r--  fs/fuse/fuse_i.h | 3
-rw-r--r--  fs/fuse/inode.c | 15
-rw-r--r--  fs/gfs2/Kconfig | 44
-rw-r--r--  fs/gfs2/Makefile | 10
-rw-r--r--  fs/gfs2/acl.c | 309
-rw-r--r--  fs/gfs2/acl.h | 39
-rw-r--r--  fs/gfs2/bmap.c | 1222
-rw-r--r--  fs/gfs2/bmap.h | 31
-rw-r--r--  fs/gfs2/daemon.c | 196
-rw-r--r--  fs/gfs2/daemon.h | 19
-rw-r--r--  fs/gfs2/dir.c | 1957
-rw-r--r--  fs/gfs2/dir.h | 79
-rw-r--r--  fs/gfs2/eaops.c | 230
-rw-r--r--  fs/gfs2/eaops.h | 30
-rw-r--r--  fs/gfs2/eattr.c | 1501
-rw-r--r--  fs/gfs2/eattr.h | 100
-rw-r--r--  fs/gfs2/gfs2.h | 31
-rw-r--r--  fs/gfs2/glock.c | 2231
-rw-r--r--  fs/gfs2/glock.h | 153
-rw-r--r--  fs/gfs2/glops.c | 615
-rw-r--r--  fs/gfs2/glops.h | 25
-rw-r--r--  fs/gfs2/incore.h | 634
-rw-r--r--  fs/gfs2/inode.c | 1379
-rw-r--r--  fs/gfs2/inode.h | 56
-rw-r--r--  fs/gfs2/lm.c | 217
-rw-r--r--  fs/gfs2/lm.h | 42
-rw-r--r--  fs/gfs2/locking.c | 184
-rw-r--r--  fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 524
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 187
-rw-r--r--  fs/gfs2/locking/dlm/main.c | 64
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 255
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 301
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 226
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 359
-rw-r--r--  fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 246
-rw-r--r--  fs/gfs2/log.c | 688
-rw-r--r--  fs/gfs2/log.h | 65
-rw-r--r--  fs/gfs2/lops.c | 809
-rw-r--r--  fs/gfs2/lops.h | 99
-rw-r--r--  fs/gfs2/main.c | 150
-rw-r--r--  fs/gfs2/meta_io.c | 590
-rw-r--r--  fs/gfs2/meta_io.h | 78
-rw-r--r--  fs/gfs2/mount.c | 214
-rw-r--r--  fs/gfs2/mount.h | 17
-rw-r--r--  fs/gfs2/ondisk.c | 308
-rw-r--r--  fs/gfs2/ops_address.c | 793
-rw-r--r--  fs/gfs2/ops_address.h | 22
-rw-r--r--  fs/gfs2/ops_dentry.c | 119
-rw-r--r--  fs/gfs2/ops_dentry.h | 17
-rw-r--r--  fs/gfs2/ops_export.c | 298
-rw-r--r--  fs/gfs2/ops_export.h | 22
-rw-r--r--  fs/gfs2/ops_file.c | 661
-rw-r--r--  fs/gfs2/ops_file.h | 24
-rw-r--r--  fs/gfs2/ops_fstype.c | 925
-rw-r--r--  fs/gfs2/ops_fstype.h | 18
-rw-r--r--  fs/gfs2/ops_inode.c | 1151
-rw-r--r--  fs/gfs2/ops_inode.h | 20
-rw-r--r--  fs/gfs2/ops_super.c | 468
-rw-r--r--  fs/gfs2/ops_super.h | 17
-rw-r--r--  fs/gfs2/ops_vm.c | 184
-rw-r--r--  fs/gfs2/ops_vm.h | 18
-rw-r--r--  fs/gfs2/quota.c | 1228
-rw-r--r--  fs/gfs2/quota.h | 35
-rw-r--r--  fs/gfs2/recovery.c | 571
-rw-r--r--  fs/gfs2/recovery.h | 34
-rw-r--r--  fs/gfs2/rgrp.c | 1513
-rw-r--r--  fs/gfs2/rgrp.h | 69
-rw-r--r--  fs/gfs2/super.c | 976
-rw-r--r--  fs/gfs2/super.h | 55
-rw-r--r--  fs/gfs2/sys.c | 583
-rw-r--r--  fs/gfs2/sys.h | 27
-rw-r--r--  fs/gfs2/trans.c | 184
-rw-r--r--  fs/gfs2/trans.h | 39
-rw-r--r--  fs/gfs2/util.c | 245
-rw-r--r--  fs/gfs2/util.h | 170
-rw-r--r--  fs/hpfs/inode.c | 11
-rw-r--r--  fs/hppfs/hppfs_kern.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/ioprio.c | 5
-rw-r--r--  fs/isofs/joliet.c | 10
-rw-r--r--  fs/isofs/namei.c | 1
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd/transaction.c | 5
-rw-r--r--  fs/jbd2/Makefile | 7
-rw-r--r--  fs/jbd2/checkpoint.c | 697
-rw-r--r--  fs/jbd2/commit.c | 920
-rw-r--r--  fs/jbd2/journal.c | 2084
-rw-r--r--  fs/jbd2/recovery.c | 609
-rw-r--r--  fs/jbd2/revoke.c | 712
-rw-r--r--  fs/jbd2/transaction.c | 2081
-rw-r--r--  fs/jffs2/super.c | 8
-rw-r--r--  fs/lockd/clntlock.c | 58
-rw-r--r--  fs/lockd/clntproc.c | 17
-rw-r--r--  fs/lockd/host.c | 325
-rw-r--r--  fs/lockd/mon.c | 77
-rw-r--r--  fs/lockd/svc.c | 19
-rw-r--r--  fs/lockd/svc4proc.c | 85
-rw-r--r--  fs/lockd/svclock.c | 207
-rw-r--r--  fs/lockd/svcproc.c | 89
-rw-r--r--  fs/lockd/svcshare.c | 24
-rw-r--r--  fs/lockd/svcsubs.c | 183
-rw-r--r--  fs/lockd/xdr.c | 76
-rw-r--r--  fs/lockd/xdr4.c | 80
-rw-r--r--  fs/minix/inode.c | 8
-rw-r--r--  fs/ncpfs/ioctl.c | 2
-rw-r--r--  fs/nfs/callback.h | 10
-rw-r--r--  fs/nfs/callback_proc.c | 6
-rw-r--r--  fs/nfs/callback_xdr.c | 106
-rw-r--r--  fs/nfs/client.c | 52
-rw-r--r--  fs/nfs/dir.c | 16
-rw-r--r--  fs/nfs/direct.c | 25
-rw-r--r--  fs/nfs/getroot.c | 1
-rw-r--r--  fs/nfs/inode.c | 30
-rw-r--r--  fs/nfs/internal.h | 6
-rw-r--r--  fs/nfs/mount_clnt.c | 6
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs2xdr.c | 78
-rw-r--r--  fs/nfs/nfs3proc.c | 2
-rw-r--r--  fs/nfs/nfs3xdr.c | 118
-rw-r--r--  fs/nfs/nfs4_fs.h | 2
-rw-r--r--  fs/nfs/nfs4namespace.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 16
-rw-r--r--  fs/nfs/nfs4xdr.c | 360
-rw-r--r--  fs/nfs/nfsroot.c | 1
-rw-r--r--  fs/nfs/super.c | 3
-rw-r--r--  fs/nfs/write.c | 8
-rw-r--r--  fs/nfs_common/nfsacl.c | 4
-rw-r--r--  fs/nfsd/export.c | 153
-rw-r--r--  fs/nfsd/lockd.c | 16
-rw-r--r--  fs/nfsd/nfs2acl.c | 37
-rw-r--r--  fs/nfsd/nfs3acl.c | 23
-rw-r--r--  fs/nfsd/nfs3proc.c | 108
-rw-r--r--  fs/nfsd/nfs3xdr.c | 182
-rw-r--r--  fs/nfsd/nfs4acl.c | 711
-rw-r--r--  fs/nfsd/nfs4callback.c | 26
-rw-r--r--  fs/nfsd/nfs4proc.c | 140
-rw-r--r--  fs/nfsd/nfs4recover.c | 14
-rw-r--r--  fs/nfsd/nfs4state.c | 119
-rw-r--r--  fs/nfsd/nfs4xdr.c | 454
-rw-r--r--  fs/nfsd/nfscache.c | 8
-rw-r--r--  fs/nfsd/nfsctl.c | 49
-rw-r--r--  fs/nfsd/nfsfh.c | 10
-rw-r--r--  fs/nfsd/nfsproc.c | 91
-rw-r--r--  fs/nfsd/nfssvc.c | 29
-rw-r--r--  fs/nfsd/nfsxdr.c | 115
-rw-r--r--  fs/nfsd/vfs.c | 381
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 10
-rw-r--r--  fs/ocfs2/file.c | 51
-rw-r--r--  fs/ocfs2/namei.c | 8
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/partitions/check.c | 50
-rw-r--r--  fs/partitions/msdos.c | 6
-rw-r--r--  fs/proc/base.c | 6
-rw-r--r--  fs/proc/proc_misc.c | 2
-rw-r--r--  fs/reiserfs/bitmap.c | 4
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/journal.c | 3
-rw-r--r--  fs/reiserfs/super.c | 31
-rw-r--r--  fs/splice.c | 6
-rw-r--r--  fs/super.c | 12
-rw-r--r--  fs/sysfs/file.c | 7
-rw-r--r--  fs/sysv/super.c | 15
-rw-r--r--  fs/udf/super.c | 3
-rw-r--r--  fs/ufs/util.c | 14
-rw-r--r--  fs/xattr.c | 33
-rw-r--r--  fs/xfs/linux-2.6/kmem.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 3
296 files changed, 74480 insertions, 2597 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 68f4561423ff..fee318e6f4bb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,6 +140,73 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
+config EXT4DEV_FS
+	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select JBD2
+	help
+	  Ext4dev is a predecessor filesystem of the next generation
+	  extended fs ext4, based on ext3 filesystem code. It will be
+	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4dev is not the same as ext3 any more:
+	  it is based on extent maps and it supports 48-bit physical block
+	  numbers. These combined on-disk format changes will allow
+	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
+	  a hard limit that ext3 cannot overcome without changing the
+	  on-disk format.
+
+	  Other than extent maps and 48-bit block numbers, ext4dev also is
+	  likely to have other new features such as persistent preallocation,
+	  high resolution time stamps, and larger file support etc. These
+	  features will be added to ext4dev gradually.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4dev. Be aware, however, that the filesystem
+	  of your root partition (the one containing the directory /) cannot
+	  be compiled as a module, and so this could be dangerous.
+
+	  If unsure, say N.
+
+config EXT4DEV_FS_XATTR
+	bool "Ext4dev extended attributes"
+	depends on EXT4DEV_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext4dev/ext4.
+
+config EXT4DEV_FS_POSIX_ACL
+	bool "Ext4dev POSIX Access Control Lists"
+	depends on EXT4DEV_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4DEV_FS_SECURITY
+	bool "Ext4dev Security Labels"
+	depends on EXT4DEV_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4dev/ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JBD
 	tristate
 	help
@@ -172,12 +239,44 @@ config JBD_DEBUG
 	  generated. To turn debugging off again, do
 	  "echo 0 > /proc/sys/fs/jbd-debug".
 
+config JBD2
+	tristate
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers. It is currently used by
+	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4dev/ext4, you need to say Y here. If you are not
+	  using ext4dev/ext4 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4dev/ext4) debugging support"
+	depends on JBD2
+	help
+	  If you are using the ext4dev/ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
+	  1 and 5. The higher the number, the more debugging output is
+	  generated. To turn debugging off again, do
+	  "echo 0 > /proc/sys/fs/jbd2-debug".
+
 config FS_MBCACHE
-# Meta block cache for Extended Attributes (ext2/ext3)
+# Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
@@ -325,6 +424,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support"
@@ -534,6 +634,10 @@ config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+config GENERIC_ACL
+	bool
+	select FS_POSIX_ACL
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -995,6 +1099,18 @@ config AFFS_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called affs. If unsure, say N.
 
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && KEYS && CRYPTO
+	help
+	  Encrypted filesystem that operates on the VFS layer. See
+	  <file:Documentation/ecryptfs.txt> to learn more about
+	  eCryptfs. Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
+
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
@@ -1874,7 +1990,7 @@ config CIFS_EXPERIMENTAL
 config CIFS_UPCALL
 	bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
 	depends on CIFS_EXPERIMENTAL
-	select CONNECTOR
+	depends on CONNECTOR
 	help
 	  Enables an upcall mechanism for CIFS which will be used to contact
 	  userspace helper utilities to provide SPNEGO packaged Kerberos
@@ -1968,10 +2084,6 @@ config 9P_FS
 
 	  If unsure, say N.
 
-config GENERIC_ACL
-	bool
-	select FS_POSIX_ACL
-
 endmenu
 
 if BLOCK
@@ -1983,6 +2095,7 @@ endmenu
 endif
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
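
The JBD2_DEBUG help text above documents a /proc tunable. As a minimal userspace sketch of driving it (an assumption-laden illustration: it presumes CONFIG_JBD2_DEBUG is enabled so that /proc/sys/fs/jbd2-debug exists exactly as the help text describes):

#include <stdio.h>

/* Sketch only: writes the verbosity level described in the JBD2_DEBUG
 * help text above; 0 turns debugging off, 5 is the most verbose. */
static int set_jbd2_debug(int level)
{
	FILE *f = fopen("/proc/sys/fs/jbd2-debug", "w");

	if (!f) {
		perror("jbd2-debug");
		return -1;
	}
	fprintf(f, "%d\n", level);
	return fclose(f);
}

int main(void)
{
	set_jbd2_debug(5);	/* maximum debugging output */
	set_jbd2_debug(0);	/* debugging off again */
	return 0;
}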
diff --git a/fs/Makefile b/fs/Makefile
index 819b2a93bebe..9a5ce9323bfd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,11 +57,14 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
+obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
+obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_RAMFS)		+= ramfs/
@@ -75,6 +78,7 @@ obj-$(CONFIG_BFS_FS) += bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)		+= hfs/
+obj-$(CONFIG_ECRYPT_FS)		+= ecryptfs/
 obj-$(CONFIG_VXFS_FS)		+= freevxfs/
 obj-$(CONFIG_NFS_FS)		+= nfs/
 obj-$(CONFIG_EXPORTFS)		+= exportfs/
@@ -109,3 +113,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cf8a2cb28505..a6ec75c56fcf 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -211,8 +211,8 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 {
 	_enter("{%lu}", inode->i_ino);
 
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
 		return -ENOENT;
@@ -446,8 +446,8 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
 	_enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
 
 	/* insanity checks first */
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (dentry->d_name.len > 255) {
 		_leave(" = -ENAMETOOLONG");
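
The AFS change above swaps runtime BUG_ON() size checks for compile-time BUILD_BUG_ON() ones. A self-contained sketch of the idea, using the negative-array-size trick the kernel macro of this era was built on (reproduced from memory, so treat the exact definition as an assumption):

/* If the condition is true, the array size goes negative and the build
 * fails, so a wrong on-disk structure size is caught at compile time
 * instead of panicking at runtime the way BUG_ON() would. */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

union demo_dirent {
	char raw[32];
};

int main(void)
{
	BUILD_BUG_ON(sizeof(union demo_dirent) != 32);	/* false: compiles */
	/* BUILD_BUG_ON(sizeof(union demo_dirent) != 16);  would break the build */
	return 0;
}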
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index c7700d9b3f96..906ba5ce2261 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -149,6 +149,7 @@ extern const struct file_operations autofs_root_operations;
 /* Initializing function */
 
 int autofs_fill_super(struct super_block *, void *, int);
+void autofs_kill_sb(struct super_block *sb);
 
 /* Queue management functions */
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 3fded389d06b..bf8c8af98004 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -246,5 +246,4 @@ void autofs_hash_nuke(struct autofs_sb_info *sbi)
 			kfree(ent);
 		}
 	}
-	shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index aca123752406..cea5219b4f37 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs_kill_sb,
 };
 
 static int __init init_autofs_fs(void)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2c9759baad61..54c518c89e4c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,7 +20,7 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void autofs_put_super(struct super_block *sb)
+void autofs_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(sb);
 	unsigned int n;
@@ -37,13 +37,13 @@ static void autofs_put_super(struct super_block *sb)
 	kfree(sb->s_fs_info);
 
 	DPRINTK(("autofs: shutting down\n"));
+	kill_anon_super(sb);
 }
 
 static void autofs_read_inode(struct inode *inode);
 
 static struct super_operations autofs_sops = {
 	.read_inode	= autofs_read_inode,
-	.put_super	= autofs_put_super,
 	.statfs		= simple_statfs,
 };
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 480ab178cba5..b13f32c8aeee 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -94,7 +94,6 @@ struct autofs_wait_queue {
 
 struct autofs_sb_info {
 	u32 magic;
-	struct dentry *root;
 	int pipefd;
 	struct file *pipe;
 	pid_t oz_pgrp;
@@ -229,4 +228,4 @@ out:
 }
 
 void autofs4_dentry_release(struct dentry *);
-
+extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 5d9193332bef..723a1c5e361b 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs4_kill_sb,
 };
 
 static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 800ce876caec..51fd8595bf85 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,7 +96,7 @@ void autofs4_free_ino(struct autofs_info *ino)
  */
 static void autofs4_force_release(struct autofs_sb_info *sbi)
 {
-	struct dentry *this_parent = sbi->root;
+	struct dentry *this_parent = sbi->sb->s_root;
 	struct list_head *next;
 
 	spin_lock(&dcache_lock);
@@ -127,7 +127,7 @@ resume:
 		spin_lock(&dcache_lock);
 	}
 
-	if (this_parent != sbi->root) {
+	if (this_parent != sbi->sb->s_root) {
 		struct dentry *dentry = this_parent;
 
 		next = this_parent->d_u.d_child.next;
@@ -140,15 +140,9 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&dcache_lock);
-
-	dput(sbi->root);
-	sbi->root = NULL;
-	shrink_dcache_sb(sbi->sb);
-
-	return;
 }
 
-static void autofs4_put_super(struct super_block *sb)
+void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
 
@@ -163,6 +157,7 @@ static void autofs4_put_super(struct super_block *sb)
 	kfree(sbi);
 
 	DPRINTK("shutting down");
+	kill_anon_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
@@ -189,7 +184,6 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static struct super_operations autofs4_sops = {
-	.put_super	= autofs4_put_super,
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
 };
@@ -315,7 +309,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->root = NULL;
 	sbi->pipefd = -1;
 	sbi->catatonic = 0;
 	sbi->exp_timeout = 0;
@@ -397,13 +390,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipefd = pipefd;
 
 	/*
-	 * Take a reference to the root dentry so we get a chance to
-	 * clean up the dentry tree on umount.
-	 * See autofs4_force_release.
-	 */
-	sbi->root = dget(root);
-
-	/*
 	 * Success! Install the root dentry now to indicate completion.
 	 */
 	s->s_root = root;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index ce103e7b0bc3..c0a6c8d445c7 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 		fput(sbi->pipe);	/* Close the pipe */
 		sbi->pipe = NULL;
 	}
-	shrink_dcache_sb(sbi->sb);
 }
 
 static int autofs4_write(struct file *file, const void *addr, int bytes)
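
The autofs and autofs4 changes above share one shape: per-filesystem teardown moves out of ->put_super and into a ->kill_sb method that frees private state and then chains to kill_anon_super(). A userspace sketch of that shape, with hypothetical stand-in types (super_block and kill_anon_super here are local stubs, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct super_block {
	void *s_fs_info;	/* fs-private info, as in the kernel struct */
};

/* Stand-in for the generic VFS teardown the real code chains to. */
static void kill_anon_super(struct super_block *sb)
{
	printf("generic superblock shutdown\n");
}

/* Shape of autofs_kill_sb()/autofs4_kill_sb() after this series:
 * fs-private cleanup runs first, then the generic helper. */
static void example_kill_sb(struct super_block *sb)
{
	free(sb->s_fs_info);
	kill_anon_super(sb);
}

int main(void)
{
	struct super_block sb = { .s_fs_info = malloc(16) };

	example_kill_sb(&sb);
	return 0;
}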
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 057a2c3d73b7..d9a40abda6b7 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -94,7 +94,7 @@ void befs_debug(const struct super_block *sb, const char *fmt, ...);
 
 void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
 void befs_dump_inode(const struct super_block *sb, befs_inode *);
-void befs_dump_index_entry(const struct super_block *sb, befs_btree_super *);
+void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *);
 void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
 /****************************/
 
@@ -136,7 +136,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
 static inline unsigned int
 befs_iaddrs_per_block(struct super_block *sb)
 {
-	return BEFS_SB(sb)->block_size / sizeof (befs_inode_addr);
+	return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
 }
 
 static inline int
@@ -151,4 +151,6 @@ befs_brun_size(struct super_block *sb, befs_block_run run)
 	return BEFS_SB(sb)->block_size * run.len;
 }
 
+#include "endian.h"
+
 #endif				/* _LINUX_BEFS_H */
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 63ef1e18fb84..e2595c2c403a 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -79,17 +79,27 @@ enum inode_flags {
  * On-Disk datastructures of BeFS
  */
 
+typedef u64 __bitwise fs64;
+typedef u32 __bitwise fs32;
+typedef u16 __bitwise fs16;
+
 typedef u64 befs_off_t;
-typedef u64 befs_time_t;
-typedef void befs_binode_etc;
+typedef fs64 befs_time_t;
 
 /* Block runs */
 typedef struct {
+	fs32 allocation_group;
+	fs16 start;
+	fs16 len;
+} PACKED befs_disk_block_run;
+
+typedef struct {
 	u32 allocation_group;
 	u16 start;
 	u16 len;
 } PACKED befs_block_run;
 
+typedef befs_disk_block_run befs_disk_inode_addr;
 typedef befs_block_run befs_inode_addr;
 
 /*
@@ -97,31 +107,31 @@ typedef befs_block_run befs_inode_addr;
  */
 typedef struct {
 	char name[B_OS_NAME_LENGTH];
-	u32 magic1;
-	u32 fs_byte_order;
+	fs32 magic1;
+	fs32 fs_byte_order;
 
-	u32 block_size;
-	u32 block_shift;
+	fs32 block_size;
+	fs32 block_shift;
 
-	befs_off_t num_blocks;
-	befs_off_t used_blocks;
+	fs64 num_blocks;
+	fs64 used_blocks;
 
-	u32 inode_size;
+	fs32 inode_size;
 
-	u32 magic2;
-	u32 blocks_per_ag;
-	u32 ag_shift;
-	u32 num_ags;
+	fs32 magic2;
+	fs32 blocks_per_ag;
+	fs32 ag_shift;
+	fs32 num_ags;
 
-	u32 flags;
+	fs32 flags;
 
-	befs_block_run log_blocks;
-	befs_off_t log_start;
-	befs_off_t log_end;
+	befs_disk_block_run log_blocks;
+	fs64 log_start;
+	fs64 log_end;
 
-	u32 magic3;
-	befs_inode_addr root_dir;
-	befs_inode_addr indices;
+	fs32 magic3;
+	befs_disk_inode_addr root_dir;
+	befs_disk_inode_addr indices;
 
 } PACKED befs_super_block;
 
@@ -130,6 +140,16 @@ typedef struct {
  * be longer than one block!
  */
 typedef struct {
+	befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+	fs64 max_direct_range;
+	befs_disk_block_run indirect;
+	fs64 max_indirect_range;
+	befs_disk_block_run double_indirect;
+	fs64 max_double_indirect_range;
+	fs64 size;
+} PACKED befs_disk_data_stream;
+
+typedef struct {
 	befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
 	befs_off_t max_direct_range;
 	befs_block_run indirect;
@@ -141,35 +161,35 @@ typedef struct {
 
 /* Attribute */
 typedef struct {
-	u32 type;
-	u16 name_size;
-	u16 data_size;
+	fs32 type;
+	fs16 name_size;
+	fs16 data_size;
 	char name[1];
 } PACKED befs_small_data;
 
 /* Inode structure */
 typedef struct {
-	u32 magic1;
-	befs_inode_addr inode_num;
-	u32 uid;
-	u32 gid;
-	u32 mode;
-	u32 flags;
+	fs32 magic1;
+	befs_disk_inode_addr inode_num;
+	fs32 uid;
+	fs32 gid;
+	fs32 mode;
+	fs32 flags;
 	befs_time_t create_time;
 	befs_time_t last_modified_time;
-	befs_inode_addr parent;
-	befs_inode_addr attributes;
-	u32 type;
+	befs_disk_inode_addr parent;
+	befs_disk_inode_addr attributes;
+	fs32 type;
 
-	u32 inode_size;
-	u32 etc;		/* not use */
+	fs32 inode_size;
+	fs32 etc;		/* not use */
 
 	union {
-		befs_data_stream datastream;
+		befs_disk_data_stream datastream;
 		char symlink[BEFS_SYMLINK_LEN];
 	} data;
 
-	u32 pad[4];		/* not use */
+	fs32 pad[4];		/* not use */
 	befs_small_data small_data[1];
 } PACKED befs_inode;
 
@@ -190,6 +210,16 @@ enum btree_types {
 };
 
 typedef struct {
+	fs32 magic;
+	fs32 node_size;
+	fs32 max_depth;
+	fs32 data_type;
+	fs64 root_node_ptr;
+	fs64 free_node_ptr;
+	fs64 max_size;
+} PACKED befs_disk_btree_super;
+
+typedef struct {
 	u32 magic;
 	u32 node_size;
 	u32 max_depth;
@@ -203,11 +233,19 @@ typedef struct {
  * Header stucture of each btree node
  */
 typedef struct {
+	fs64 left;
+	fs64 right;
+	fs64 overflow;
+	fs16 all_key_count;
+	fs16 all_key_length;
+} PACKED befs_btree_nodehead;
+
+typedef struct {
 	befs_off_t left;
 	befs_off_t right;
 	befs_off_t overflow;
 	u16 all_key_count;
 	u16 all_key_length;
-} PACKED befs_btree_nodehead;
+} PACKED befs_host_btree_nodehead;
 
 #endif				/* _LINUX_BEFS_FS_TYPES */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 76e219799409..81b042ee24e6 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -30,7 +30,6 @@
 #include "befs.h"
 #include "btree.h"
 #include "datastream.h"
-#include "endian.h"
 
 /*
  * The btree functions in this file are built on top of the
@@ -80,7 +79,7 @@
  * In memory structure of each btree node
  */
 typedef struct {
-	befs_btree_nodehead head;	/* head of node converted to cpu byteorder */
+	befs_host_btree_nodehead head;	/* head of node converted to cpu byteorder */
 	struct buffer_head *bh;
 	befs_btree_nodehead *od_node;	/* on disk node */
 } befs_btree_node;
@@ -102,9 +101,9 @@ static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 
 static int befs_leafnode(befs_btree_node * node);
 
-static u16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(befs_btree_node * node);
 
-static befs_off_t *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(befs_btree_node * node);
 
 static char *befs_bt_keydata(befs_btree_node * node);
 
@@ -136,7 +135,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		   befs_btree_super * sup)
 {
 	struct buffer_head *bh = NULL;
-	befs_btree_super *od_sup = NULL;
+	befs_disk_btree_super *od_sup = NULL;
 
 	befs_debug(sb, "---> befs_btree_read_super()");
 
@@ -146,7 +145,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		befs_error(sb, "Couldn't read index header.");
 		goto error;
 	}
-	od_sup = (befs_btree_super *) bh->b_data;
+	od_sup = (befs_disk_btree_super *) bh->b_data;
 	befs_dump_index_entry(sb, od_sup);
 
 	sup->magic = fs32_to_cpu(sb, od_sup->magic);
@@ -342,7 +341,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 	u16 keylen;
 	int findkey_len;
 	char *thiskey;
-	befs_off_t *valarray;
+	fs64 *valarray;
 
 	befs_debug(sb, "---> befs_find_key() %s", findkey);
 
@@ -422,7 +421,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	befs_btree_super bt_super;
 	befs_off_t node_off = 0;
 	int cur_key;
-	befs_off_t *valarray;
+	fs64 *valarray;
 	char *keystart;
 	u16 keylen;
 	int res;
@@ -572,7 +571,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 			   this_node->head.overflow);
 		*node_off = this_node->head.overflow;
 	} else {
-		befs_off_t *valarray = befs_bt_valarray(this_node);
+		fs64 *valarray = befs_bt_valarray(this_node);
 		*node_off = fs64_to_cpu(sb, valarray[0]);
 	}
 	if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
@@ -622,7 +621,7 @@ befs_leafnode(befs_btree_node * node)
  *
  * Except that rounding up to 8 works, and rounding up to 4 doesn't.
  */
-static u16 *
+static fs16 *
 befs_bt_keylen_index(befs_btree_node * node)
 {
 	const int keylen_align = 8;
@@ -633,7 +632,7 @@ befs_bt_keylen_index(befs_btree_node * node)
 	if (tmp)
 		off += keylen_align - tmp;
 
-	return (u16 *) ((void *) node->od_node + off);
+	return (fs16 *) ((void *) node->od_node + off);
 }
 
 /**
@@ -643,13 +642,13 @@ befs_bt_keylen_index(befs_btree_node * node)
  * Returns a pointer to the start of the value array
  * of the node pointed to by the node header
  */
-static befs_off_t *
+static fs64 *
 befs_bt_valarray(befs_btree_node * node)
 {
 	void *keylen_index_start = (void *) befs_bt_keylen_index(node);
-	size_t keylen_index_size = node->head.all_key_count * sizeof (u16);
+	size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
 
-	return (befs_off_t *) (keylen_index_start + keylen_index_size);
+	return (fs64 *) (keylen_index_start + keylen_index_size);
 }
 
 /**
@@ -681,7 +680,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
 {
 	int prev_key_end;
 	char *keystart;
-	u16 *keylen_index;
+	fs16 *keylen_index;
 
 	if (index < 0 || index > node->head.all_key_count) {
 		*keylen = 0;
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b7d6b920f65f..aacb4da6298a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -18,7 +18,6 @@
 #include "befs.h"
 #include "datastream.h"
 #include "io.h"
-#include "endian.h"
 
 const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
 
@@ -312,7 +311,7 @@ befs_find_brun_indirect(struct super_block *sb,
 	befs_blocknr_t indir_start_blk;
 	befs_blocknr_t search_blk;
 	struct buffer_head *indirblock;
-	befs_block_run *array;
+	befs_disk_block_run *array;
 
 	befs_block_run indirect = data->indirect;
 	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
@@ -334,7 +333,7 @@ befs_find_brun_indirect(struct super_block *sb,
 		return BEFS_ERR;
 	}
 
-	array = (befs_block_run *) indirblock->b_data;
+	array = (befs_disk_block_run *) indirblock->b_data;
 
 	for (j = 0; j < arraylen; ++j) {
 		int len = fs16_to_cpu(sb, array[j].len);
@@ -427,7 +426,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	struct buffer_head *dbl_indir_block;
 	struct buffer_head *indir_block;
 	befs_block_run indir_run;
-	befs_inode_addr *iaddr_array = NULL;
+	befs_disk_inode_addr *iaddr_array = NULL;
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
 	befs_blocknr_t indir_start_blk =
@@ -482,7 +481,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 
 	dbl_block_indx =
 	    dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) dbl_indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
 	indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
 	brelse(dbl_indir_block);
 	iaddr_array = NULL;
@@ -507,7 +506,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	}
 
 	block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
 	*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
 	brelse(indir_block);
 	iaddr_array = NULL;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 875cc0aa318c..e831a8f30849 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -21,7 +21,6 @@
 #endif				/* __KERNEL__ */
 
 #include "befs.h"
-#include "endian.h"
 
 #define ERRBUFSIZE 1024
 
@@ -125,7 +124,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 	befs_debug(sb, "  type %08x", fs32_to_cpu(sb, inode->type));
 	befs_debug(sb, "  inode_size %u", fs32_to_cpu(sb, inode->inode_size));
 
-	if (S_ISLNK(inode->mode)) {
+	if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) {
 		befs_debug(sb, "  Symbolic link [%s]", inode->data.symlink);
 	} else {
 		int i;
@@ -231,21 +230,20 @@ befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
 
 /* unused */
 void
-befs_dump_run(const struct super_block *sb, befs_block_run run)
+befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
-	run = fsrun_to_cpu(sb, run);
+	befs_block_run n = fsrun_to_cpu(sb, run);
 
-	befs_debug(sb, "[%u, %hu, %hu]",
-		   run.allocation_group, run.start, run.len);
+	befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len);
 
 #endif				//CONFIG_BEFS_DEBUG
 }
 #endif				/* 0 */
 
 void
-befs_dump_index_entry(const struct super_block *sb, befs_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 9ecaea4e3325..e254a20869f4 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -10,85 +10,84 @@
 #define LINUX_BEFS_ENDIAN
 
 #include <linux/byteorder/generic.h>
-#include "befs.h"
 
 static inline u64
-fs64_to_cpu(const struct super_block *sb, u64 n)
+fs64_to_cpu(const struct super_block *sb, fs64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le64_to_cpu(n);
+		return le64_to_cpu((__force __le64)n);
 	else
-		return be64_to_cpu(n);
+		return be64_to_cpu((__force __be64)n);
 }
 
-static inline u64
+static inline fs64
 cpu_to_fs64(const struct super_block *sb, u64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le64(n);
+		return (__force fs64)cpu_to_le64(n);
 	else
-		return cpu_to_be64(n);
+		return (__force fs64)cpu_to_be64(n);
 }
 
 static inline u32
-fs32_to_cpu(const struct super_block *sb, u32 n)
+fs32_to_cpu(const struct super_block *sb, fs32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le32_to_cpu(n);
+		return le32_to_cpu((__force __le32)n);
 	else
-		return be32_to_cpu(n);
+		return be32_to_cpu((__force __be32)n);
 }
 
-static inline u32
+static inline fs32
 cpu_to_fs32(const struct super_block *sb, u32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le32(n);
+		return (__force fs32)cpu_to_le32(n);
 	else
-		return cpu_to_be32(n);
+		return (__force fs32)cpu_to_be32(n);
 }
 
 static inline u16
-fs16_to_cpu(const struct super_block *sb, u16 n)
+fs16_to_cpu(const struct super_block *sb, fs16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le16_to_cpu(n);
+		return le16_to_cpu((__force __le16)n);
 	else
-		return be16_to_cpu(n);
+		return be16_to_cpu((__force __be16)n);
 }
 
-static inline u16
+static inline fs16
 cpu_to_fs16(const struct super_block *sb, u16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le16(n);
+		return (__force fs16)cpu_to_le16(n);
 	else
-		return cpu_to_be16(n);
+		return (__force fs16)cpu_to_be16(n);
 }
 
 /* Composite types below here */
 
 static inline befs_block_run
-fsrun_to_cpu(const struct super_block *sb, befs_block_run n)
+fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n)
 {
 	befs_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
-		run.allocation_group = le32_to_cpu(n.allocation_group);
-		run.start = le16_to_cpu(n.start);
-		run.len = le16_to_cpu(n.len);
+		run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group);
+		run.start = le16_to_cpu((__force __le16)n.start);
+		run.len = le16_to_cpu((__force __le16)n.len);
 	} else {
-		run.allocation_group = be32_to_cpu(n.allocation_group);
-		run.start = be16_to_cpu(n.start);
-		run.len = be16_to_cpu(n.len);
+		run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group);
+		run.start = be16_to_cpu((__force __be16)n.start);
+		run.len = be16_to_cpu((__force __be16)n.len);
 	}
 	return run;
 }
 
-static inline befs_block_run
+static inline befs_disk_block_run
 cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 {
-	befs_block_run run;
+	befs_disk_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
 		run.allocation_group = cpu_to_le32(n.allocation_group);
@@ -103,7 +102,7 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 }
 
 static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_data_stream n)
+fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
 {
 	befs_data_stream data;
 	int i;
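
The whole BeFS series above exists so that sparse can type-check byte order: on-disk words become opaque fs16/fs32/fs64 "bitwise" types, and only the helpers in endian.h may convert them. A self-contained userspace sketch of the idiom (the u32/fs32 typedefs here are local stand-ins, and a little-endian host is assumed in the conversion helper):

/* Under sparse, fs32 is a distinct bitwise type: using it directly in
 * arithmetic or comparisons, without the conversion helper, draws a
 * warning; plain compilers see an ordinary u32. */
#ifdef __CHECKER__
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int u32;
typedef u32 __bitwise fs32;

static inline u32 fs32_to_cpu(fs32 n)
{
	return (__force u32)n;	/* little-endian host assumed in this sketch */
}

int main(void)
{
	fs32 on_disk = (__force fs32)42u;	/* value as read from disk */
	u32 host = fs32_to_cpu(on_disk);	/* fine: converted explicitly */

	return host == 42 ? 0 : 1;
	/* return (__force u32)on_disk + 1;  direct use is what sparse flags */
}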
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index d41c9247ae8a..94c17f9a9576 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -8,7 +8,6 @@
 
 #include "befs.h"
 #include "inode.h"
-#include "endian.h"
 
 /*
 	Validates the correctness of the befs inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 57020c7a7e65..07f7144f0e2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,7 +22,6 @@
 #include "datastream.h"
 #include "super.h"
 #include "io.h"
-#include "endian.h"
 
 MODULE_DESCRIPTION("BeOS File System (BeFS) driver");
 MODULE_AUTHOR("Will Dyson");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 4557acbac528..8c3401ff6d6a 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -11,7 +11,6 @@
 
 #include "befs.h"
 #include "super.h"
-#include "endian.h"
 
 /**
  * load_befs_sb -- Read from disk and properly byteswap all the fields
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06435f3665f4..79b05a1a4365 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1152,7 +1152,7 @@ static int dump_write(struct file *file, const void *addr, int nr)
 static int dump_seek(struct file *file, loff_t off)
 {
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, 1) != off)
+		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
 			return 0;
 	} else {
 		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
@@ -1220,7 +1220,7 @@ static int notesize(struct memelfnote *en)
 
 static int alignfile(struct file *file, loff_t *foffset)
 {
-	char buf[4] = { 0, };
+	static const char buf[4] = { 0, };
 	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
 	return 1;
 }
@@ -1569,7 +1569,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs+1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += (segs + 1) * sizeof(struct elf_phdr);	/* Program headers */
+	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
@@ -1586,8 +1587,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
-	foffset = offset;
-
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
 	/* Write program headers for segments dump */
@@ -1612,7 +1611,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
 		DUMP_WRITE(&phdr, sizeof(phdr));
-		foffset += sizeof(phdr);
 	}
 
 #ifdef ELF_CORE_WRITE_EXTRA_PHDRS
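
The dump_seek() fix above works because llseek with SEEK_CUR returns the resulting absolute file position, not the distance moved, so the old "!= off" comparison misfired whenever the current offset was nonzero; checking for a negative return is the correct error test. A userspace demonstration of the same semantics with lseek(2):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	off_t pos;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	lseek(fd, 100, SEEK_SET);	/* position is now 100 */
	pos = lseek(fd, 50, SEEK_CUR);	/* returns 150 (new position), not 50 */
	printf("after a relative seek of 50: pos=%lld\n", (long long)pos);
	close(fd);
	return 0;
}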
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 32b5d625ce9c..5bcdaaf4eae0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -29,6 +29,7 @@
 #include <linux/personality.h>
 #include <linux/init.h>
 
+#include <asm/a.out.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
@@ -194,6 +195,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct files_struct *files;
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
@@ -208,15 +210,27 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	size = som_ex->aux_header_size;
 	if (size > SOM_PAGESIZE)
 		goto out;
-	hpuxhdr = (struct som_exec_auxhdr *) kmalloc(size, GFP_KERNEL);
+	hpuxhdr = kmalloc(size, GFP_KERNEL);
 	if (!hpuxhdr)
 		goto out;
 
 	retval = kernel_read(bprm->file, som_ex->aux_header_location,
 			(char *) hpuxhdr, size);
+	if (retval != size) {
+		if (retval >= 0)
+			retval = -EIO;
+		goto out_free;
+	}
+
+	files = current->files;	/* Refcounted so ok */
+	retval = unshare_files();
 	if (retval < 0)
 		goto out_free;
-#error "Fix security hole before enabling me"
+	if (files == current->files) {
+		put_files_struct(files);
+		files = NULL;
+	}
+
 	retval = get_unused_fd();
 	if (retval < 0)
 		goto out_free;
diff --git a/fs/bio.c b/fs/bio.c
index 8f93e939f213..f95c8749499f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -79,7 +79,6 @@ static struct bio_set *fs_bio_set;
 static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
 	struct bio_vec *bvl;
-	struct biovec_slab *bp;
 
 	/*
 	 * see comment near bvec_array define!
@@ -98,10 +97,12 @@ static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned lon
98 * idx now points to the pool we want to allocate from 97 * idx now points to the pool we want to allocate from
99 */ 98 */
100 99
101 bp = bvec_slabs + *idx;
102 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
103 if (bvl) 101 if (bvl) {
102 struct biovec_slab *bp = bvec_slabs + *idx;
103
104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); 104 memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
105 }
105 106
106 return bvl; 107 return bvl;
107} 108}
@@ -166,7 +167,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
166 167
167 bio_init(bio); 168 bio_init(bio);
168 if (likely(nr_iovecs)) { 169 if (likely(nr_iovecs)) {
169 unsigned long idx; 170 unsigned long idx = 0; /* shut up gcc */
170 171
171 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 172 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
172 if (unlikely(!bvl)) { 173 if (unlikely(!bvl)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 16cfbcd254f1..35527dca1dbc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -452,6 +452,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 			       bdevname(bh->b_bdev, b));
 		}
 		set_bit(AS_EIO, &page->mapping->flags);
+		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 		SetPageError(page);
 	}
@@ -571,6 +572,10 @@ EXPORT_SYMBOL(mark_buffer_async_write);
 static inline void __remove_assoc_queue(struct buffer_head *bh)
 {
 	list_del_init(&bh->b_assoc_buffers);
+	WARN_ON(!bh->b_assoc_map);
+	if (buffer_write_io_error(bh))
+		set_bit(AS_EIO, &bh->b_assoc_map->flags);
+	bh->b_assoc_map = NULL;
 }
 
 int inode_has_buffers(struct inode *inode)
@@ -669,6 +674,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		spin_lock(&buffer_mapping->private_lock);
 		list_move_tail(&bh->b_assoc_buffers,
 				&mapping->private_list);
+		bh->b_assoc_map = mapping;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
@@ -701,7 +707,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  */
 int __set_page_dirty_buffers(struct page *page)
 {
-	struct address_space * const mapping = page->mapping;
+	struct address_space * const mapping = page_mapping(page);
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
@@ -762,7 +771,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	spin_lock(lock);
 	while (!list_empty(list)) {
 		bh = BH_ENTRY(list->next);
-		list_del_init(&bh->b_assoc_buffers);
+		__remove_assoc_queue(bh);
 		if (buffer_dirty(bh) || buffer_locked(bh)) {
 			list_add(&bh->b_assoc_buffers, &tmp);
 			if (buffer_dirty(bh)) {
@@ -783,7 +792,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
-		__remove_assoc_queue(bh);
+		list_del_init(&bh->b_assoc_buffers);
 		get_bh(bh);
 		spin_unlock(lock);
 		wait_on_buffer(bh);
@@ -1039,8 +1048,21 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 	} while ((size << sizebits) < PAGE_SIZE);
 
 	index = block >> sizebits;
-	block = index << sizebits;
 
+	/*
+	 * Check for a block which wants to lie outside our maximum possible
+	 * pagecache index. (this comparison is done using sector_t types).
+	 */
+	if (unlikely(index != block >> sizebits)) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR "%s: requested out-of-range block %llu for "
+			"device %s\n",
+			__FUNCTION__, (unsigned long long)block,
+			bdevname(bdev, b));
+		return -EIO;
+	}
+	block = index << sizebits;
 	/* Create a page with the proper size buffers.. */
 	page = grow_dev_page(bdev, block, index, size);
 	if (!page)
@@ -1067,12 +1089,16 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 
 	for (;;) {
 		struct buffer_head * bh;
+		int ret;
 
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
 
-		if (!grow_buffers(bdev, block, size))
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
 			free_more_memory();
 	}
 }
@@ -1147,6 +1173,7 @@ void __bforget(struct buffer_head *bh)
 
 		spin_lock(&buffer_mapping->private_lock);
 		list_del_init(&bh->b_assoc_buffers);
+		bh->b_assoc_map = NULL;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 	__brelse(bh);
@@ -1834,6 +1861,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 			clear_buffer_new(bh);
 			kaddr = kmap_atomic(page, KM_USER0);
 			memset(kaddr+block_start, 0, bh->b_size);
+			flush_dcache_page(page);
 			kunmap_atomic(kaddr, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
@@ -2340,6 +2368,7 @@ failed:
 	 */
 	kaddr = kmap_atomic(page, KM_USER0);
 	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
 	SetPageUptodate(page);
 	set_page_dirty(page);
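The grow_buffers() hunk above rejects blocks whose page index cannot be represented: where sector_t is wider than the page index, block >> sizebits may truncate, and without the check the request would silently map a different page. A standalone sketch of the same comparison (the 32-bit pgoff_t and 64-bit sector_t widths are assumed here to mimic a 32-bit machine with large block device support):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;	/* assumed widths, for illustration only */
typedef uint32_t pgoff_t;

static int block_in_range(sector_t block, int sizebits)
{
	pgoff_t index = block >> sizebits;	/* may truncate high bits */

	/* same idea as grow_buffers(): redo the shift in the wide type */
	return (sector_t)index == (block >> sizebits);
}

int main(void)
{
	/* 512-byte blocks in 4K pages => sizebits = 3 */
	printf("%d\n", block_in_range(0x00000000fffffff8ULL, 3)); /* 1: fits */
	printf("%d\n", block_in_range(0x0000001000000000ULL, 3)); /* 0: overflows */
	return 0;
}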
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index d0776ac2b804..5eff35d6e564 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -31,8 +31,8 @@ struct cifs_sid {
 } __attribute__((packed));
 
 /* everyone */
-extern const struct cifs_sid sid_everyone;
+/* extern const struct cifs_sid sid_everyone;*/
 /* group users */
-extern const struct cifs_sid sid_user;
+/* extern const struct cifs_sid sid_user;*/
 
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
index 03e359b32861..152fa2dcfc6c 100644
--- a/fs/cifs/cifsencrypt.h
+++ b/fs/cifs/cifsencrypt.h
@@ -27,8 +27,6 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n);
 /* smbdes.c */
 extern void E_P16(unsigned char *p14, unsigned char *p16);
 extern void E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24);
-extern void D_P16(unsigned char *p14, unsigned char *in, unsigned char *out);
-extern void E_old_pw_hash(unsigned char *, unsigned char *, unsigned char *);
 
 
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c00c654f2e11..84976cdbe713 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -63,6 +63,7 @@ extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
 extern struct task_struct * dnotifyThread; /* remove sparse warning */
 struct task_struct * dnotifyThread = NULL;
+static struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
 MODULE_PARM_DESC(CIFSMaxBufSize,"Network buffer size (not including header). Default: 16384 Range: 8192 to 130048");
@@ -198,10 +199,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/* Only need to call the old QFSInfo if failed
 	   on newer one */
 	if(rc)
-		rc = CIFSSMBQFSInfo(xid, pTcon, buf);
+	if(pTcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
 
-	/* Old Windows servers do not support level 103, retry with level
-	   one if old server failed the previous call */
+	/* Some old Windows servers also do not support level 103, retry with
+	   older level one if old server failed the previous call or we
+	   bypassed it because we detected that this was an older LANMAN sess */
 	if(rc)
 		rc = SMBOldQFSInfo(xid, pTcon, buf);
 	/*
@@ -435,13 +438,21 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 	return;
 }
 
+#ifdef CONFIG_CIFS_STATS2
+static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+{
+	/* BB FIXME */
+	return 0;
+}
+#endif
+
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
 
-struct super_operations cifs_super_ops = {
+static struct super_operations cifs_super_ops = {
 	.read_inode = cifs_read_inode,
 	.put_super = cifs_put_super,
 	.statfs = cifs_statfs,
@@ -454,6 +465,9 @@ struct super_operations cifs_super_ops = {
 	.show_options = cifs_show_options,
 	.umount_begin = cifs_umount_begin,
 	.remount_fs = cifs_remount,
+#ifdef CONFIG_CIFS_STATS2
+	.show_stats = cifs_show_stats,
+#endif
 };
 
 static int
@@ -495,7 +509,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 {
 	/* origin == SEEK_END => we must revalidate the cached file length */
-	if (origin == 2) {
+	if (origin == SEEK_END) {
 		int retval = cifs_revalidate(file->f_dentry);
 		if (retval < 0)
 			return (loff_t)retval;
@@ -903,7 +917,7 @@ init_cifs(void)
 #ifdef CONFIG_PROC_FS
 	cifs_proc_init();
 #endif
-	INIT_LIST_HEAD(&GlobalServerList); /* BB not implemented yet */
+/*	INIT_LIST_HEAD(&GlobalServerList);*/ /* BB not implemented yet */
 	INIT_LIST_HEAD(&GlobalSMBSessionList);
 	INIT_LIST_HEAD(&GlobalTreeConnectionList);
 	INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -931,6 +945,7 @@ init_cifs(void)
 	GlobalCurrentXid = 0;
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
+	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
 	spin_lock_init(&GlobalMid_Lock);
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bea875d9a46a..a243f779b363 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
-extern struct super_operations cifs_super_ops;
+/* extern struct super_operations cifs_super_ops;*/
 extern void cifs_read_inode(struct inode *);
 extern void cifs_delete_inode(struct inode *);
 /* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b24006c47df1..74d3ccbb103b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -153,7 +153,7 @@ struct TCP_Server_Info {
 	char sessid[4];		/* unique token id for this session */
 				/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
-	__u16 timeZone;
+	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
 	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
 	/* 16th byte of RFC1001 workstation name is always null */
@@ -203,9 +203,14 @@ struct cifsSesInfo {
 	char * domainName;
 	char * password;
 };
-/* session flags */
+/* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
-
+#define CIFS_SES_OS2 2
+#define CIFS_SES_W9X 4
+/* following flag is set for old servers such as OS2 (and Win95?)
+   which do not negotiate NTLM or POSIX dialects, but instead
+   negotiate one of the older LANMAN dialects */
+#define CIFS_SES_LANMAN 8
 /*
  * there is one of these for each connection to a resource on a particular
  * session
@@ -512,7 +517,8 @@ require use of the stronger protocol */
 * This list helps improve performance and eliminate the messages indicating
 * that we had a communications error talking to the server in this list.
 */
-GLOBAL_EXTERN struct servers_not_supported *NotSuppList;	/*@z4a */
+/* Feature not supported */
+/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
 
 /*
  * The following is a hash table of all the users we know about.
@@ -568,7 +574,6 @@ GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;	/* enable smb packet signing */
-GLOBAL_EXTERN unsigned int secFlags;
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
 GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
 GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 81df2bf8e75a..6df9dadba647 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -26,7 +26,8 @@
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
-#define CIFS_PROT 1
+#define LANMAN2_PROT 1
+#define CIFS_PROT 2
 #else
 #define CIFS_PROT 0
 #endif
@@ -408,6 +409,8 @@ typedef struct negotiate_req {
 
 /* Dialect index is 13 for LANMAN */
 
+#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
+
 typedef struct lanman_neg_rsp {
 	struct smb_hdr hdr;	/* wct = 13 */
 	__le16 DialectIndex;
@@ -417,7 +420,10 @@ typedef struct lanman_neg_rsp {
 	__le16 MaxNumberVcs;
 	__le16 RawMode;
 	__le32 SessionKey;
-	__le32 ServerTime;
+	struct {
+		__le16 Time;
+		__le16 Date;
+	} __attribute__((packed)) SrvTime;
 	__le16 ServerTimeZone;
 	__le16 EncryptionKeyLength;
 	__le16 Reserved;
@@ -674,7 +680,7 @@ typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on
 typedef struct smb_com_close_req {
 	struct smb_hdr hdr;	/* wct = 3 */
 	__u16 FileID;
-	__u32 LastWriteTime;	/* should be zero */
+	__u32 LastWriteTime;	/* should be zero or -1 */
 	__u16 ByteCount;	/* 0 */
 } __attribute__((packed)) CLOSE_REQ;
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b35c55c3c8bb..f1f8225102f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -50,12 +50,12 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct kvec *, int /* nvec to send */,
 			int * /* type of buf returned */ , const int long_op);
-extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *,
+extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
+			struct cifsTconInfo *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
 			int * /* bytes returned */);
-extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
+extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
@@ -80,6 +80,9 @@ extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16,
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern struct timespec cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ );
 extern u64 cifs_UnixTimeToNT(struct timespec);
+extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO * pfile_info,
@@ -116,6 +119,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO * findData,
+			int legacy /* whether to use old info level */,
 			const struct nls_table *nls_codepage, int remap);
 extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
@@ -279,8 +283,6 @@ extern void sesInfoFree(struct cifsSesInfo *);
 extern struct cifsTconInfo *tconInfoAlloc(void);
 extern void tconInfoFree(struct cifsTconInfo *);
 
-extern int cifs_reconnect(struct TCP_Server_Info *server);
-
 extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 075d8fb3d376..098790eb2aa1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -46,6 +46,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"},
 	{POSIX_PROT, "\2POSIX 2"},
@@ -58,6 +59,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"},
 	{BAD_PROT, "\2"}
@@ -67,13 +69,13 @@ static struct {
 /* define the number of elements in the cifs dialect array */
 #ifdef CONFIG_CIFS_POSIX
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 3
+#define CIFS_NUM_PROT 4
 #else
 #define CIFS_NUM_PROT 2
 #endif /* CIFS_WEAK_PW_HASH */
 #else /* not posix */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 2
+#define CIFS_NUM_PROT 3
 #else
 #define CIFS_NUM_PROT 1
 #endif /* CONFIG_CIFS_WEAK_PW_HASH */
@@ -397,6 +399,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info * server;
 	u16 count;
 	unsigned int secFlags;
+	u16 dialect;
 
 	if(ses->server)
 		server = ses->server;
@@ -436,9 +439,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+	dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1,("Dialect: %d", dialect));
 	/* Check wct = 1 error case */
-	if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
+	if((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -446,7 +450,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if((pSMBr->hdr.WordCount == 13)
-		&& (pSMBr->DialectIndex == LANMAN_PROT)) {
+		&& ((dialect == LANMAN_PROT)
+		|| (dialect == LANMAN2_PROT))) {
+		__s16 tmp;
 		struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
 
 		if((secFlags & CIFSSEC_MAY_LANMAN) ||
@@ -472,12 +478,44 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->maxRw = 0;/* we do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
-		server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+		if (tmp == -1) {
+			/* OS/2 often does not set timezone therefore
+			 * we must use server time to calc time zone.
+			 * Could deviate slightly from the right zone.
+			 * Smallest defined timezone difference is 15 minutes
+			 * (i.e. Nepal). Rounding up/down is done to match
+			 * this requirement.
+			 */
+			int val, seconds, remain, result;
+			struct timespec ts, utc;
+			utc = CURRENT_TIME;
+			ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
+						le16_to_cpu(rsp->SrvTime.Time));
+			cFYI(1,("SrvTime: %d sec since 1970 (utc: %d) diff: %d",
+				(int)ts.tv_sec, (int)utc.tv_sec,
+				(int)(utc.tv_sec - ts.tv_sec)));
+			val = (int)(utc.tv_sec - ts.tv_sec);
+			seconds = val < 0 ? -val : val;
+			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+			remain = seconds % MIN_TZ_ADJ;
+			if(remain >= (MIN_TZ_ADJ / 2))
+				result += MIN_TZ_ADJ;
+			if(val < 0)
+				result = - result;
+			server->timeAdj = result;
+		} else {
+			server->timeAdj = (int)tmp;
+			server->timeAdj *= 60; /* also in seconds */
+		}
+		cFYI(1,("server->timeAdj: %d seconds", server->timeAdj));
+
 
 		/* BB get server time for time conversions and add
 		code to use it and timezone since this is not UTC */
 
-		if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		if (rsp->EncryptionKeyLength ==
+				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
 			memcpy(server->cryptKey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
@@ -531,7 +569,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	cFYI(0, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
-	server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
 		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
@@ -1617,7 +1656,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
 
 	pSMB->FileID = (__u16) smb_file_id;
-	pSMB->LastWriteTime = 0;
+	pSMB->LastWriteTime = 0xFFFFFFFF;
 	pSMB->ByteCount = 0;
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2773,9 +2812,11 @@ GetExtAttrOut:
 
 
 /* security id for everyone */
-const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
+const static struct cifs_sid sid_everyone =
+		{1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
 /* group users */
-const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
+const static struct cifs_sid sid_user =
+		{1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
 
 /* Convert CIFS ACL to POSIX form */
 static int parse_sec_desc(struct cifs_sid * psec_desc, int acl_len)
@@ -2856,7 +2897,6 @@ qsec_out:
 	return rc;
 }
 
-
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
 int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -2898,7 +2938,16 @@ QInfRetry:
 	if (rc) {
 		cFYI(1, ("Send error in QueryInfo = %d", rc));
 	} else if (pFinfo) {            /* decode response */
+		struct timespec ts;
+		__u32 time = le32_to_cpu(pSMBr->last_write_time);
+		/* BB FIXME - add time zone adjustment BB */
 		memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
+		ts.tv_nsec = 0;
+		ts.tv_sec = time;
+		/* decode time fields */
+		pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+		pFinfo->LastWriteTime = pFinfo->ChangeTime;
+		pFinfo->LastAccessTime = 0;
 		pFinfo->AllocationSize =
 			cpu_to_le64(le32_to_cpu(pSMBr->size));
 		pFinfo->EndOfFile = pFinfo->AllocationSize;
@@ -2922,6 +2971,7 @@ int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		 const unsigned char *searchName,
 		 FILE_ALL_INFO * pFindData,
+		 int legacy /* old style infolevel */,
 		 const struct nls_table *nls_codepage, int remap)
 {
 /* level 263 SMB_QUERY_FILE_ALL_INFO */
@@ -2970,7 +3020,10 @@ QPathInfoRetry:
 	byte_count = params + 1 /* pad */ ;
 	pSMB->TotalParameterCount = cpu_to_le16(params);
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	if(legacy)
+		pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
 	pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -2982,13 +3035,24 @@ QPathInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 40))
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (!legacy && (pSMBr->ByteCount < 40))
 			rc = -EIO;	/* bad smb */
+		else if(legacy && (pSMBr->ByteCount < 24))
+			rc = -EIO; /* 24 or 26 expected but we do not read last field */
 		else if (pFindData){
+			int size;
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			if(legacy) /* we do not read the last field, EAsize, fortunately
+				   since it varies by subdialect and on Set vs. Get, is
+				   two bytes or 4 bytes depending but we don't care here */
+				size = sizeof(FILE_INFO_STANDARD);
+			else
+				size = sizeof(FILE_ALL_INFO);
 			memcpy((char *) pFindData,
 			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, sizeof (FILE_ALL_INFO));
+			       data_offset, size);
 		} else
 			rc = -ENOMEM;
 	}
@@ -3613,6 +3677,14 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, searchName, name_len);
 	}
 
+	if(ses->server) {
+		if(ses->server->secMode &
+		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	}
+
+	pSMB->hdr.Uid = ses->Suid;
+
 	params = 2 /* level */ + name_len /*includes null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->DataCount = 0;
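When an OS/2-era server reports ServerTimeZone as -1, the CIFSSMBNegotiate() hunk above derives the zone from the gap between client and server clocks, snapped to the 15-minute grid (MIN_TZ_ADJ) that real timezones use. A userspace sketch of just that rounding step (round_tz_adj is a made-up name for illustration; the kernel code does this inline):

#include <stdio.h>
#include <stdlib.h>

#define MIN_TZ_ADJ (15 * 60)	/* smallest timezone step, e.g. Nepal */

static int round_tz_adj(int val)	/* val = utc minus server time, in seconds */
{
	int seconds = abs(val);
	int result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;

	if (seconds % MIN_TZ_ADJ >= MIN_TZ_ADJ / 2)
		result += MIN_TZ_ADJ;	/* round to the nearer grid point */
	return (val < 0) ? -result : result;
}

int main(void)
{
	/* a clock skew of -4h59m rounds to a clean -5h adjustment */
	printf("%d\n", round_tz_adj(-17940));	/* prints -18000 */
	return 0;
}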
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c78762051da4..4093d5332930 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,7 +109,7 @@ static int ipv6_connect(struct sockaddr_in6 *psin_server,
  * wake up waiters on reconnection? - (not needed currently)
  */
 
-int
+static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
@@ -771,13 +771,18 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 	separator[0] = ',';
 	separator[1] = 0;
 
-	memset(vol->source_rfc1001_name,0x20,15);
-	for(i=0;i < strnlen(utsname()->nodename,15);i++) {
-		/* does not have to be a perfect mapping since the field is
-		informational, only used for servers that do not support
-		port 445 and it can be overridden at mount time */
-		vol->source_rfc1001_name[i] =
-			toupper(utsname()->nodename[i]);
+	if (Local_System_Name[0] != 0)
+		memcpy(vol->source_rfc1001_name, Local_System_Name,15);
+	else {
+		char *nodename = utsname()->nodename;
+		int n = strnlen(nodename,15);
+		memset(vol->source_rfc1001_name,0x20,15);
+		for(i=0 ; i < n ; i++) {
+			/* does not have to be perfect mapping since field is
+			informational, only used for servers that do not support
+			port 445 and it can be overridden at mount time */
+			vol->source_rfc1001_name[i] = toupper(nodename[i]);
+		}
 	}
 	vol->source_rfc1001_name[15] = 0;
 	/* null target name indicates to use *SMBSERVR default called name
@@ -3215,7 +3220,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		}
 		/* else do not bother copying these informational fields */
 	}
-	if(smb_buffer_response->WordCount == 3)
+	if((smb_buffer_response->WordCount == 3) ||
+	   (smb_buffer_response->WordCount == 7))
+		/* field is in same location */
 		tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 	else
 		tcon->Flags = 0;
@@ -3312,19 +3319,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			first_time = 1;
 	}
 	if (!rc) {
+		pSesInfo->flags = 0;
 		pSesInfo->capabilities = pSesInfo->server->capabilities;
 		if(linuxExtEnabled == 0)
 			pSesInfo->capabilities &= (~CAP_UNIX);
 	/*	pSesInfo->sequence_number = 0;*/
-		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d",
+		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
-			pSesInfo->server->timeZone));
+			pSesInfo->server->timeAdj));
 		if(experimEnabled < 2)
 			rc = CIFS_SessSetup(xid, pSesInfo,
 					    first_time, nls_info);
 		else if (extended_security
-				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+				&& (pSesInfo->capabilities
+					& CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
 			rc = -EOPNOTSUPP;
 		} else if (extended_security
@@ -3338,7 +3347,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		if (!rc) {
 			if(ntlmv2_flag) {
 				char * v2_response;
-				cFYI(1,("Can use more secure NTLM version 2 password hash"));
+				cFYI(1,("more secure NTLM ver2 hash"));
 				if(CalcNTLMv2_partial_mac_key(pSesInfo,
 					nls_info)) {
 					rc = -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6b90ef98e4cf..35d54bb0869a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -337,6 +337,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		pfindData = (FILE_ALL_INFO *)buf;
 		/* could do find first instead but this returns more info */
 		rc = CIFSSMBQPathInfo(xid, pTcon, search_path, pfindData,
+			      0 /* not legacy */,
 			      cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 		/* BB optimize code so we do not make the above call
@@ -384,8 +385,10 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* get new inode */
 		if (*pinode == NULL) {
 			*pinode = new_inode(sb);
-			if (*pinode == NULL)
+			if (*pinode == NULL) {
+				kfree(buf);
 				return -ENOMEM;
+			}
 			/* Is an i_ino of zero legal? Can we use that to check
 			   if the server supports returning inode numbers?  Are
 			   there other sanity checks we can use to ensure that
@@ -431,8 +434,11 @@ int cifs_get_inode_info(struct inode **pinode,
 		   (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		inode->i_atime =
-			cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+		if(pfindData->LastAccessTime)
+			inode->i_atime = cifs_NTtimeToUnix
+				(le64_to_cpu(pfindData->LastAccessTime));
+		else /* do not need to use current_fs_time - time not stored */
+			inode->i_atime = CURRENT_TIME;
 		inode->i_mtime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
 		inode->i_ctime =
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a57f5d6e6213..0bee8b7e521a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -254,7 +254,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 						tmpbuffer,
 						len - 1,
 						cifs_sb->local_nls);
-	else {
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		cERROR(1,("SFU style symlinks not implemented yet"));
+		/* add open and read as in fs/cifs/inode.c */
+
+	} else {
 		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
 				OPEN_REPARSE_POINT,&fid, &oplock, NULL,
 				cifs_sb->local_nls,
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 7aa23490541f..ccebf9b7eb86 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -252,10 +252,11 @@ MD5Transform(__u32 buf[4], __u32 const in[16])
 	buf[3] += d;
 }
 
+#if 0   /* currently unused */
 /***********************************************************************
  the rfc 2104 version of hmac_md5 initialisation.
 ***********************************************************************/
-void
+static void
 hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		      struct HMACMD5Context *ctx)
 {
@@ -289,6 +290,7 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 	MD5Init(&ctx->ctx);
 	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
 }
+#endif
 
 /***********************************************************************
  the microsoft version of hmac_md5 initialisation.
@@ -350,7 +352,8 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
  single function to calculate an HMAC MD5 digest from data.
  use the microsoft hmacmd5 init method because the key is 16 bytes.
 ************************************************************/
-void
+#if 0 /* currently unused */
+static void
 hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	 unsigned char *digest)
 {
@@ -361,3 +364,4 @@ hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	}
 	hmac_md5_final(digest, &ctx);
 }
+#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index 00e1c5394fe1..f7d4f4197bac 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -27,12 +27,12 @@ void MD5Final(unsigned char digest[16], struct MD5Context *context);
 
 /* The following definitions come from lib/hmacmd5.c  */
 
-void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);
+/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
+			struct HMACMD5Context *ctx);*/
 void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_update(const unsigned char *text, int text_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-		unsigned char *digest);
+/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
+		unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 22c937e5884f..bbc9cd34b6ea 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -389,7 +389,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	return;
 }
 
-int
+static int
 checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 {
 	/* Make sure that this really is an SMB, that it is a response,
@@ -418,26 +418,42 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 }
 
 int
-checkSMB(struct smb_hdr *smb, __u16 mid, int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
 	__u32 len = smb->smb_buf_length;
 	__u32 clc_len;  /* calculated length */
 	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
-	if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) ||
-	    (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) {
-		if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) {
-			if (((unsigned int)length >=
-				sizeof (struct smb_hdr) - 1)
+
+	if (length < 2 + sizeof (struct smb_hdr)) {
+		if ((length >= sizeof (struct smb_hdr) - 1)
 			    && (smb->Status.CifsError != 0)) {
 			smb->WordCount = 0;
 			/* some error cases do not return wct and bcc */
+			return 0;
+		} else if ((length == sizeof(struct smb_hdr) + 1) &&
+				(smb->WordCount == 0)) {
+			char * tmp = (char *)smb;
+			/* Need to work around a bug in two servers here */
+			/* First, check if the part of bcc they sent was zero */
+			if (tmp[sizeof(struct smb_hdr)] == 0) {
+				/* some servers return only half of bcc
+				 * on simple responses (wct, bcc both zero)
+				 * in particular have seen this on
+				 * ulogoffX and FindClose. This leaves
+				 * one byte of bcc potentially unitialized
+				 */
+				/* zero rest of bcc */
+				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
-		} else {
-			cERROR(1, ("Length less than smb header size"));
 			}
+			cERROR(1,("rcvd invalid byte count (bcc)"));
+		} else {
+			cERROR(1, ("Length less than smb header size"));
 		}
-		if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
-			cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
+		return 1;
+	}
+	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
 				   smb->Mid));
 		return 1;
 	}
@@ -446,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
 		return 1;
 	clc_len = smbCalcSize_LE(smb);
 
-	if(4 + len != (unsigned int)length) {
+	if(4 + len != length) {
 		cERROR(1, ("Length read does not match RFC1001 length %d",len));
 		return 1;
 	}
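The checkSMB() rewrite above also encodes a workaround: some servers answer simple requests (ulogoffX, FindClose) with wct and bcc both zero but send only the first byte of the 16-bit byte count, so the reply is exactly one byte longer than the header. If that first byte is zero, zeroing the missing second byte yields a valid empty bcc. A condensed sketch of that branch (HDR_SIZE is a placeholder for sizeof(struct smb_hdr), not the real wire layout; the caller is assumed to own a buffer with room for the extra byte):

#include <stdio.h>

#define HDR_SIZE 32	/* placeholder for sizeof(struct smb_hdr) */

/* returns 0 if the truncated reply could be fixed up, 1 otherwise */
static int fixup_half_bcc(unsigned char *buf, unsigned int length)
{
	if (length == HDR_SIZE + 1 && buf[HDR_SIZE] == 0) {
		buf[HDR_SIZE + 1] = 0;	/* complete the 16-bit byte count */
		return 0;
	}
	return 1;
}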
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ce87550e918f..992e80edc720 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -909,3 +909,61 @@ cifs_UnixTimeToNT(struct timespec t)
 	/* Convert to 100ns intervals and then add the NTFS time offset. */
 	return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
 }
+
+static int total_days_of_prev_months[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+
+
+__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
+{
+	return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
+}
+
+struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+{
+	struct timespec ts;
+	int sec, min, days, month, year;
+	SMB_TIME * st = (SMB_TIME *)&time;
+	SMB_DATE * sd = (SMB_DATE *)&date;
+
+	cFYI(1,("date %d time %d",date, time));
+
+	sec = 2 * st->TwoSeconds;
+	min = st->Minutes;
+	if((sec > 59) || (min > 59))
+		cERROR(1,("illegal time min %d sec %d", min, sec));
+	sec += (min * 60);
+	sec += 60 * 60 * st->Hours;
+	if(st->Hours > 24)
+		cERROR(1,("illegal hours %d",st->Hours));
+	days = sd->Day;
+	month = sd->Month;
+	if((days > 31) || (month > 12))
+		cERROR(1,("illegal date, month %d day: %d", month, days));
+	month -= 1;
+	days += total_days_of_prev_months[month];
+	days += 3652; /* account for difference in days between 1980 and 1970 */
+	year = sd->Year;
+	days += year * 365;
+	days += (year/4); /* leap year */
+	/* generalized leap year calculation is more complex, ie no leap year
+	for years/100 except for years/400, but since the maximum number for DOS
+	year is 2**7, the last year is 1980+127, which means we need only
+	consider 2 special case years, ie the years 2000 and 2100, and only
+	adjust for the lack of leap year for the year 2100, as 2000 was a
+	leap year (divisable by 400) */
+	if(year >= 120)  /* the year 2100 */
+		days = days - 1;  /* do not count leap year for the year 2100 */
+
+	/* adjust for leap year where we are still before leap day */
+	if(year != 120)
+		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
+	sec += 24 * 60 * 60 * days;
+
+	ts.tv_sec = sec;
+
+	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+
+	ts.tv_nsec = 0;
+	return ts;
+}
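cnvrtDosUnixTm() above unpacks the legacy DOS stamp through the SMB_TIME/SMB_DATE bitfield structs: the time word carries seconds/2, minutes and hours, and the date word carries day, month and years since 1980. A standalone decoder using explicit shifts instead of those structs (the standard FAT field layout is assumed):

#include <stdint.h>
#include <stdio.h>

static void decode_dos_stamp(uint16_t date, uint16_t time)
{
	int sec   = (time & 0x1f) * 2;		/* 5 bits, 2 s resolution */
	int min   = (time >> 5) & 0x3f;		/* 6 bits */
	int hour  = (time >> 11) & 0x1f;	/* 5 bits */
	int day   = date & 0x1f;		/* 5 bits */
	int month = (date >> 5) & 0x0f;		/* 4 bits, 1-12 */
	int year  = ((date >> 9) & 0x7f) + 1980;	/* 7 bits from 1980 */

	printf("%04d-%02d-%02d %02d:%02d:%02d\n",
	       year, month, day, hour, min, sec);
}

int main(void)
{
	decode_dos_stamp(0x352a, 0x4c20);	/* prints 2006-09-10 09:33:00 */
	return 0;
}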
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b27b34537bf2..b5b0a2a41bef 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -106,6 +106,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
+static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
+{
+	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
+	}
+	return;
+}
+
+
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		char * buf, int *pobject_type, int isNewInode)
 {
@@ -135,16 +146,23 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		tmp_inode->i_ctime =
 		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 	} else { /* legacy, OS2 and DOS style */
+/*		struct timespec ts;*/
 		FIND_FILE_STANDARD_INFO * pfindData =
 			(FIND_FILE_STANDARD_INFO *)buf;
 
+		tmp_inode->i_mtime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastWriteDate),
+				le16_to_cpu(pfindData->LastWriteTime));
+		tmp_inode->i_atime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastAccessDate),
+				le16_to_cpu(pfindData->LastAccessTime));
+		tmp_inode->i_ctime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastWriteDate),
+				le16_to_cpu(pfindData->LastWriteTime));
+		AdjustForTZ(cifs_sb->tcon, tmp_inode);
 		attr = le16_to_cpu(pfindData->Attributes);
 		allocation_size = le32_to_cpu(pfindData->AllocationSize);
 		end_of_file = le32_to_cpu(pfindData->DataSize);
-		tmp_inode->i_atime = CURRENT_TIME;
-		/* tmp_inode->i_mtime =  BB FIXME - add dos time handling
-		tmp_inode->i_ctime = 0;   BB FIXME */
-
 	}
 
 	/* Linux can not store file creation time unfortunately so ignore it */
@@ -938,6 +956,7 @@ static int cifs_save_resume_key(const char *current_entry,
 		filename = &pFindData->FileName[0];
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
 	} else {
 		cFYI(1,("Unknown findfirst level %d",level));
 		return -EINVAL;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 22b4c35dcfe3..a8a083543ba0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -268,6 +268,10 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
268 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 268 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
269 if(ses->serverOS) 269 if(ses->serverOS)
270 strncpy(ses->serverOS, bcc_ptr, len); 270 strncpy(ses->serverOS, bcc_ptr, len);
271 if(strncmp(ses->serverOS, "OS/2",4) == 0) {
272 cFYI(1,("OS/2 server"));
273 ses->flags |= CIFS_SES_OS2;
274 }
271 275
272 bcc_ptr += len + 1; 276 bcc_ptr += len + 1;
273 bleft -= len + 1; 277 bleft -= len + 1;
@@ -290,16 +294,11 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
 	if(len > bleft)
 		return rc;
 
-	if(ses->serverDomain)
-		kfree(ses->serverDomain);
-
-	ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
-	if(ses->serverOS)
-		strncpy(ses->serverOS, bcc_ptr, len);
-
-	bcc_ptr += len + 1;
-	bleft -= len + 1;
-
+	/* No domain field in LANMAN case. Domain is
+	   returned by old servers in the SMB negprot response */
+	/* BB For newer servers which do not support Unicode,
+	   but thus do return domain here we could add parsing
+	   for it later, but it is not very important */
 	cFYI(1,("ascii: bytes left %d",bleft));
 
 	return rc;
@@ -366,6 +365,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	str_area = kmalloc(2000, GFP_KERNEL);
 	bcc_ptr = str_area;
 
+	ses->flags &= ~CIFS_SES_LANMAN;
+
 	if(type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -377,7 +378,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		/* and copy into bcc */
 
 		calc_lanman_hash(ses, lnm_session_key);
-
+		ses->flags |= CIFS_SES_LANMAN;
 /* #ifdef CONFIG_CIFS_DEBUG2
 		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
 			CIFS_SESS_KEY_SIZE);
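
The two sess.c hunks above follow a clear-then-set pattern for session feature flags: CIFS_SES_LANMAN is cleared at the top of every setup attempt and set again only on the path that actually computed a LANMAN hash, so a renegotiated session cannot inherit a stale flag. A standalone sketch of the idiom; the EX_SES_* values are illustrative stand-ins, not the real CIFS_SES_* definitions from the CIFS headers:

    #define EX_SES_LANMAN 0x01  /* illustrative flags only */
    #define EX_SES_OS2    0x02

    static void example_setup(unsigned int *flags, int lanman, int os2_server)
    {
            *flags &= ~EX_SES_LANMAN;   /* never trust a previous attempt */
            if (lanman)
                    *flags |= EX_SES_LANMAN;
            if (os2_server)
                    *flags |= EX_SES_OS2;  /* later code branches on this */
    }
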
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index efaa044523a7..7a1b2b961ec8 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -364,20 +364,20 @@ E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24)
 	smbhash(p24 + 16, c8, p21 + 14, 1);
 }
 
-void
+#if 0 /* currently unused */
+static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 0);
 	smbhash(out + 8, in + 8, p14 + 7, 0);
 }
 
-void
+static void
 E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 1);
 	smbhash(out + 8, in + 8, p14 + 7, 1);
 }
-#if 0
 /* these routines are currently unneeded, but may be
    needed later */
 void
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index f518c5e45035..4b25ba92180d 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -51,11 +51,8 @@
 
 void SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-void nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]);
 static void SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 			unsigned char p24[24]);
-void NTLMSSPOWFencrypt(unsigned char passwd[8],
-			unsigned char *ntlmchalresp, unsigned char p24[24]);
 void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 
 /*
@@ -144,8 +141,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	memset(wpwd,0,129 * 2);
 }
 
+#if 0 /* currently unused */
 /* Does both the NT and LM owfs of a user's password */
-void
+static void
 nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 {
 	char passwd[514];
@@ -171,6 +169,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 	/* clear out local copy of user's password (just being paranoid). */
 	memset(passwd, '\0', sizeof (passwd));
 }
+#endif
 
 /* Does the NTLMv2 owfs of a user's password */
 #if 0 /* function not needed yet - but will be soon */
@@ -223,7 +222,8 @@ SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 }
 
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-void
+#if 0 /* currently unused */
+static void
 NTLMSSPOWFencrypt(unsigned char passwd[8],
 		unsigned char *ntlmchalresp, unsigned char p24[24])
 {
@@ -235,6 +235,7 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 
 	E_P24(p21, ntlmchalresp, p24);
 }
+#endif
 
 /* Does the NT MD4 hash then des encryption. */
 
diff --git a/fs/compat.c b/fs/compat.c
index 4d3fbcb2ddb1..50624d4a70c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1316,7 +1316,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
 		    unsigned int nr_segs, unsigned int flags)
 {
 	unsigned i;
-	struct iovec *iov;
+	struct iovec __user *iov;
 	if (nr_segs > UIO_MAXIOV)
 		return -EINVAL;
 	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
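
The one-line fix above is purely an annotation change: compat_alloc_user_space() hands back memory in the user address space, so the pointer must carry __user or sparse (make C=1) reports an address-space mismatch on every access. A minimal sketch of the rule, assuming the usual uaccess helpers:

    #include <linux/errno.h>
    #include <linux/uio.h>
    #include <asm/uaccess.h>

    /* A __user pointer may only be touched through copy_to_user() /
     * copy_from_user(); a direct dereference is a sparse error. */
    static int fill_one_iovec(struct iovec __user *uiov,
                              void __user *base, size_t len)
    {
            struct iovec kiov = { .iov_base = base, .iov_len = len };

            if (copy_to_user(uiov, &kiov, sizeof(kiov)))
                    return -EFAULT;
            return 0;
    }
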
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 27ca1aa30562..a91f2628c981 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2438,13 +2438,17 @@ HANDLE_IOCTL(0x1260, broken_blkgetsize)
 HANDLE_IOCTL(BLKFRAGET, w_long)
 HANDLE_IOCTL(BLKSECTGET, w_long)
 HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
 HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
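
Each HANDLE_IOCTL() line registers one 32-bit ioctl command together with a translation handler; the additions above fill out the HDIO_GET_* block so every HDIO query ioctl issued by a 32-bit process gets its argument repacked for a 64-bit kernel. Conceptually each entry contributes something like the following; this is a sketch of the shape, not the actual macro expansion in compat_ioctl.c:

    /* Hypothetical translation-table entry: the command a 32-bit
     * process issued, and the handler that converts its argument
     * layout before and after calling the native ioctl. */
    struct example_ioctl_trans {
            unsigned int cmd;           /* e.g. HDIO_GET_WCACHE */
            int (*handler)(unsigned int fd, unsigned int cmd,
                           unsigned long arg, struct file *file);
    };
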
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index e6d5754a715e..cf33fac68c84 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -275,13 +275,14 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * it in file->private_data for easy access.
 	 */
 	buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
-	if (buffer) {
-		init_MUTEX(&buffer->sem);
-		buffer->needs_read_fill = 1;
-		buffer->ops = ops;
-		file->private_data = buffer;
-	} else
+	if (!buffer) {
 		error = -ENOMEM;
+		goto Enomem;
+	}
+	init_MUTEX(&buffer->sem);
+	buffer->needs_read_fill = 1;
+	buffer->ops = ops;
+	file->private_data = buffer;
 	goto Done;
 
 Einval:
@@ -289,6 +290,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	goto Done;
 Eaccess:
 	error = -EACCES;
+ Enomem:
 	module_put(attr->ca_owner);
 Done:
 	if (error && item)
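
This rework moves check_perm() to the kernel's usual goto-unwind shape: bail out early to a label, let the labels fall through the shared cleanup (Enomem now hits the same module_put() as Eaccess), and keep the success path flat. The idiom in isolation, as a sketch; example_ctx is made up for the illustration:

    #include <linux/errno.h>
    #include <linux/slab.h>

    struct example_ctx {
            void *private;
    };

    static int example_open(struct example_ctx *ctx)
    {
            int error = 0;
            void *buffer = kzalloc(64, GFP_KERNEL);

            if (!buffer) {
                    error = -ENOMEM;
                    goto Enomem;
            }
            ctx->private = buffer;
            goto Done;
     Enomem:
            /* shared cleanup lives here (module_put() in the real code) */
     Done:
            return error;
    }
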
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dcache.c b/fs/dcache.c
index fc2faa44f8d1..2bac4ba1d1d3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -291,9 +291,9 @@ struct dentry * dget_locked(struct dentry *dentry)
  * it can be unhashed only if it has no children, or if it is the root
  * of a filesystem.
  *
- * If the inode has a DCACHE_DISCONNECTED alias, then prefer
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
  * any other hashed alias over that one unless @want_discon is set,
- * in which case only return a DCACHE_DISCONNECTED alias.
+ * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
  */
 
 static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
@@ -309,7 +309,8 @@ static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
 		prefetch(next);
 		alias = list_entry(tmp, struct dentry, d_alias);
 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (alias->d_flags & DCACHE_DISCONNECTED)
+			if (IS_ROOT(alias) &&
+			    (alias->d_flags & DCACHE_DISCONNECTED))
 				discon_alias = alias;
 			else if (!want_discon) {
 				__dget_locked(alias);
@@ -548,6 +549,136 @@ repeat:
 }
 
 /*
552 * destroy a single subtree of dentries for unmount
553 * - see the comments on shrink_dcache_for_umount() for a description of the
554 * locking
555 */
556static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
557{
558 struct dentry *parent;
559
560 BUG_ON(!IS_ROOT(dentry));
561
562 /* detach this root from the system */
563 spin_lock(&dcache_lock);
564 if (!list_empty(&dentry->d_lru)) {
565 dentry_stat.nr_unused--;
566 list_del_init(&dentry->d_lru);
567 }
568 __d_drop(dentry);
569 spin_unlock(&dcache_lock);
570
571 for (;;) {
572 /* descend to the first leaf in the current subtree */
573 while (!list_empty(&dentry->d_subdirs)) {
574 struct dentry *loop;
575
576 /* this is a branch with children - detach all of them
577 * from the system in one go */
578 spin_lock(&dcache_lock);
579 list_for_each_entry(loop, &dentry->d_subdirs,
580 d_u.d_child) {
581 if (!list_empty(&loop->d_lru)) {
582 dentry_stat.nr_unused--;
583 list_del_init(&loop->d_lru);
584 }
585
586 __d_drop(loop);
587 cond_resched_lock(&dcache_lock);
588 }
589 spin_unlock(&dcache_lock);
590
591 /* move to the first child */
592 dentry = list_entry(dentry->d_subdirs.next,
593 struct dentry, d_u.d_child);
594 }
595
596 /* consume the dentries from this leaf up through its parents
597 * until we find one with children or run out altogether */
598 do {
599 struct inode *inode;
600
601 if (atomic_read(&dentry->d_count) != 0) {
602 printk(KERN_ERR
603 "BUG: Dentry %p{i=%lx,n=%s}"
604 " still in use (%d)"
605 " [unmount of %s %s]\n",
606 dentry,
607 dentry->d_inode ?
608 dentry->d_inode->i_ino : 0UL,
609 dentry->d_name.name,
610 atomic_read(&dentry->d_count),
611 dentry->d_sb->s_type->name,
612 dentry->d_sb->s_id);
613 BUG();
614 }
615
616 parent = dentry->d_parent;
617 if (parent == dentry)
618 parent = NULL;
619 else
620 atomic_dec(&parent->d_count);
621
622 list_del(&dentry->d_u.d_child);
623 dentry_stat.nr_dentry--; /* For d_free, below */
624
625 inode = dentry->d_inode;
626 if (inode) {
627 dentry->d_inode = NULL;
628 list_del_init(&dentry->d_alias);
629 if (dentry->d_op && dentry->d_op->d_iput)
630 dentry->d_op->d_iput(dentry, inode);
631 else
632 iput(inode);
633 }
634
635 d_free(dentry);
636
637 /* finished when we fall off the top of the tree,
638 * otherwise we ascend to the parent and move to the
639 * next sibling if there is one */
640 if (!parent)
641 return;
642
643 dentry = parent;
644
645 } while (list_empty(&dentry->d_subdirs));
646
647 dentry = list_entry(dentry->d_subdirs.next,
648 struct dentry, d_u.d_child);
649 }
650}
651
652/*
653 * destroy the dentries attached to a superblock on unmounting
654 * - we don't need to use dentry->d_lock, and only need dcache_lock when
655 * removing the dentry from the system lists and hashes because:
656 * - the superblock is detached from all mountings and open files, so the
657 * dentry trees will not be rearranged by the VFS
658 * - s_umount is write-locked, so the memory pressure shrinker will ignore
659 * any dentries belonging to this superblock that it comes across
660 * - the filesystem itself is no longer permitted to rearrange the dentries
661 * in this superblock
662 */
663void shrink_dcache_for_umount(struct super_block *sb)
664{
665 struct dentry *dentry;
666
667 if (down_read_trylock(&sb->s_umount))
668 BUG();
669
670 dentry = sb->s_root;
671 sb->s_root = NULL;
672 atomic_dec(&dentry->d_count);
673 shrink_dcache_for_umount_subtree(dentry);
674
675 while (!hlist_empty(&sb->s_anon)) {
676 dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
677 shrink_dcache_for_umount_subtree(dentry);
678 }
679}
680
681/*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
  * list is non-empty and continue searching.
@@ -1004,7 +1135,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
 	struct dentry *new = NULL;
 
-	if (inode) {
+	if (inode && S_ISDIR(inode->i_mode)) {
 		spin_lock(&dcache_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
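
shrink_dcache_for_umount() is only safe under the three conditions listed in its comment block, and it enforces the s_umount rule itself: the down_read_trylock() check BUGs if the caller is not already holding s_umount for writing. A hedged sketch of the expected call site in the generic unmount path:

    #include <linux/fs.h>
    #include <linux/dcache.h>

    /* Sketch: the caller holds sb->s_umount for writing (the generic
     * superblock shutdown path), so no shrinker or VFS lookup can race
     * with the teardown. */
    static void example_kill_super(struct super_block *sb)
    {
            shrink_dcache_for_umount(sb); /* discards s_root + s_anon trees */
            /* ... then invalidate inodes and call the fs put_super() ... */
    }
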
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..81b2c6465eeb
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,20 @@
1menu "Distributed Lock Manager"
2 depends on INET && IP_SCTP && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 select CONFIGFS_FS
8 help
9 A general purpose distributed lock manager for kernel or userspace
10 applications.
11
12config DLM_DEBUG
13 bool "DLM debugging"
14 depends on DLM
15 help
16 Under the debugfs mount point, the name of each lockspace will
17 appear as a file in the "dlm" directory. The output is the
18	  list of resources and locks the local node knows about.
19
20endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
1obj-$(CONFIG_DLM) += dlm.o
2dlm-y := ast.o \
3 config.o \
4 dir.o \
5 lock.o \
6 lockspace.o \
7 lowcomms.o \
8 main.o \
9 member.o \
10 memory.o \
11 midcomms.o \
12 rcom.o \
13 recover.o \
14 recoverd.o \
15 requestqueue.o \
16 user.o \
17 util.o
18dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
19
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "user.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 if (lkb->lkb_flags & DLM_IFL_USER) {
38 dlm_user_add_ast(lkb, type);
39 return;
40 }
41 DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
42
43 spin_lock(&ast_queue_lock);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
45 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 }
48 lkb->lkb_ast_type |= type;
49 spin_unlock(&ast_queue_lock);
50
51 set_bit(WAKE_ASTS, &astd_wakeflags);
52 wake_up_process(astd_task);
53}
54
55static void process_asts(void)
56{
57 struct dlm_ls *ls = NULL;
58 struct dlm_rsb *r = NULL;
59 struct dlm_lkb *lkb;
60 void (*cast) (long param);
61 void (*bast) (long param, int mode);
62 int type = 0, found, bmode;
63
64 for (;;) {
65 found = 0;
66 spin_lock(&ast_queue_lock);
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
68 r = lkb->lkb_resource;
69 ls = r->res_ls;
70
71 if (dlm_locking_stopped(ls))
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81
82 if (!found)
83 break;
84
85 cast = lkb->lkb_astaddr;
86 bast = lkb->lkb_bastaddr;
87 bmode = lkb->lkb_bastmode;
88
89 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam);
91
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
98 bast(lkb->lkb_astparam, bmode);
99
100 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */
102 dlm_put_lkb(lkb);
103
104 schedule();
105 }
106}
107
108static inline int no_asts(void)
109{
110 int ret;
111
112 spin_lock(&ast_queue_lock);
113 ret = list_empty(&ast_queue);
114 spin_unlock(&ast_queue_lock);
115 return ret;
116}
117
118static int dlm_astd(void *data)
119{
120 while (!kthread_should_stop()) {
121 set_current_state(TASK_INTERRUPTIBLE);
122 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
123 schedule();
124 set_current_state(TASK_RUNNING);
125
126 mutex_lock(&astd_running);
127 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
128 process_asts();
129 mutex_unlock(&astd_running);
130 }
131 return 0;
132}
133
134void dlm_astd_wake(void)
135{
136 if (!no_asts()) {
137 set_bit(WAKE_ASTS, &astd_wakeflags);
138 wake_up_process(astd_task);
139 }
140}
141
142int dlm_astd_start(void)
143{
144 struct task_struct *p;
145 int error = 0;
146
147 INIT_LIST_HEAD(&ast_queue);
148 spin_lock_init(&ast_queue_lock);
149 mutex_init(&astd_running);
150
151 p = kthread_run(dlm_astd, NULL, "dlm_astd");
152 if (IS_ERR(p))
153 error = PTR_ERR(p);
154 else
155 astd_task = p;
156 return error;
157}
158
159void dlm_astd_stop(void)
160{
161 kthread_stop(astd_task);
162}
163
164void dlm_astd_suspend(void)
165{
166 mutex_lock(&astd_running);
167}
168
169void dlm_astd_resume(void)
170{
171 mutex_unlock(&astd_running);
172}
173
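
Since every delivered callback batch in ast.c runs under astd_running, the suspend/resume pair doubles as a barrier: once dlm_astd_suspend() returns, no completion or blocking AST can fire until dlm_astd_resume(). A sketch of how recovery code would presumably use it:

    /* Fence the AST thread across a critical section (sketch,
     * using the dlm_astd_* interface declared in ast.h). */
    static void example_recovery_fence(void)
    {
            dlm_astd_suspend();     /* waits out any in-flight batch */
            /* ... safely requeue or adjust lkbs while no ASTs run ... */
            dlm_astd_resume();
            dlm_astd_wake();        /* deliver anything queued meanwhile */
    }
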
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20#include "lowcomms.h"
21
22/*
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
24 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
25 * /config/dlm/<cluster>/comms/<comm>/nodeid
26 * /config/dlm/<cluster>/comms/<comm>/local
27 * /config/dlm/<cluster>/comms/<comm>/addr
28 * The <cluster> level is useless, but I haven't figured out how to avoid it.
29 */
30
31static struct config_group *space_list;
32static struct config_group *comm_list;
33static struct comm *local_comm;
34
35struct clusters;
36struct cluster;
37struct spaces;
38struct space;
39struct comms;
40struct comm;
41struct nodes;
42struct node;
43
44static struct config_group *make_cluster(struct config_group *, const char *);
45static void drop_cluster(struct config_group *, struct config_item *);
46static void release_cluster(struct config_item *);
47static struct config_group *make_space(struct config_group *, const char *);
48static void drop_space(struct config_group *, struct config_item *);
49static void release_space(struct config_item *);
50static struct config_item *make_comm(struct config_group *, const char *);
51static void drop_comm(struct config_group *, struct config_item *);
52static void release_comm(struct config_item *);
53static struct config_item *make_node(struct config_group *, const char *);
54static void drop_node(struct config_group *, struct config_item *);
55static void release_node(struct config_item *);
56
57static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
58 char *buf);
59static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
60 const char *buf, size_t len);
61static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
62 char *buf);
63static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
64 const char *buf, size_t len);
65
66static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
67static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
68static ssize_t comm_local_read(struct comm *cm, char *buf);
69static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
71static ssize_t node_nodeid_read(struct node *nd, char *buf);
72static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
73static ssize_t node_weight_read(struct node *nd, char *buf);
74static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
75
76enum {
77 COMM_ATTR_NODEID = 0,
78 COMM_ATTR_LOCAL,
79 COMM_ATTR_ADDR,
80};
81
82struct comm_attribute {
83 struct configfs_attribute attr;
84 ssize_t (*show)(struct comm *, char *);
85 ssize_t (*store)(struct comm *, const char *, size_t);
86};
87
88static struct comm_attribute comm_attr_nodeid = {
89 .attr = { .ca_owner = THIS_MODULE,
90 .ca_name = "nodeid",
91 .ca_mode = S_IRUGO | S_IWUSR },
92 .show = comm_nodeid_read,
93 .store = comm_nodeid_write,
94};
95
96static struct comm_attribute comm_attr_local = {
97 .attr = { .ca_owner = THIS_MODULE,
98 .ca_name = "local",
99 .ca_mode = S_IRUGO | S_IWUSR },
100 .show = comm_local_read,
101 .store = comm_local_write,
102};
103
104static struct comm_attribute comm_attr_addr = {
105 .attr = { .ca_owner = THIS_MODULE,
106 .ca_name = "addr",
107 .ca_mode = S_IRUGO | S_IWUSR },
108 .store = comm_addr_write,
109};
110
111static struct configfs_attribute *comm_attrs[] = {
112 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
113 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
114 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
115 NULL,
116};
117
118enum {
119 NODE_ATTR_NODEID = 0,
120 NODE_ATTR_WEIGHT,
121};
122
123struct node_attribute {
124 struct configfs_attribute attr;
125 ssize_t (*show)(struct node *, char *);
126 ssize_t (*store)(struct node *, const char *, size_t);
127};
128
129static struct node_attribute node_attr_nodeid = {
130 .attr = { .ca_owner = THIS_MODULE,
131 .ca_name = "nodeid",
132 .ca_mode = S_IRUGO | S_IWUSR },
133 .show = node_nodeid_read,
134 .store = node_nodeid_write,
135};
136
137static struct node_attribute node_attr_weight = {
138 .attr = { .ca_owner = THIS_MODULE,
139 .ca_name = "weight",
140 .ca_mode = S_IRUGO | S_IWUSR },
141 .show = node_weight_read,
142 .store = node_weight_write,
143};
144
145static struct configfs_attribute *node_attrs[] = {
146 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
147 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
148 NULL,
149};
150
151struct clusters {
152 struct configfs_subsystem subsys;
153};
154
155struct cluster {
156 struct config_group group;
157};
158
159struct spaces {
160 struct config_group ss_group;
161};
162
163struct space {
164 struct config_group group;
165 struct list_head members;
166 struct mutex members_lock;
167 int members_count;
168};
169
170struct comms {
171 struct config_group cs_group;
172};
173
174struct comm {
175 struct config_item item;
176 int nodeid;
177 int local;
178 int addr_count;
179 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
180};
181
182struct nodes {
183 struct config_group ns_group;
184};
185
186struct node {
187 struct config_item item;
188 struct list_head list; /* space->members */
189 int nodeid;
190 int weight;
191};
192
193static struct configfs_group_operations clusters_ops = {
194 .make_group = make_cluster,
195 .drop_item = drop_cluster,
196};
197
198static struct configfs_item_operations cluster_ops = {
199 .release = release_cluster,
200};
201
202static struct configfs_group_operations spaces_ops = {
203 .make_group = make_space,
204 .drop_item = drop_space,
205};
206
207static struct configfs_item_operations space_ops = {
208 .release = release_space,
209};
210
211static struct configfs_group_operations comms_ops = {
212 .make_item = make_comm,
213 .drop_item = drop_comm,
214};
215
216static struct configfs_item_operations comm_ops = {
217 .release = release_comm,
218 .show_attribute = show_comm,
219 .store_attribute = store_comm,
220};
221
222static struct configfs_group_operations nodes_ops = {
223 .make_item = make_node,
224 .drop_item = drop_node,
225};
226
227static struct configfs_item_operations node_ops = {
228 .release = release_node,
229 .show_attribute = show_node,
230 .store_attribute = store_node,
231};
232
233static struct config_item_type clusters_type = {
234 .ct_group_ops = &clusters_ops,
235 .ct_owner = THIS_MODULE,
236};
237
238static struct config_item_type cluster_type = {
239 .ct_item_ops = &cluster_ops,
240 .ct_owner = THIS_MODULE,
241};
242
243static struct config_item_type spaces_type = {
244 .ct_group_ops = &spaces_ops,
245 .ct_owner = THIS_MODULE,
246};
247
248static struct config_item_type space_type = {
249 .ct_item_ops = &space_ops,
250 .ct_owner = THIS_MODULE,
251};
252
253static struct config_item_type comms_type = {
254 .ct_group_ops = &comms_ops,
255 .ct_owner = THIS_MODULE,
256};
257
258static struct config_item_type comm_type = {
259 .ct_item_ops = &comm_ops,
260 .ct_attrs = comm_attrs,
261 .ct_owner = THIS_MODULE,
262};
263
264static struct config_item_type nodes_type = {
265 .ct_group_ops = &nodes_ops,
266 .ct_owner = THIS_MODULE,
267};
268
269static struct config_item_type node_type = {
270 .ct_item_ops = &node_ops,
271 .ct_attrs = node_attrs,
272 .ct_owner = THIS_MODULE,
273};
274
275static struct cluster *to_cluster(struct config_item *i)
276{
277 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
278}
279
280static struct space *to_space(struct config_item *i)
281{
282 return i ? container_of(to_config_group(i), struct space, group) : NULL;
283}
284
285static struct comm *to_comm(struct config_item *i)
286{
287 return i ? container_of(i, struct comm, item) : NULL;
288}
289
290static struct node *to_node(struct config_item *i)
291{
292 return i ? container_of(i, struct node, item) : NULL;
293}
294
295static struct config_group *make_cluster(struct config_group *g,
296 const char *name)
297{
298 struct cluster *cl = NULL;
299 struct spaces *sps = NULL;
300 struct comms *cms = NULL;
301 void *gps = NULL;
302
303 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
304 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
305 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
306 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
307
308 if (!cl || !gps || !sps || !cms)
309 goto fail;
310
311 config_group_init_type_name(&cl->group, name, &cluster_type);
312 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
313 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
314
315 cl->group.default_groups = gps;
316 cl->group.default_groups[0] = &sps->ss_group;
317 cl->group.default_groups[1] = &cms->cs_group;
318 cl->group.default_groups[2] = NULL;
319
320 space_list = &sps->ss_group;
321 comm_list = &cms->cs_group;
322 return &cl->group;
323
324 fail:
325 kfree(cl);
326 kfree(gps);
327 kfree(sps);
328 kfree(cms);
329 return NULL;
330}
331
332static void drop_cluster(struct config_group *g, struct config_item *i)
333{
334 struct cluster *cl = to_cluster(i);
335 struct config_item *tmp;
336 int j;
337
338 for (j = 0; cl->group.default_groups[j]; j++) {
339 tmp = &cl->group.default_groups[j]->cg_item;
340 cl->group.default_groups[j] = NULL;
341 config_item_put(tmp);
342 }
343
344 space_list = NULL;
345 comm_list = NULL;
346
347 config_item_put(i);
348}
349
350static void release_cluster(struct config_item *i)
351{
352 struct cluster *cl = to_cluster(i);
353 kfree(cl->group.default_groups);
354 kfree(cl);
355}
356
357static struct config_group *make_space(struct config_group *g, const char *name)
358{
359 struct space *sp = NULL;
360 struct nodes *nds = NULL;
361 void *gps = NULL;
362
363 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
364 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
365 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
366
367 if (!sp || !gps || !nds)
368 goto fail;
369
370 config_group_init_type_name(&sp->group, name, &space_type);
371 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
372
373 sp->group.default_groups = gps;
374 sp->group.default_groups[0] = &nds->ns_group;
375 sp->group.default_groups[1] = NULL;
376
377 INIT_LIST_HEAD(&sp->members);
378 mutex_init(&sp->members_lock);
379 sp->members_count = 0;
380 return &sp->group;
381
382 fail:
383 kfree(sp);
384 kfree(gps);
385 kfree(nds);
386 return NULL;
387}
388
389static void drop_space(struct config_group *g, struct config_item *i)
390{
391 struct space *sp = to_space(i);
392 struct config_item *tmp;
393 int j;
394
395 /* assert list_empty(&sp->members) */
396
397 for (j = 0; sp->group.default_groups[j]; j++) {
398 tmp = &sp->group.default_groups[j]->cg_item;
399 sp->group.default_groups[j] = NULL;
400 config_item_put(tmp);
401 }
402
403 config_item_put(i);
404}
405
406static void release_space(struct config_item *i)
407{
408 struct space *sp = to_space(i);
409 kfree(sp->group.default_groups);
410 kfree(sp);
411}
412
413static struct config_item *make_comm(struct config_group *g, const char *name)
414{
415 struct comm *cm;
416
417 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
418 if (!cm)
419 return NULL;
420
421 config_item_init_type_name(&cm->item, name, &comm_type);
422 cm->nodeid = -1;
423 cm->local = 0;
424 cm->addr_count = 0;
425 return &cm->item;
426}
427
428static void drop_comm(struct config_group *g, struct config_item *i)
429{
430 struct comm *cm = to_comm(i);
431 if (local_comm == cm)
432 local_comm = NULL;
433 dlm_lowcomms_close(cm->nodeid);
434 while (cm->addr_count--)
435 kfree(cm->addr[cm->addr_count]);
436 config_item_put(i);
437}
438
439static void release_comm(struct config_item *i)
440{
441 struct comm *cm = to_comm(i);
442 kfree(cm);
443}
444
445static struct config_item *make_node(struct config_group *g, const char *name)
446{
447 struct space *sp = to_space(g->cg_item.ci_parent);
448 struct node *nd;
449
450 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
451 if (!nd)
452 return NULL;
453
454 config_item_init_type_name(&nd->item, name, &node_type);
455 nd->nodeid = -1;
456 nd->weight = 1; /* default weight of 1 if none is set */
457
458 mutex_lock(&sp->members_lock);
459 list_add(&nd->list, &sp->members);
460 sp->members_count++;
461 mutex_unlock(&sp->members_lock);
462
463 return &nd->item;
464}
465
466static void drop_node(struct config_group *g, struct config_item *i)
467{
468 struct space *sp = to_space(g->cg_item.ci_parent);
469 struct node *nd = to_node(i);
470
471 mutex_lock(&sp->members_lock);
472 list_del(&nd->list);
473 sp->members_count--;
474 mutex_unlock(&sp->members_lock);
475
476 config_item_put(i);
477}
478
479static void release_node(struct config_item *i)
480{
481 struct node *nd = to_node(i);
482 kfree(nd);
483}
484
485static struct clusters clusters_root = {
486 .subsys = {
487 .su_group = {
488 .cg_item = {
489 .ci_namebuf = "dlm",
490 .ci_type = &clusters_type,
491 },
492 },
493 },
494};
495
496int dlm_config_init(void)
497{
498 config_group_init(&clusters_root.subsys.su_group);
499 init_MUTEX(&clusters_root.subsys.su_sem);
500 return configfs_register_subsystem(&clusters_root.subsys);
501}
502
503void dlm_config_exit(void)
504{
505 configfs_unregister_subsystem(&clusters_root.subsys);
506}
507
508/*
509 * Functions for user space to read/write attributes
510 */
511
512static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
513 char *buf)
514{
515 struct comm *cm = to_comm(i);
516 struct comm_attribute *cma =
517 container_of(a, struct comm_attribute, attr);
518 return cma->show ? cma->show(cm, buf) : 0;
519}
520
521static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
522 const char *buf, size_t len)
523{
524 struct comm *cm = to_comm(i);
525 struct comm_attribute *cma =
526 container_of(a, struct comm_attribute, attr);
527 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
528}
529
530static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
531{
532 return sprintf(buf, "%d\n", cm->nodeid);
533}
534
535static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
536{
537 cm->nodeid = simple_strtol(buf, NULL, 0);
538 return len;
539}
540
541static ssize_t comm_local_read(struct comm *cm, char *buf)
542{
543 return sprintf(buf, "%d\n", cm->local);
544}
545
546static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
547{
548 cm->local= simple_strtol(buf, NULL, 0);
549 if (cm->local && !local_comm)
550 local_comm = cm;
551 return len;
552}
553
554static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
555{
556 struct sockaddr_storage *addr;
557
558 if (len != sizeof(struct sockaddr_storage))
559 return -EINVAL;
560
561 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
562 return -ENOSPC;
563
564 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
565 if (!addr)
566 return -ENOMEM;
567
568 memcpy(addr, buf, len);
569 cm->addr[cm->addr_count++] = addr;
570 return len;
571}
572
573static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
574 char *buf)
575{
576 struct node *nd = to_node(i);
577 struct node_attribute *nda =
578 container_of(a, struct node_attribute, attr);
579 return nda->show ? nda->show(nd, buf) : 0;
580}
581
582static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
583 const char *buf, size_t len)
584{
585 struct node *nd = to_node(i);
586 struct node_attribute *nda =
587 container_of(a, struct node_attribute, attr);
588 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
589}
590
591static ssize_t node_nodeid_read(struct node *nd, char *buf)
592{
593 return sprintf(buf, "%d\n", nd->nodeid);
594}
595
596static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
597{
598 nd->nodeid = simple_strtol(buf, NULL, 0);
599 return len;
600}
601
602static ssize_t node_weight_read(struct node *nd, char *buf)
603{
604 return sprintf(buf, "%d\n", nd->weight);
605}
606
607static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
608{
609 nd->weight = simple_strtol(buf, NULL, 0);
610 return len;
611}
612
613/*
614 * Functions for the dlm to get the info that's been configured
615 */
616
617static struct space *get_space(char *name)
618{
619 if (!space_list)
620 return NULL;
621 return to_space(config_group_find_obj(space_list, name));
622}
623
624static void put_space(struct space *sp)
625{
626 config_item_put(&sp->group.cg_item);
627}
628
629static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
630{
631 struct config_item *i;
632 struct comm *cm = NULL;
633 int found = 0;
634
635 if (!comm_list)
636 return NULL;
637
638 down(&clusters_root.subsys.su_sem);
639
640 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
641 cm = to_comm(i);
642
643 if (nodeid) {
644 if (cm->nodeid != nodeid)
645 continue;
646 found = 1;
647 break;
648 } else {
649 if (!cm->addr_count ||
650 memcmp(cm->addr[0], addr, sizeof(*addr)))
651 continue;
652 found = 1;
653 break;
654 }
655 }
656 up(&clusters_root.subsys.su_sem);
657
658 if (found)
659 config_item_get(i);
660 else
661 cm = NULL;
662 return cm;
663}
664
665static void put_comm(struct comm *cm)
666{
667 config_item_put(&cm->item);
668}
669
670/* caller must free mem */
671int dlm_nodeid_list(char *lsname, int **ids_out)
672{
673 struct space *sp;
674 struct node *nd;
675 int i = 0, rv = 0;
676 int *ids;
677
678 sp = get_space(lsname);
679 if (!sp)
680 return -EEXIST;
681
682 mutex_lock(&sp->members_lock);
683 if (!sp->members_count) {
684 rv = 0;
685 goto out;
686 }
687
688 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
689 if (!ids) {
690 rv = -ENOMEM;
691 goto out;
692 }
693
694 rv = sp->members_count;
695 list_for_each_entry(nd, &sp->members, list)
696 ids[i++] = nd->nodeid;
697
698 if (rv != i)
699 printk("bad nodeid count %d %d\n", rv, i);
700
701 *ids_out = ids;
702 out:
703 mutex_unlock(&sp->members_lock);
704 put_space(sp);
705 return rv;
706}
707
708int dlm_node_weight(char *lsname, int nodeid)
709{
710 struct space *sp;
711 struct node *nd;
712 int w = -EEXIST;
713
714 sp = get_space(lsname);
715 if (!sp)
716 goto out;
717
718 mutex_lock(&sp->members_lock);
719 list_for_each_entry(nd, &sp->members, list) {
720 if (nd->nodeid != nodeid)
721 continue;
722 w = nd->weight;
723 break;
724 }
725 mutex_unlock(&sp->members_lock);
726 put_space(sp);
727 out:
728 return w;
729}
730
731int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
732{
733 struct comm *cm = get_comm(nodeid, NULL);
734 if (!cm)
735 return -EEXIST;
736 if (!cm->addr_count)
737 return -ENOENT;
738 memcpy(addr, cm->addr[0], sizeof(*addr));
739 put_comm(cm);
740 return 0;
741}
742
743int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
744{
745 struct comm *cm = get_comm(0, addr);
746 if (!cm)
747 return -EEXIST;
748 *nodeid = cm->nodeid;
749 put_comm(cm);
750 return 0;
751}
752
753int dlm_our_nodeid(void)
754{
755 return local_comm ? local_comm->nodeid : 0;
756}
757
758/* num 0 is first addr, num 1 is second addr */
759int dlm_our_addr(struct sockaddr_storage *addr, int num)
760{
761 if (!local_comm)
762 return -1;
763 if (num + 1 > local_comm->addr_count)
764 return -1;
765 memcpy(addr, local_comm->addr[num], sizeof(*addr));
766 return 0;
767}
768
769/* Config file defaults */
770#define DEFAULT_TCP_PORT 21064
771#define DEFAULT_BUFFER_SIZE 4096
772#define DEFAULT_RSBTBL_SIZE 256
773#define DEFAULT_LKBTBL_SIZE 1024
774#define DEFAULT_DIRTBL_SIZE 512
775#define DEFAULT_RECOVER_TIMER 5
776#define DEFAULT_TOSS_SECS 10
777#define DEFAULT_SCAN_SECS 5
778
779struct dlm_config_info dlm_config = {
780 .tcp_port = DEFAULT_TCP_PORT,
781 .buffer_size = DEFAULT_BUFFER_SIZE,
782 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
783 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
784 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
785 .recover_timer = DEFAULT_RECOVER_TIMER,
786 .toss_secs = DEFAULT_TOSS_SECS,
787 .scan_secs = DEFAULT_SCAN_SECS
788};
789
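
dlm_nodeid_list() above returns the member count on success and hands ownership of the ids array to the caller ("caller must free mem"). A minimal sketch of honoring that contract:

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include "config.h"

    static void example_print_members(char *lsname)
    {
            int *ids = NULL;
            int i, count;

            count = dlm_nodeid_list(lsname, &ids);
            if (count < 0)
                    return;     /* -EEXIST (no lockspace) or -ENOMEM */
            for (i = 0; i < count; i++)
                    printk("member nodeid %d\n", ids[i]);
            kfree(ids);         /* safe even when count == 0 (ids is NULL) */
    }
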
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21#define DLM_DEBUG_BUF_LEN 4096
22static char debug_buf[DLM_DEBUG_BUF_LEN];
23static struct mutex debug_buf_lock;
24
25static struct dentry *dlm_root;
26
27struct rsb_iter {
28 int entry;
29 struct dlm_ls *ls;
30 struct list_head *next;
31 struct dlm_rsb *rsb;
32};
33
34/*
35 * dump all rsb's in the lockspace hash table
36 */
37
38static char *print_lockmode(int mode)
39{
40 switch (mode) {
41 case DLM_LOCK_IV:
42 return "--";
43 case DLM_LOCK_NL:
44 return "NL";
45 case DLM_LOCK_CR:
46 return "CR";
47 case DLM_LOCK_CW:
48 return "CW";
49 case DLM_LOCK_PR:
50 return "PR";
51 case DLM_LOCK_PW:
52 return "PW";
53 case DLM_LOCK_EX:
54 return "EX";
55 default:
56 return "??";
57 }
58}
59
60static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
61 struct dlm_rsb *res)
62{
63 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
64
65 if (lkb->lkb_status == DLM_LKSTS_CONVERT
66 || lkb->lkb_status == DLM_LKSTS_WAITING)
67 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
68
69 if (lkb->lkb_nodeid) {
70 if (lkb->lkb_nodeid != res->res_nodeid)
71 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
72 lkb->lkb_remid);
73 else
74 seq_printf(s, " Master: %08x", lkb->lkb_remid);
75 }
76
77 if (lkb->lkb_wait_type)
78 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
79
80 seq_printf(s, "\n");
81}
82
83static int print_resource(struct dlm_rsb *res, struct seq_file *s)
84{
85 struct dlm_lkb *lkb;
86 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
87
88 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
89 for (i = 0; i < res->res_length; i++) {
90 if (isprint(res->res_name[i]))
91 seq_printf(s, "%c", res->res_name[i]);
92 else
93 seq_printf(s, "%c", '.');
94 }
95 if (res->res_nodeid > 0)
96 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
97 res->res_nodeid);
98 else if (res->res_nodeid == 0)
99 seq_printf(s, "\" \nMaster Copy\n");
100 else if (res->res_nodeid == -1)
101 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
102 res->res_first_lkid);
103 else
104 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
105
106 /* Print the LVB: */
107 if (res->res_lvbptr) {
108 seq_printf(s, "LVB: ");
109 for (i = 0; i < lvblen; i++) {
110 if (i == lvblen / 2)
111 seq_printf(s, "\n ");
112 seq_printf(s, "%02x ",
113 (unsigned char) res->res_lvbptr[i]);
114 }
115 if (rsb_flag(res, RSB_VALNOTVALID))
116 seq_printf(s, " (INVALID)");
117 seq_printf(s, "\n");
118 }
119
120 root_list = !list_empty(&res->res_root_list);
121 recover_list = !list_empty(&res->res_recover_list);
122
123 if (root_list || recover_list) {
124 seq_printf(s, "Recovery: root %d recover %d flags %lx "
125 "count %d\n", root_list, recover_list,
126 res->res_flags, res->res_recover_locks_count);
127 }
128
129 /* Print the locks attached to this resource */
130 seq_printf(s, "Granted Queue\n");
131 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
132 print_lock(s, lkb, res);
133
134 seq_printf(s, "Conversion Queue\n");
135 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
136 print_lock(s, lkb, res);
137
138 seq_printf(s, "Waiting Queue\n");
139 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
140 print_lock(s, lkb, res);
141
142 if (list_empty(&res->res_lookup))
143 goto out;
144
145 seq_printf(s, "Lookup Queue\n");
146 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
147 seq_printf(s, "%08x %s", lkb->lkb_id,
148 print_lockmode(lkb->lkb_rqmode));
149 if (lkb->lkb_wait_type)
150 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
151 seq_printf(s, "\n");
152 }
153 out:
154 return 0;
155}
156
157static int rsb_iter_next(struct rsb_iter *ri)
158{
159 struct dlm_ls *ls = ri->ls;
160 int i;
161
162 if (!ri->next) {
163 top:
164 /* Find the next non-empty hash bucket */
165 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
166 read_lock(&ls->ls_rsbtbl[i].lock);
167 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
168 ri->next = ls->ls_rsbtbl[i].list.next;
169 read_unlock(&ls->ls_rsbtbl[i].lock);
170 break;
171 }
172 read_unlock(&ls->ls_rsbtbl[i].lock);
173 }
174 ri->entry = i;
175
176 if (ri->entry >= ls->ls_rsbtbl_size)
177 return 1;
178 } else {
179 i = ri->entry;
180 read_lock(&ls->ls_rsbtbl[i].lock);
181 ri->next = ri->next->next;
182 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
183 /* End of list - move to next bucket */
184 ri->next = NULL;
185 ri->entry++;
186 read_unlock(&ls->ls_rsbtbl[i].lock);
187 goto top;
188 }
189 read_unlock(&ls->ls_rsbtbl[i].lock);
190 }
191 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
192
193 return 0;
194}
195
196static void rsb_iter_free(struct rsb_iter *ri)
197{
198 kfree(ri);
199}
200
201static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
202{
203 struct rsb_iter *ri;
204
205 ri = kmalloc(sizeof *ri, GFP_KERNEL);
206 if (!ri)
207 return NULL;
208
209 ri->ls = ls;
210 ri->entry = 0;
211 ri->next = NULL;
212
213 if (rsb_iter_next(ri)) {
214 rsb_iter_free(ri);
215 return NULL;
216 }
217
218 return ri;
219}
220
221static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
222{
223 struct rsb_iter *ri;
224 loff_t n = *pos;
225
226 ri = rsb_iter_init(file->private);
227 if (!ri)
228 return NULL;
229
230 while (n--) {
231 if (rsb_iter_next(ri)) {
232 rsb_iter_free(ri);
233 return NULL;
234 }
235 }
236
237 return ri;
238}
239
240static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
241{
242 struct rsb_iter *ri = iter_ptr;
243
244 (*pos)++;
245
246 if (rsb_iter_next(ri)) {
247 rsb_iter_free(ri);
248 return NULL;
249 }
250
251 return ri;
252}
253
254static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
255{
256 /* nothing for now */
257}
258
259static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
260{
261 struct rsb_iter *ri = iter_ptr;
262
263 print_resource(ri->rsb, file);
264
265 return 0;
266}
267
268static struct seq_operations rsb_seq_ops = {
269 .start = rsb_seq_start,
270 .next = rsb_seq_next,
271 .stop = rsb_seq_stop,
272 .show = rsb_seq_show,
273};
274
275static int rsb_open(struct inode *inode, struct file *file)
276{
277 struct seq_file *seq;
278 int ret;
279
280 ret = seq_open(file, &rsb_seq_ops);
281 if (ret)
282 return ret;
283
284 seq = file->private_data;
285 seq->private = inode->i_private;
286
287 return 0;
288}
289
290static struct file_operations rsb_fops = {
291 .owner = THIS_MODULE,
292 .open = rsb_open,
293 .read = seq_read,
294 .llseek = seq_lseek,
295 .release = seq_release
296};
297
298/*
299 * dump lkb's on the ls_waiters list
300 */
301
302static int waiters_open(struct inode *inode, struct file *file)
303{
304 file->private_data = inode->i_private;
305 return 0;
306}
307
308static ssize_t waiters_read(struct file *file, char __user *userbuf,
309 size_t count, loff_t *ppos)
310{
311 struct dlm_ls *ls = file->private_data;
312 struct dlm_lkb *lkb;
313 size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
314
315 mutex_lock(&debug_buf_lock);
316 mutex_lock(&ls->ls_waiters_mutex);
317 memset(debug_buf, 0, sizeof(debug_buf));
318
319 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
320 ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
321 lkb->lkb_id, lkb->lkb_wait_type,
322 lkb->lkb_nodeid, lkb->lkb_resource->res_name);
323 if (ret >= len - pos)
324 break;
325 pos += ret;
326 }
327 mutex_unlock(&ls->ls_waiters_mutex);
328
329 rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
330 mutex_unlock(&debug_buf_lock);
331 return rv;
332}
333
334static struct file_operations waiters_fops = {
335 .owner = THIS_MODULE,
336 .open = waiters_open,
337 .read = waiters_read
338};
339
340int dlm_create_debug_file(struct dlm_ls *ls)
341{
342 char name[DLM_LOCKSPACE_LEN+8];
343
344 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
345 S_IFREG | S_IRUGO,
346 dlm_root,
347 ls,
348 &rsb_fops);
349 if (!ls->ls_debug_rsb_dentry)
350 return -ENOMEM;
351
352 memset(name, 0, sizeof(name));
353 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
354
355 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
356 S_IFREG | S_IRUGO,
357 dlm_root,
358 ls,
359 &waiters_fops);
360 if (!ls->ls_debug_waiters_dentry) {
361 debugfs_remove(ls->ls_debug_rsb_dentry);
362 return -ENOMEM;
363 }
364
365 return 0;
366}
367
368void dlm_delete_debug_file(struct dlm_ls *ls)
369{
370 if (ls->ls_debug_rsb_dentry)
371 debugfs_remove(ls->ls_debug_rsb_dentry);
372 if (ls->ls_debug_waiters_dentry)
373 debugfs_remove(ls->ls_debug_waiters_dentry);
374}
375
376int dlm_register_debugfs(void)
377{
378 mutex_init(&debug_buf_lock);
379 dlm_root = debugfs_create_dir("dlm", NULL);
380 return dlm_root ? 0 : -ENOMEM;
381}
382
383void dlm_unregister_debugfs(void)
384{
385 debugfs_remove(dlm_root);
386}
387
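
The debugfs pieces above wire together in two stages: dlm_register_debugfs() creates the shared "dlm" directory once at module init, then each lockspace adds a <name> file (the seq_file resource dump) and a <name>_waiters file beside it. A sketch of the assumed lifecycle:

    /* Sketch: module init registers the directory; per-lockspace
     * setup and teardown use the create/delete pair shown above. */
    static int example_module_init(void)
    {
            int error = dlm_register_debugfs();
            if (error)
                    return error;
            return 0;
    }

    /* per lockspace, after it is set up:
     *     dlm_create_debug_file(ls);  -> <debugfs>/dlm/<ls_name>
     *                                    <debugfs>/dlm/<ls_name>_waiters
     * and dlm_delete_debug_file(ls) before the lockspace is freed,
     * with dlm_unregister_debugfs() at module exit. */
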
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb, printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
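/* A worked example of the selection above (hypothetical values, not
   from a real lockspace): with hash 0x8001beef and a node array of
   total weight 4, the index is (0x8001beef >> 16) % 4 = 0x8001 % 4 = 1,
   so ls_node_array[1] is the directory node for every resource name
   hashing to that value. */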
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
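/* The name block parsed above has this layout on the wire (lengths
   big-endian, as produced by dlm_copy_master_names() below):

     uint16_t len1, char name1[len1], uint16_t len2, char name2[len2],
     ... terminated by 0x0000 (end of this buffer; request the next one
     with last_name/last_len) or 0xFFFF (no more names from this node). */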
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
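/* Note the pattern in get_entry() above: the bucket lock is dropped
   around the blocking allocate_direntry() call, so the bucket must be
   searched again once the lock is retaken -- another thread may have
   added the same name in the meantime, in which case the new entry is
   freed and the existing one is used. */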
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
93
94#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
95
96
97struct dlm_direntry {
98 struct list_head list;
99 uint32_t master_nodeid;
100 uint16_t length;
101 char name[1];
102};
103
104struct dlm_dirtable {
105 struct list_head list;
106 rwlock_t lock;
107};
108
109struct dlm_rsbtable {
110 struct list_head list;
111 struct list_head toss;
112 rwlock_t lock;
113};
114
115struct dlm_lkbtable {
116 struct list_head list;
117 rwlock_t lock;
118 uint16_t counter;
119};
120
121/*
122 * Lockspace member (per node in a ls)
123 */
124
125struct dlm_member {
126 struct list_head list;
127 int nodeid;
128 int weight;
129};
130
131/*
132 * Save and manage recovery state for a lockspace.
133 */
134
135struct dlm_recover {
136 struct list_head list;
137 int *nodeids;
138 int node_count;
139 uint64_t seq;
140};
141
142/*
143 * Pass input args to second stage locking function.
144 */
145
146struct dlm_args {
147 uint32_t flags;
148 void *astaddr;
149 long astparam;
150 void *bastaddr;
151 int mode;
152 struct dlm_lksb *lksb;
153};
154
155
156/*
157 * Lock block
158 *
159 * A lock can be one of three types:
160 *
161 * local copy lock is mastered locally
162 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
163 * process copy lock is mastered on a remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
165 * master copy master node's copy of a lock owned by remote node
166 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
167 *
168 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
169 * dlm_unlock. The dlm does not modify these or use any private flags in
170 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
171 * are sent as-is to the remote master when the lock is remote.
172 *
173 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
174 * Some internal flags are shared between the master and process nodes;
175 * these shared flags are kept in the lower two bytes. One of these
176 * flags set on the master copy will be propagated to the process copy
177 * and v.v. Other internal flags are private to the master or process
178 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
179 *
180 * lkb_sbflags: status block flags. These flags are copied directly into
181 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
182 * ast. All defined in dlm.h with DLM_SBF_ prefix.
183 *
184 * lkb_status: the lock status indicates which rsb queue the lock is
185 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
186 *
187 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
188 * reply is needed. Only set when the lkb is on the lockspace waiters
189 * list awaiting a reply from a remote node.
190 *
191 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
192 * is a master copy, nodeid specifies the remote lock holder, when the
193 * lkb is a process copy, the nodeid specifies the lock master.
194 */
195
196/* lkb_ast_type */
197
198#define AST_COMP 1
199#define AST_BAST 2
200
201/* lkb_status */
202
203#define DLM_LKSTS_WAITING 1
204#define DLM_LKSTS_GRANTED 2
205#define DLM_LKSTS_CONVERT 3
206
207/* lkb_flags */
208
209#define DLM_IFL_MSTCPY 0x00010000
210#define DLM_IFL_RESEND 0x00020000
211#define DLM_IFL_DEAD 0x00040000
212#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002
214
215struct dlm_lkb {
216 struct dlm_rsb *lkb_resource; /* the rsb */
217 struct kref lkb_ref;
218 int lkb_nodeid; /* copied from rsb */
219 int lkb_ownpid; /* pid of lock owner */
220 uint32_t lkb_id; /* our lock ID */
221 uint32_t lkb_remid; /* lock ID on remote partner */
222 uint32_t lkb_exflags; /* external flags from caller */
223 uint32_t lkb_sbflags; /* lksb flags */
224 uint32_t lkb_flags; /* internal flags */
225 uint32_t lkb_lvbseq; /* lvb sequence number */
226
227 int8_t lkb_status; /* granted, waiting, convert */
228 int8_t lkb_rqmode; /* requested lock mode */
229 int8_t lkb_grmode; /* granted lock mode */
230 int8_t lkb_bastmode; /* requested mode */
231 int8_t lkb_highbast; /* highest mode bast sent for */
232
233 int8_t lkb_wait_type; /* type of reply waiting for */
234 int8_t lkb_ast_type; /* type of ast queued for */
235
236 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
237 struct list_head lkb_statequeue; /* rsb g/c/w list */
238 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
239 struct list_head lkb_wait_reply; /* waiting for remote reply */
240 struct list_head lkb_astqueue; /* need ast to be sent */
241 struct list_head lkb_ownqueue; /* list of locks for a process */
242
243 char *lkb_lvbptr;
244 struct dlm_lksb *lkb_lksb; /* caller's status block */
245 void *lkb_astaddr; /* caller's ast function */
246 void *lkb_bastaddr; /* caller's bast function */
247 long lkb_astparam; /* caller's ast arg */
248};
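/* A minimal sketch (illustrative, not part of the dlm API) of
   classifying an lkb into the three lock types documented above: */

static inline const char *lkb_type_name(struct dlm_lkb *lkb)
{
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		return "master copy";	/* master's copy of a remote holder's lock */
	if (lkb->lkb_nodeid)
		return "process copy";	/* our lock, mastered on a remote node */
	return "local copy";		/* our lock, mastered locally */
}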
249
250
251struct dlm_rsb {
252 struct dlm_ls *res_ls; /* the lockspace */
253 struct kref res_ref;
254 struct mutex res_mutex;
255 unsigned long res_flags;
256 int res_length; /* length of rsb name */
257 int res_nodeid;
258 uint32_t res_lvbseq;
259 uint32_t res_hash;
260 uint32_t res_bucket; /* rsbtbl */
261 unsigned long res_toss_time;
262 uint32_t res_first_lkid;
263 struct list_head res_lookup; /* lkbs waiting on first */
264 struct list_head res_hashchain; /* rsbtbl */
265 struct list_head res_grantqueue;
266 struct list_head res_convertqueue;
267 struct list_head res_waitqueue;
268
269 struct list_head res_root_list; /* used for recovery */
270 struct list_head res_recover_list; /* used for recovery */
271 int res_recover_locks_count;
272
273 char *res_lvbptr;
274 char res_name[1];
275};
276
277/* find_rsb() flags */
278
279#define R_MASTER 1 /* only return rsb if it's a master */
280#define R_CREATE 2 /* create/add rsb if not found */
281
282/* rsb_flags */
283
284enum rsb_flags {
285 RSB_MASTER_UNCERTAIN,
286 RSB_VALNOTVALID,
287 RSB_VALNOTVALID_PREV,
288 RSB_NEW_MASTER,
289 RSB_NEW_MASTER2,
290 RSB_RECOVER_CONVERT,
291 RSB_LOCKS_PURGED,
292};
293
294static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
295{
296 __set_bit(flag, &r->res_flags);
297}
298
299static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
300{
301 __clear_bit(flag, &r->res_flags);
302}
303
304static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
305{
306 return test_bit(flag, &r->res_flags);
307}
308
309
310/* dlm_header is first element of all structs sent between nodes */
311
312#define DLM_HEADER_MAJOR 0x00020000
313#define DLM_HEADER_MINOR 0x00000001
314
315#define DLM_MSG 1
316#define DLM_RCOM 2
317
318struct dlm_header {
319 uint32_t h_version;
320 uint32_t h_lockspace;
321 uint32_t h_nodeid; /* nodeid of sender */
322 uint16_t h_length;
323 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
324 uint8_t h_pad;
325};
326
327
328#define DLM_MSG_REQUEST 1
329#define DLM_MSG_CONVERT 2
330#define DLM_MSG_UNLOCK 3
331#define DLM_MSG_CANCEL 4
332#define DLM_MSG_REQUEST_REPLY 5
333#define DLM_MSG_CONVERT_REPLY 6
334#define DLM_MSG_UNLOCK_REPLY 7
335#define DLM_MSG_CANCEL_REPLY 8
336#define DLM_MSG_GRANT 9
337#define DLM_MSG_BAST 10
338#define DLM_MSG_LOOKUP 11
339#define DLM_MSG_REMOVE 12
340#define DLM_MSG_LOOKUP_REPLY 13
341
342struct dlm_message {
343 struct dlm_header m_header;
344 uint32_t m_type; /* DLM_MSG_ */
345 uint32_t m_nodeid;
346 uint32_t m_pid;
347 uint32_t m_lkid; /* lkid on sender */
348 uint32_t m_remid; /* lkid on receiver */
349 uint32_t m_parent_lkid;
350 uint32_t m_parent_remid;
351 uint32_t m_exflags;
352 uint32_t m_sbflags;
353 uint32_t m_flags;
354 uint32_t m_lvbseq;
355 uint32_t m_hash;
356 int m_status;
357 int m_grmode;
358 int m_rqmode;
359 int m_bastmode;
360 int m_asts;
361 int m_result; /* 0 or -EXXX */
362 char m_extra[0]; /* name or lvb */
363};
364
365
366#define DLM_RS_NODES 0x00000001
367#define DLM_RS_NODES_ALL 0x00000002
368#define DLM_RS_DIR 0x00000004
369#define DLM_RS_DIR_ALL 0x00000008
370#define DLM_RS_LOCKS 0x00000010
371#define DLM_RS_LOCKS_ALL 0x00000020
372#define DLM_RS_DONE 0x00000040
373#define DLM_RS_DONE_ALL 0x00000080
374
375#define DLM_RCOM_STATUS 1
376#define DLM_RCOM_NAMES 2
377#define DLM_RCOM_LOOKUP 3
378#define DLM_RCOM_LOCK 4
379#define DLM_RCOM_STATUS_REPLY 5
380#define DLM_RCOM_NAMES_REPLY 6
381#define DLM_RCOM_LOOKUP_REPLY 7
382#define DLM_RCOM_LOCK_REPLY 8
383
384struct dlm_rcom {
385 struct dlm_header rc_header;
386 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */
389 char rc_buf[0];
390};
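/* The directory recovery loop in dir.c drives DLM_RCOM_NAMES requests
   via dlm_rcom_names(); the reply's rc_buf carries the big-endian
   namelen/name block that dlm_recover_directory() parses. */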
391
392struct rcom_config {
393 uint32_t rf_lvblen;
394 uint32_t rf_lsflags;
395 uint64_t rf_unused;
396};
397
398struct rcom_lock {
399 uint32_t rl_ownpid;
400 uint32_t rl_lkid;
401 uint32_t rl_remid;
402 uint32_t rl_parent_lkid;
403 uint32_t rl_parent_remid;
404 uint32_t rl_exflags;
405 uint32_t rl_flags;
406 uint32_t rl_lvbseq;
407 int rl_result;
408 int8_t rl_rqmode;
409 int8_t rl_grmode;
410 int8_t rl_status;
411 int8_t rl_asts;
412 uint16_t rl_wait_type;
413 uint16_t rl_namelen;
414 char rl_name[DLM_RESNAME_MAXLEN];
415 char rl_lvb[0];
416};
417
418struct dlm_ls {
419 struct list_head ls_list; /* list of lockspaces */
420 dlm_lockspace_t *ls_local_handle;
421 uint32_t ls_global_id; /* global unique lockspace ID */
422 uint32_t ls_exflags;
423 int ls_lvblen;
424 int ls_count; /* reference count */
425 unsigned long ls_flags; /* LSFL_ */
426 struct kobject ls_kobj;
427
428 struct dlm_rsbtable *ls_rsbtbl;
429 uint32_t ls_rsbtbl_size;
430
431 struct dlm_lkbtable *ls_lkbtbl;
432 uint32_t ls_lkbtbl_size;
433
434 struct dlm_dirtable *ls_dirtbl;
435 uint32_t ls_dirtbl_size;
436
437 struct mutex ls_waiters_mutex;
438 struct list_head ls_waiters; /* lkbs needing a reply */
439
440 struct list_head ls_nodes; /* current nodes in ls */
441 struct list_head ls_nodes_gone; /* dead node list, recovery */
442 int ls_num_nodes; /* number of nodes in ls */
443 int ls_low_nodeid;
444 int ls_total_weight;
445 int *ls_node_array;
446
447 struct dlm_rsb ls_stub_rsb; /* for returning errors */
448 struct dlm_lkb ls_stub_lkb; /* for returning errors */
449 struct dlm_message ls_stub_ms; /* for faking a reply */
450
451 struct dentry *ls_debug_rsb_dentry; /* debugfs */
452 struct dentry *ls_debug_waiters_dentry; /* debugfs */
453
454 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
455 int ls_uevent_result;
456
457 struct miscdevice ls_device;
458
459 /* recovery related */
460
461 struct timer_list ls_timer;
462 struct task_struct *ls_recoverd_task;
463 struct mutex ls_recoverd_active;
464 spinlock_t ls_recover_lock;
465 uint32_t ls_recover_status; /* DLM_RS_ */
466 uint64_t ls_recover_seq;
467 struct dlm_recover *ls_recover_args;
468 struct rw_semaphore ls_in_recovery; /* block local requests */
469 struct list_head ls_requestqueue;/* queue remote requests */
470 struct mutex ls_requestqueue_mutex;
471 char *ls_recover_buf;
472 int ls_recover_nodeid; /* for debugging */
473 uint64_t ls_rcom_seq;
474 struct list_head ls_recover_list;
475 spinlock_t ls_recover_list_lock;
476 int ls_recover_list_count;
477 wait_queue_head_t ls_wait_general;
478 struct mutex ls_clear_proc_locks;
479
480 struct list_head ls_root_list; /* root resources */
481 struct rw_semaphore ls_root_sem; /* protect root_list */
482
483 int ls_namelen;
484 char ls_name[1];
485};
486
487#define LSFL_WORK 0
488#define LSFL_RUNNING 1
489#define LSFL_RECOVERY_STOP 2
490#define LSFL_RCOM_READY 3
491#define LSFL_UEVENT_WAIT 4
492
493/* much of this is just saving user space pointers associated with the
494 lock that we pass back to the user lib with an ast */
495
496struct dlm_user_args {
497 struct dlm_user_proc *proc; /* each process that opens the lockspace
498 device has private data
499 (dlm_user_proc) on the struct file,
500 the process's locks point back to it */
501 struct dlm_lksb lksb;
502 int old_mode;
503 int update_user_lvb;
504 struct dlm_lksb __user *user_lksb;
505 void __user *castparam;
506 void __user *castaddr;
507 void __user *bastparam;
508 void __user *bastaddr;
509};
510
511#define DLM_PROC_FLAGS_CLOSING 1
512#define DLM_PROC_FLAGS_COMPAT 2
513
514/* locks list is kept so we can remove all a process's locks when it
515 exits (or orphan those that are persistent) */
516
517struct dlm_user_proc {
518 dlm_lockspace_t *lockspace;
519 unsigned long flags; /* DLM_PROC_FLAGS */
520 struct list_head asts;
521 spinlock_t asts_spin;
522 struct list_head locks;
523 spinlock_t locks_spin;
524 wait_queue_head_t wait;
525};
526
527static inline int dlm_locking_stopped(struct dlm_ls *ls)
528{
529 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
530}
531
532static inline int dlm_recovery_stopped(struct dlm_ls *ls)
533{
534 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
535}
536
537static inline int dlm_no_directory(struct dlm_ls *ls)
538{
539 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
540}
541
542#endif /* __DLM_INTERNAL_DOT_H__ */
543
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
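/* So a request that must be mastered remotely traces as
     dlm_lock() -> request_lock() -> _request_lock() -> send_request()
   on the local node, then
     receive_request() -> do_request() -> send_request_reply()
   on the master, with receive_request_reply() completing the sequence
   back on the local node. */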
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89/*
90 * Lock compatibility matrix - thanks Steve
91 * UN = Unlocked state. Not really a state, used as a flag
92 * PD = Padding. Used to make the matrix a nice power of two in size
93 * Other states are the same as the VMS DLM.
94 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
95 */
96
97static const int __dlm_compat_matrix[8][8] = {
98 /* UN NL CR CW PR PW EX PD */
99 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
100 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
101 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
102 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
103 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
104 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
105 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
106 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
107};
108
109/*
110 * This defines the direction of transfer of LVB data.
111 * Granted mode is the row; requested mode is the column.
112 * Usage: matrix[grmode+1][rqmode+1]
113 * 1 = LVB is returned to the caller
114 * 0 = LVB is written to the resource
115 * -1 = nothing happens to the LVB
116 */
117
118const int dlm_lvb_operations[8][8] = {
119 /* UN NL CR CW PR PW EX PD*/
120 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
121 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
122 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
123 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
124 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
125 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
126 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
127 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
128};
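/* Reading the table above: a PR -> EX conversion (row PR, column EX)
   gives 1, so the resource's LVB is copied back to the caller; an
   EX -> NL conversion (row EX, column NL) gives 0, so the caller's LVB
   is written into the resource. */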
129
130#define modes_compat(gr, rq) \
131 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
132
133int dlm_modes_compat(int mode1, int mode2)
134{
135 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
136}
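/* For example, from the compatibility table: a granted PR lock is
   compatible with a new CR or PR request but not with PW or EX, so
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_CR) returns 1 while
   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PW) returns 0. */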
137
138/*
139 * Compatibility matrix for conversions with QUECVT set.
140 * Granted mode is the row; requested mode is the column.
141 * Usage: matrix[grmode+1][rqmode+1]
142 */
143
144static const int __quecvt_compat_matrix[8][8] = {
145 /* UN NL CR CW PR PW EX PD */
146 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
147 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
148 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
149 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
150 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
151 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
152 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
153 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
154};
155
156void dlm_print_lkb(struct dlm_lkb *lkb)
157{
158 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
159 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
160 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
161 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
162 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
163}
164
165void dlm_print_rsb(struct dlm_rsb *r)
166{
167 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
168 r->res_nodeid, r->res_flags, r->res_first_lkid,
169 r->res_recover_locks_count, r->res_name);
170}
171
172void dlm_dump_rsb(struct dlm_rsb *r)
173{
174 struct dlm_lkb *lkb;
175
176 dlm_print_rsb(r);
177
178 printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
179 list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
180 printk(KERN_ERR "rsb lookup list\n");
181 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
182 dlm_print_lkb(lkb);
183 printk(KERN_ERR "rsb grant queue:\n");
184 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
185 dlm_print_lkb(lkb);
186 printk(KERN_ERR "rsb convert queue:\n");
187 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
188 dlm_print_lkb(lkb);
189 printk(KERN_ERR "rsb wait queue:\n");
190 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
191 dlm_print_lkb(lkb);
192}
193
194/* Threads cannot use the lockspace while it's being recovered */
195
196static inline void lock_recovery(struct dlm_ls *ls)
197{
198 down_read(&ls->ls_in_recovery);
199}
200
201static inline void unlock_recovery(struct dlm_ls *ls)
202{
203 up_read(&ls->ls_in_recovery);
204}
205
206static inline int lock_recovery_try(struct dlm_ls *ls)
207{
208 return down_read_trylock(&ls->ls_in_recovery);
209}
210
211static inline int can_be_queued(struct dlm_lkb *lkb)
212{
213 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
214}
215
216static inline int force_blocking_asts(struct dlm_lkb *lkb)
217{
218 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
219}
220
221static inline int is_demoted(struct dlm_lkb *lkb)
222{
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224}
225
226static inline int is_remote(struct dlm_rsb *r)
227{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
229 return !!r->res_nodeid;
230}
231
232static inline int is_process_copy(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
235}
236
237static inline int is_master_copy(struct dlm_lkb *lkb)
238{
239 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
240 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
241 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
242}
243
244static inline int middle_conversion(struct dlm_lkb *lkb)
245{
246 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
247 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
248 return 1;
249 return 0;
250}
251
252static inline int down_conversion(struct dlm_lkb *lkb)
253{
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255}
256
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{
259 if (is_master_copy(lkb))
260 return;
261
262 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
263
264 lkb->lkb_lksb->sb_status = rv;
265 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
266
267 dlm_add_ast(lkb, AST_COMP);
268}
269
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{
272 if (is_master_copy(lkb))
273 send_bast(r, lkb, rqmode);
274 else {
275 lkb->lkb_bastmode = rqmode;
276 dlm_add_ast(lkb, AST_BAST);
277 }
278}
279
280/*
281 * Basic operations on rsb's and lkb's
282 */
283
284static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
285{
286 struct dlm_rsb *r;
287
288 r = allocate_rsb(ls, len);
289 if (!r)
290 return NULL;
291
292 r->res_ls = ls;
293 r->res_length = len;
294 memcpy(r->res_name, name, len);
295 mutex_init(&r->res_mutex);
296
297 INIT_LIST_HEAD(&r->res_lookup);
298 INIT_LIST_HEAD(&r->res_grantqueue);
299 INIT_LIST_HEAD(&r->res_convertqueue);
300 INIT_LIST_HEAD(&r->res_waitqueue);
301 INIT_LIST_HEAD(&r->res_root_list);
302 INIT_LIST_HEAD(&r->res_recover_list);
303
304 return r;
305}
306
307static int search_rsb_list(struct list_head *head, char *name, int len,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error = 0;
312
313 list_for_each_entry(r, head, res_hashchain) {
314 if (len == r->res_length && !memcmp(name, r->res_name, len))
315 goto found;
316 }
317 return -EBADR;
318
319 found:
320 if (r->res_nodeid && (flags & R_MASTER))
321 error = -ENOTBLK;
322 *r_ret = r;
323 return error;
324}
325
326static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
327 unsigned int flags, struct dlm_rsb **r_ret)
328{
329 struct dlm_rsb *r;
330 int error;
331
332 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
333 if (!error) {
334 kref_get(&r->res_ref);
335 goto out;
336 }
337 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
338 if (error)
339 goto out;
340
341 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
342
343 if (dlm_no_directory(ls))
344 goto out;
345
346 if (r->res_nodeid == -1) {
347 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
348 r->res_first_lkid = 0;
349 } else if (r->res_nodeid > 0) {
350 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
351 r->res_first_lkid = 0;
352 } else {
353 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
354 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
355 }
356 out:
357 *r_ret = r;
358 return error;
359}
360
361static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
362 unsigned int flags, struct dlm_rsb **r_ret)
363{
364 int error;
365 write_lock(&ls->ls_rsbtbl[b].lock);
366 error = _search_rsb(ls, name, len, b, flags, r_ret);
367 write_unlock(&ls->ls_rsbtbl[b].lock);
368 return error;
369}
370
371/*
372 * Find rsb in rsbtbl and potentially create/add one
373 *
374 * Delaying the release of rsb's has a similar benefit to applications keeping
375 * NL locks on an rsb, but without the guarantee that the cached master value
376 * will still be valid when the rsb is reused. Apps aren't always smart enough
377 * to keep NL locks on an rsb that they may lock again shortly; this can lead
378 * to excessive master lookups and removals if we don't delay the release.
379 *
380 * Searching for an rsb means looking through both the normal list and toss
381 * list. When found on the toss list the rsb is moved to the normal list with
382 * ref count of 1; when found on normal list the ref count is incremented.
383 */
384
385static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
386 unsigned int flags, struct dlm_rsb **r_ret)
387{
388 struct dlm_rsb *r, *tmp;
389 uint32_t hash, bucket;
390 int error = 0;
391
392 if (dlm_no_directory(ls))
393 flags |= R_CREATE;
394
395 hash = jhash(name, namelen, 0);
396 bucket = hash & (ls->ls_rsbtbl_size - 1);
397
398 error = search_rsb(ls, name, namelen, bucket, flags, &r);
399 if (!error)
400 goto out;
401
402 if (error == -EBADR && !(flags & R_CREATE))
403 goto out;
404
405 /* the rsb was found but wasn't a master copy */
406 if (error == -ENOTBLK)
407 goto out;
408
409 error = -ENOMEM;
410 r = create_rsb(ls, name, namelen);
411 if (!r)
412 goto out;
413
414 r->res_hash = hash;
415 r->res_bucket = bucket;
416 r->res_nodeid = -1;
417 kref_init(&r->res_ref);
418
419 /* With no directory, the master can be set immediately */
420 if (dlm_no_directory(ls)) {
421 int nodeid = dlm_dir_nodeid(r);
422 if (nodeid == dlm_our_nodeid())
423 nodeid = 0;
424 r->res_nodeid = nodeid;
425 }
426
427 write_lock(&ls->ls_rsbtbl[bucket].lock);
428 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
429 if (!error) {
430 write_unlock(&ls->ls_rsbtbl[bucket].lock);
431 free_rsb(r);
432 r = tmp;
433 goto out;
434 }
435 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
436 write_unlock(&ls->ls_rsbtbl[bucket].lock);
437 error = 0;
438 out:
439 *r_ret = r;
440 return error;
441}
442
443int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
444 unsigned int flags, struct dlm_rsb **r_ret)
445{
446 return find_rsb(ls, name, namelen, flags, r_ret);
447}
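/* Callers pair dlm_find_rsb() with dlm_put_rsb(): the reference taken
   in _search_rsb() (or by kref_init() for a newly created rsb) is what
   keeps the rsb off the toss list while it's in use.  See
   dlm_copy_master_names() in dir.c for an example. */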
448
449/* This is only called to add a reference when the code already holds
450 a valid reference to the rsb, so there's no need for locking. */
451
452static inline void hold_rsb(struct dlm_rsb *r)
453{
454 kref_get(&r->res_ref);
455}
456
457void dlm_hold_rsb(struct dlm_rsb *r)
458{
459 hold_rsb(r);
460}
461
462static void toss_rsb(struct kref *kref)
463{
464 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
465 struct dlm_ls *ls = r->res_ls;
466
467 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
468 kref_init(&r->res_ref);
469 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
470 r->res_toss_time = jiffies;
471 if (r->res_lvbptr) {
472 free_lvb(r->res_lvbptr);
473 r->res_lvbptr = NULL;
474 }
475}
476
477/* When all references to the rsb are gone it's transferred to
478 the tossed list for later disposal. */
479
480static void put_rsb(struct dlm_rsb *r)
481{
482 struct dlm_ls *ls = r->res_ls;
483 uint32_t bucket = r->res_bucket;
484
485 write_lock(&ls->ls_rsbtbl[bucket].lock);
486 kref_put(&r->res_ref, toss_rsb);
487 write_unlock(&ls->ls_rsbtbl[bucket].lock);
488}
489
490void dlm_put_rsb(struct dlm_rsb *r)
491{
492 put_rsb(r);
493}
494
495/* See comment for unhold_lkb */
496
497static void unhold_rsb(struct dlm_rsb *r)
498{
499 int rv;
500 rv = kref_put(&r->res_ref, toss_rsb);
501 DLM_ASSERT(!rv, dlm_dump_rsb(r););
502}
503
504static void kill_rsb(struct kref *kref)
505{
506 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
507
508 /* All work is done after the return from kref_put() so we
509 can release the write_lock before the remove and free. */
510
511 DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
512 DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
513 DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
514 DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
515 DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
516 DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
517}
518
519/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
520 The rsb must exist as long as any lkb's for it do. */
521
522static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
523{
524 hold_rsb(r);
525 lkb->lkb_resource = r;
526}
527
528static void detach_lkb(struct dlm_lkb *lkb)
529{
530 if (lkb->lkb_resource) {
531 put_rsb(lkb->lkb_resource);
532 lkb->lkb_resource = NULL;
533 }
534}
535
536static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
537{
538 struct dlm_lkb *lkb, *tmp;
539 uint32_t lkid = 0;
540 uint16_t bucket;
541
542 lkb = allocate_lkb(ls);
543 if (!lkb)
544 return -ENOMEM;
545
546 lkb->lkb_nodeid = -1;
547 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
550
551 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1);
553
554 write_lock(&ls->ls_lkbtbl[bucket].lock);
555
556 /* counter can roll over so we must verify lkid is not in use */
557
558 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
560
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) {
563 if (tmp->lkb_id != lkid)
564 continue;
565 lkid = 0;
566 break;
567 }
568 }
569
570 lkb->lkb_id = lkid;
571 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
572 write_unlock(&ls->ls_lkbtbl[bucket].lock);
573
574 *lkb_ret = lkb;
575 return 0;
576}
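/* An lkid is thus <16-bit per-bucket counter><16-bit bucket>, so
   __find_lkb() below can recover the bucket with (lkid & 0xFFFF) and
   search only that hash chain. */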
577
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb;
582
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid)
585 return lkb;
586 }
587 return NULL;
588}
589
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{
592 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF;
594
595 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT;
597
598 read_lock(&ls->ls_lkbtbl[bucket].lock);
599 lkb = __find_lkb(ls, lkid);
600 if (lkb)
601 kref_get(&lkb->lkb_ref);
602 read_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 *lkb_ret = lkb;
605 return lkb ? 0 : -ENOENT;
606}
607
608static void kill_lkb(struct kref *kref)
609{
610 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
611
612 /* All work is done after the return from kref_put() so we
613 can release the write_lock before the detach_lkb */
614
615 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
616}
617
618/* __put_lkb() is used when an lkb may not have an rsb attached to
619 it so we need to provide the lockspace explicitly */
620
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF;
624
625 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
627 list_del(&lkb->lkb_idtbl_list);
628 write_unlock(&ls->ls_lkbtbl[bucket].lock);
629
630 detach_lkb(lkb);
631
632 /* for local/process lkbs, lvbptr points to caller's lksb */
633 if (lkb->lkb_lvbptr && is_master_copy(lkb))
634 free_lvb(lkb->lkb_lvbptr);
635 free_lkb(lkb);
636 return 1;
637 } else {
638 write_unlock(&ls->ls_lkbtbl[bucket].lock);
639 return 0;
640 }
641}
642
643int dlm_put_lkb(struct dlm_lkb *lkb)
644{
645 struct dlm_ls *ls;
646
647 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
648 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
649
650 ls = lkb->lkb_resource->res_ls;
651 return __put_lkb(ls, lkb);
652}
653
654/* This is only called to add a reference when the code already holds
655 a valid reference to the lkb, so there's no need for locking. */
656
657static inline void hold_lkb(struct dlm_lkb *lkb)
658{
659 kref_get(&lkb->lkb_ref);
660}
661
662/* This is called when we need to remove a reference and are certain
663 it's not the last ref. e.g. del_lkb is always called between a
664 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
665 put_lkb would work fine, but would involve unnecessary locking */
666
667static inline void unhold_lkb(struct dlm_lkb *lkb)
668{
669 int rv;
670 rv = kref_put(&lkb->lkb_ref, kill_lkb);
671 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
672}
673
674static void lkb_add_ordered(struct list_head *new, struct list_head *head,
675 int mode)
676{
677 struct dlm_lkb *lkb = NULL;
678
679 list_for_each_entry(lkb, head, lkb_statequeue)
680 if (lkb->lkb_rqmode < mode)
681 break;
682
683 if (!lkb)
684 list_add_tail(new, head);
685 else
686 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
687}
688
689/* add/remove lkb to rsb's grant/convert/wait queue */
690
691static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
692{
693 kref_get(&lkb->lkb_ref);
694
695 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
696
697 lkb->lkb_status = status;
698
699 switch (status) {
700 case DLM_LKSTS_WAITING:
701 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
702 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
703 else
704 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
705 break;
706 case DLM_LKSTS_GRANTED:
707 /* convention says granted locks kept in order of grmode */
708 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
709 lkb->lkb_grmode);
710 break;
711 case DLM_LKSTS_CONVERT:
712 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
713 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
714 else
715 list_add_tail(&lkb->lkb_statequeue,
716 &r->res_convertqueue);
717 break;
718 default:
719 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
720 }
721}
722
723static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
724{
725 lkb->lkb_status = 0;
726 list_del(&lkb->lkb_statequeue);
727 unhold_lkb(lkb);
728}
729
730static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
731{
732 hold_lkb(lkb);
733 del_lkb(r, lkb);
734 add_lkb(r, lkb, sts);
735 unhold_lkb(lkb);
736}
737
738/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */
740
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
744
745 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) {
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
748 goto out;
749 }
750 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out:
754 mutex_unlock(&ls->ls_waiters_mutex);
755}
756
757static int _remove_from_waiters(struct dlm_lkb *lkb)
758{
759 int error = 0;
760
761 if (!lkb->lkb_wait_type) {
762 log_print("remove_from_waiters error");
763 error = -EINVAL;
764 goto out;
765 }
766 lkb->lkb_wait_type = 0;
767 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb);
769 out:
770 return error;
771}
772
773static int remove_from_waiters(struct dlm_lkb *lkb)
774{
775 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
776 int error;
777
778 mutex_lock(&ls->ls_waiters_mutex);
779 error = _remove_from_waiters(lkb);
780 mutex_unlock(&ls->ls_waiters_mutex);
781 return error;
782}
783
784static void dir_remove(struct dlm_rsb *r)
785{
786 int to_nodeid;
787
788 if (dlm_no_directory(r->res_ls))
789 return;
790
791 to_nodeid = dlm_dir_nodeid(r);
792 if (to_nodeid != dlm_our_nodeid())
793 send_remove(r);
794 else
795 dlm_dir_remove_entry(r->res_ls, to_nodeid,
796 r->res_name, r->res_length);
797}
798
799/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
800 found since they are in order of newest to oldest? */
801
802static int shrink_bucket(struct dlm_ls *ls, int b)
803{
804 struct dlm_rsb *r;
805 int count = 0, found;
806
807 for (;;) {
808 found = 0;
809 write_lock(&ls->ls_rsbtbl[b].lock);
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ))
814 continue;
815 found = 1;
816 break;
817 }
818
819 if (!found) {
820 write_unlock(&ls->ls_rsbtbl[b].lock);
821 break;
822 }
823
824 if (kref_put(&r->res_ref, kill_rsb)) {
825 list_del(&r->res_hashchain);
826 write_unlock(&ls->ls_rsbtbl[b].lock);
827
828 if (is_master(r))
829 dir_remove(r);
830 free_rsb(r);
831 count++;
832 } else {
833 write_unlock(&ls->ls_rsbtbl[b].lock);
834 log_error(ls, "tossed rsb in use %s", r->res_name);
835 }
836 }
837
838 return count;
839}
840
841void dlm_scan_rsbs(struct dlm_ls *ls)
842{
843 int i;
844
845 if (dlm_locking_stopped(ls))
846 return;
847
848 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
849 shrink_bucket(ls, i);
850 cond_resched();
851 }
852}
853
854/* lkb is master or local copy */
855
856static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
857{
858 int b, len = r->res_ls->ls_lvblen;
859
860 /* b=1 lvb returned to caller
861 b=0 lvb written to rsb or invalidated
862 b=-1 do nothing */
863
864 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
865
866 if (b == 1) {
867 if (!lkb->lkb_lvbptr)
868 return;
869
870 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
871 return;
872
873 if (!r->res_lvbptr)
874 return;
875
876 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
877 lkb->lkb_lvbseq = r->res_lvbseq;
878
879 } else if (b == 0) {
880 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
881 rsb_set_flag(r, RSB_VALNOTVALID);
882 return;
883 }
884
885 if (!lkb->lkb_lvbptr)
886 return;
887
888 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
889 return;
890
891 if (!r->res_lvbptr)
892 r->res_lvbptr = allocate_lvb(r->res_ls);
893
894 if (!r->res_lvbptr)
895 return;
896
897 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
898 r->res_lvbseq++;
899 lkb->lkb_lvbseq = r->res_lvbseq;
900 rsb_clear_flag(r, RSB_VALNOTVALID);
901 }
902
903 if (rsb_flag(r, RSB_VALNOTVALID))
904 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
905}
906
907static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
908{
909 if (lkb->lkb_grmode < DLM_LOCK_PW)
910 return;
911
912 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
913 rsb_set_flag(r, RSB_VALNOTVALID);
914 return;
915 }
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 if (!r->res_lvbptr)
924 r->res_lvbptr = allocate_lvb(r->res_ls);
925
926 if (!r->res_lvbptr)
927 return;
928
929 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
930 r->res_lvbseq++;
931 rsb_clear_flag(r, RSB_VALNOTVALID);
932}
933
934/* lkb is process copy (pc) */
935
936static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
937 struct dlm_message *ms)
938{
939 int b;
940
941 if (!lkb->lkb_lvbptr)
942 return;
943
944 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
945 return;
946
947 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
948 if (b == 1) {
949 int len = receive_extralen(ms);
950 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
951 lkb->lkb_lvbseq = ms->m_lvbseq;
952 }
953}
954
955/* Manipulate lkb's on rsb's convert/granted/waiting queues
956 remove_lock -- used for unlock, removes lkb from granted
957 revert_lock -- used for cancel, moves lkb from convert to granted
958 grant_lock -- used for request and convert, adds lkb to granted or
959 moves lkb from convert or waiting to granted
960
961 Each of these is used for master or local copy lkb's. There is
962 also a _pc() variation used to make the corresponding change on
963 a process copy (pc) lkb. */
964
965static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 del_lkb(r, lkb);
968 lkb->lkb_grmode = DLM_LOCK_IV;
969 /* this unhold undoes the original ref from create_lkb()
970 so this leads to the lkb being freed */
971 unhold_lkb(lkb);
972}
973
974static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
975{
976 set_lvb_unlock(r, lkb);
977 _remove_lock(r, lkb);
978}
979
980static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 _remove_lock(r, lkb);
983}
984
985static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
986{
987 lkb->lkb_rqmode = DLM_LOCK_IV;
988
989 switch (lkb->lkb_status) {
990 case DLM_LKSTS_GRANTED:
991 break;
992 case DLM_LKSTS_CONVERT:
993 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 break;
995 case DLM_LKSTS_WAITING:
996 del_lkb(r, lkb);
997 lkb->lkb_grmode = DLM_LOCK_IV;
998 /* this unhold undoes the original ref from create_lkb()
999 so this leads to the lkb being freed */
1000 unhold_lkb(lkb);
1001 break;
1002 default:
1003 log_print("invalid status for revert %d", lkb->lkb_status);
1004 }
1005}
1006
1007static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1008{
1009 revert_lock(r, lkb);
1010}
1011
1012static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1013{
1014 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
1015 lkb->lkb_grmode = lkb->lkb_rqmode;
1016 if (lkb->lkb_status)
1017 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1018 else
1019 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1020 }
1021
1022 lkb->lkb_rqmode = DLM_LOCK_IV;
1023}
1024
1025static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1026{
1027 set_lvb_lock(r, lkb);
1028 _grant_lock(r, lkb);
1029 lkb->lkb_highbast = 0;
1030}
1031
1032static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1033 struct dlm_message *ms)
1034{
1035 set_lvb_lock_pc(r, lkb, ms);
1036 _grant_lock(r, lkb);
1037}
1038
1039/* called by grant_pending_locks() which means an async grant message must
1040 be sent to the requesting node in addition to granting the lock if the
1041 lkb belongs to a remote node. */
1042
1043static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1044{
1045 grant_lock(r, lkb);
1046 if (is_master_copy(lkb))
1047 send_grant(r, lkb);
1048 else
1049 queue_cast(r, lkb, 0);
1050}
1051
1052static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1053{
1054 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1055 lkb_statequeue);
1056 if (lkb->lkb_id == first->lkb_id)
1057 return 1;
1058
1059 return 0;
1060}
1061
1062/* Check if the given lkb conflicts with another lkb on the queue. */
1063
1064static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1065{
1066 struct dlm_lkb *this;
1067
1068 list_for_each_entry(this, head, lkb_statequeue) {
1069 if (this == lkb)
1070 continue;
1071 if (!modes_compat(this, lkb))
1072 return 1;
1073 }
1074 return 0;
1075}
1076
1077/*
1078 * "A conversion deadlock arises with a pair of lock requests in the converting
1079 * queue for one resource. The granted mode of each lock blocks the requested
1080 * mode of the other lock."
1081 *
1082 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1083 * convert queue from being granted, then demote lkb (set grmode to NL).
1084 * This second form requires that we check for conv-deadlk even when
1085 * now == 0 in _can_be_granted().
1086 *
1087 * Example:
1088 * Granted Queue: empty
1089 * Convert Queue: NL->EX (first lock)
1090 * PR->EX (second lock)
1091 *
1092 * The first lock can't be granted because of the granted mode of the second
1093 * lock and the second lock can't be granted because it's not first in the
1094 * list. We demote the granted mode of the second lock (the lkb passed to this
1095 * function).
1096 *
1097 * After the resolution, the "grant pending" function needs to go back and try
1098 * to grant locks on the convert queue again since the first lock can now be
1099 * granted.
1100 */
1101
1102static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1103{
1104 struct dlm_lkb *this, *first = NULL, *self = NULL;
1105
1106 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1107 if (!first)
1108 first = this;
1109 if (this == lkb) {
1110 self = lkb;
1111 continue;
1112 }
1113
1114 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1115 return 1;
1116 }
1117
1118 /* if lkb is on the convert queue and is preventing the first
1119 from being granted, then there's deadlock and we demote lkb.
1120 multiple converting locks may need to do this before the first
1121 converting lock can be granted. */
1122
1123 if (self && self != first) {
1124 if (!modes_compat(lkb, first) &&
1125 !queue_conflict(&rsb->res_grantqueue, first))
1126 return 1;
1127 }
1128
1129 return 0;
1130}
1131
1132/*
1133 * Return 1 if the lock can be granted, 0 otherwise.
1134 * Also detect and resolve conversion deadlocks.
1135 *
1136 * lkb is the lock to be granted
1137 *
1138 * now is 1 if the function is being called in the context of the
1139 * immediate request; it is 0 if called later, after the lock has been
1140 * queued.
1141 *
1142 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1143 */
1144
1145static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1146{
1147 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1148
1149 /*
1150 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1151 * a new request for a NL mode lock being blocked.
1152 *
1153 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1154 * request, then it would be granted. In essence, the use of this flag
1155 * tells the Lock Manager to expedite this request by not considering
1156 * what may be in the CONVERTING or WAITING queues... As of this
1157 * writing, the EXPEDITE flag can be used only with new requests for NL
1158 * mode locks. This flag is not valid for conversion requests.
1159 *
1160 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1161 * conversion or used with a non-NL requested mode. We also know an
1162 * EXPEDITE request is always granted immediately, so now must always
1163 * be 1. The full condition to grant an expedite request: (now &&
1164 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1165 * therefore be shortened to just checking the flag.
1166 */
1167
1168 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1169 return 1;
1170
1171 /*
1172 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1173 * added to the remaining conditions.
1174 */
1175
1176 if (queue_conflict(&r->res_grantqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-3: By default, a conversion request is immediately granted if the
1181 * requested mode is compatible with the modes of all other granted
1182 * locks
1183 */
1184
1185 if (queue_conflict(&r->res_convertqueue, lkb))
1186 goto out;
1187
1188 /*
1189 * 6-5: But the default algorithm for deciding whether to grant or
1190 * queue conversion requests does not by itself guarantee that such
1191 * requests are serviced on a "first come first serve" basis. This, in
1192 * turn, can lead to a phenomenon known as "indefinite postponement".
1193 *
1194 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1195 * the system service employed to request a lock conversion. This flag
1196 * forces certain conversion requests to be queued, even if they are
1197 * compatible with the granted modes of other locks on the same
1198 * resource. Thus, the use of this flag results in conversion requests
1199 * being ordered on a "first come first serve" basis.
1200 *
1201 * DCT: This condition is all about new conversions being able to occur
1202 * "in place" while the lock remains on the granted queue (assuming
1203 * nothing else conflicts). IOW, if QUECVT isn't set, a conversion
1204 * doesn't _have_ to go onto the convert queue where it's processed in
1205 * order. The "now" variable is necessary to distinguish converts
1206 * being received and processed for the first time now, because once a
1207 * convert is moved to the conversion queue the condition below applies
1208 * requiring fifo granting.
1209 */
1210
1211 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1212 return 1;
1213
1214 /*
1215 * The NOORDER flag is set to avoid the standard vms rules on grant
1216 * order.
1217 */
1218
1219 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1220 return 1;
1221
1222 /*
1223 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1224 * granted until all other conversion requests ahead of it are granted
1225 * and/or canceled.
1226 */
1227
1228 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1229 return 1;
1230
1231 /*
1232 * 6-4: By default, a new request is immediately granted only if all
1233 * three of the following conditions are satisfied when the request is
1234 * issued:
1235 * - The queue of ungranted conversion requests for the resource is
1236 * empty.
1237 * - The queue of ungranted new requests for the resource is empty.
1238 * - The mode of the new request is compatible with the most
1239 * restrictive mode of all granted locks on the resource.
1240 */
1241
1242 if (now && !conv && list_empty(&r->res_convertqueue) &&
1243 list_empty(&r->res_waitqueue))
1244 return 1;
1245
1246 /*
1247 * 6-4: Once a lock request is in the queue of ungranted new requests,
1248 * it cannot be granted until the queue of ungranted conversion
1249 * requests is empty, all ungranted new requests ahead of it are
1250 * granted and/or canceled, and it is compatible with the granted mode
1251 * of the most restrictive lock granted on the resource.
1252 */
1253
1254 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1255 first_in_list(lkb, &r->res_waitqueue))
1256 return 1;
1257
1258 out:
1259 /*
1260 * The following, enabled by CONVDEADLK, departs from VMS.
1261 */
1262
1263 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1264 conversion_deadlock_detect(r, lkb)) {
1265 lkb->lkb_grmode = DLM_LOCK_NL;
1266 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1267 }
1268
1269 return 0;
1270}
1271
1272/*
1273 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1274 * simple way to provide a big optimization to applications that can use them.
1275 */
1276
1277static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1278{
1279 uint32_t flags = lkb->lkb_exflags;
1280 int rv;
1281 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1282
1283 rv = _can_be_granted(r, lkb, now);
1284 if (rv)
1285 goto out;
1286
1287 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1288 goto out;
1289
1290 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1291 alt = DLM_LOCK_PR;
1292 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1293 alt = DLM_LOCK_CW;
1294
1295 if (alt) {
1296 lkb->lkb_rqmode = alt;
1297 rv = _can_be_granted(r, lkb, now);
1298 if (rv)
1299 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1300 else
1301 lkb->lkb_rqmode = rqmode;
1302 }
1303 out:
1304 return rv;
1305}
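
/* Illustrative sketch (editor's note, not original code): a hypothetical
   caller requests PW with DLM_LKF_ALTPR on a resource that already has a
   PR lock granted.  PW conflicts with PR, so the first _can_be_granted()
   fails; the retry with the alternate mode succeeds because PR is
   compatible with PR, and the caller learns which mode it actually holds
   from the DLM_SBF_ALTMODE status flag:

	rv = _can_be_granted(r, lkb, now);    /- rqmode PW: conflicts, rv 0 -/
	lkb->lkb_rqmode = DLM_LOCK_PR;        /- fall back to the ALTPR mode -/
	rv = _can_be_granted(r, lkb, now);    /- rqmode PR: compatible, rv 1 -/
	lkb->lkb_sbflags |= DLM_SBF_ALTMODE;  /- reported back in the lksb -/
*/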
1306
1307static int grant_pending_convert(struct dlm_rsb *r, int high)
1308{
1309 struct dlm_lkb *lkb, *s;
1310 int hi, demoted, quit, grant_restart, demote_restart;
1311
1312 quit = 0;
1313 restart:
1314 grant_restart = 0;
1315 demote_restart = 0;
1316 hi = DLM_LOCK_IV;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1319 demoted = is_demoted(lkb);
1320 if (can_be_granted(r, lkb, 0)) {
1321 grant_lock_pending(r, lkb);
1322 grant_restart = 1;
1323 } else {
1324 hi = max_t(int, lkb->lkb_rqmode, hi);
1325 if (!demoted && is_demoted(lkb))
1326 demote_restart = 1;
1327 }
1328 }
1329
1330 if (grant_restart)
1331 goto restart;
1332 if (demote_restart && !quit) {
1333 quit = 1;
1334 goto restart;
1335 }
1336
1337 return max_t(int, high, hi);
1338}
1339
1340static int grant_pending_wait(struct dlm_rsb *r, int high)
1341{
1342 struct dlm_lkb *lkb, *s;
1343
1344 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1345 if (can_be_granted(r, lkb, 0))
1346 grant_lock_pending(r, lkb);
1347 else
1348 high = max_t(int, lkb->lkb_rqmode, high);
1349 }
1350
1351 return high;
1352}
1353
1354static void grant_pending_locks(struct dlm_rsb *r)
1355{
1356 struct dlm_lkb *lkb, *s;
1357 int high = DLM_LOCK_IV;
1358
1359 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1360
1361 high = grant_pending_convert(r, high);
1362 high = grant_pending_wait(r, high);
1363
1364 if (high == DLM_LOCK_IV)
1365 return;
1366
1367 /*
1368 * If there are locks left on the wait/convert queue then send blocking
1369 * ASTs to granted locks based on the largest requested mode (high)
1370 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1371 */
1372
1373 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1374 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1375 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1376 queue_bast(r, lkb, high);
1377 lkb->lkb_highbast = high;
1378 }
1379 }
1380}
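
/* Illustration (editor's note): if locks remain on the wait/convert
   queues after the grant passes above, "high" is the largest ungranted
   rqmode.  With high == DLM_LOCK_EX, for example, every granted lock
   above NL is incompatible with EX, so each such lkb with a bast callback
   and lkb_highbast < EX gets a blocking ast asking its holder to release
   or demote; lkb_highbast is then raised so the same holder is not
   re-notified for EX again. */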
1381
1382static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1383 struct dlm_lkb *lkb)
1384{
1385 struct dlm_lkb *gr;
1386
1387 list_for_each_entry(gr, head, lkb_statequeue) {
1388 if (gr->lkb_bastaddr &&
1389 gr->lkb_highbast < lkb->lkb_rqmode &&
1390 !modes_compat(gr, lkb)) {
1391 queue_bast(r, gr, lkb->lkb_rqmode);
1392 gr->lkb_highbast = lkb->lkb_rqmode;
1393 }
1394 }
1395}
1396
1397static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1398{
1399 send_bast_queue(r, &r->res_grantqueue, lkb);
1400}
1401
1402static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1403{
1404 send_bast_queue(r, &r->res_grantqueue, lkb);
1405 send_bast_queue(r, &r->res_convertqueue, lkb);
1406}
1407
1408/* set_master(r, lkb) -- set the master nodeid of a resource
1409
1410 The purpose of this function is to set the nodeid field in the given
1411 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1412 known, it can just be copied to the lkb and the function will return
1413 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1414 before it can be copied to the lkb.
1415
1416 When the rsb nodeid is being looked up remotely, the initial lkb
1417 causing the lookup is kept on the ls_waiters list waiting for the
1418 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1419 on the rsb's res_lookup list until the master is verified.
1420
1421 Return values:
1422 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1423 1: the rsb master is not available and the lkb has been placed on
1424 a wait queue
1425*/
1426
1427static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1428{
1429 struct dlm_ls *ls = r->res_ls;
1430 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1431
1432 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1433 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1434 r->res_first_lkid = lkb->lkb_id;
1435 lkb->lkb_nodeid = r->res_nodeid;
1436 return 0;
1437 }
1438
1439 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1440 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1441 return 1;
1442 }
1443
1444 if (r->res_nodeid == 0) {
1445 lkb->lkb_nodeid = 0;
1446 return 0;
1447 }
1448
1449 if (r->res_nodeid > 0) {
1450 lkb->lkb_nodeid = r->res_nodeid;
1451 return 0;
1452 }
1453
1454 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1455
1456 dir_nodeid = dlm_dir_nodeid(r);
1457
1458 if (dir_nodeid != our_nodeid) {
1459 r->res_first_lkid = lkb->lkb_id;
1460 send_lookup(r, lkb);
1461 return 1;
1462 }
1463
1464 for (;;) {
1465 /* It's possible for dlm_scand to remove an old rsb for
1466		   this same resource from the toss list, for us to create
1467 a new one, look up the master locally, and find it
1468 already exists just before dlm_scand does the
1469 dir_remove() on the previous rsb. */
1470
1471 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1472 r->res_length, &ret_nodeid);
1473 if (!error)
1474 break;
1475 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1476 schedule();
1477 }
1478
1479 if (ret_nodeid == our_nodeid) {
1480 r->res_first_lkid = 0;
1481 r->res_nodeid = 0;
1482 lkb->lkb_nodeid = 0;
1483 } else {
1484 r->res_first_lkid = lkb->lkb_id;
1485 r->res_nodeid = ret_nodeid;
1486 lkb->lkb_nodeid = ret_nodeid;
1487 }
1488 return 0;
1489}
1490
1491static void process_lookup_list(struct dlm_rsb *r)
1492{
1493 struct dlm_lkb *lkb, *safe;
1494
1495 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1496 list_del(&lkb->lkb_rsb_lookup);
1497 _request_lock(r, lkb);
1498 schedule();
1499 }
1500}
1501
1502/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1503
1504static void confirm_master(struct dlm_rsb *r, int error)
1505{
1506 struct dlm_lkb *lkb;
1507
1508 if (!r->res_first_lkid)
1509 return;
1510
1511 switch (error) {
1512 case 0:
1513 case -EINPROGRESS:
1514 r->res_first_lkid = 0;
1515 process_lookup_list(r);
1516 break;
1517
1518 case -EAGAIN:
1519 /* the remote master didn't queue our NOQUEUE request;
1520 make a waiting lkb the first_lkid */
1521
1522 r->res_first_lkid = 0;
1523
1524 if (!list_empty(&r->res_lookup)) {
1525 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1526 lkb_rsb_lookup);
1527 list_del(&lkb->lkb_rsb_lookup);
1528 r->res_first_lkid = lkb->lkb_id;
1529 _request_lock(r, lkb);
1530 } else
1531 r->res_nodeid = -1;
1532 break;
1533
1534 default:
1535 log_error(r->res_ls, "confirm_master unknown error %d", error);
1536 }
1537}
1538
1539static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1540 int namelen, uint32_t parent_lkid, void *ast,
1541 void *astarg, void *bast, struct dlm_args *args)
1542{
1543 int rv = -EINVAL;
1544
1545 /* check for invalid arg usage */
1546
1547 if (mode < 0 || mode > DLM_LOCK_EX)
1548 goto out;
1549
1550 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1551 goto out;
1552
1553 if (flags & DLM_LKF_CANCEL)
1554 goto out;
1555
1556 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1557 goto out;
1558
1559 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1560 goto out;
1561
1562 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1563 goto out;
1564
1565 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1566 goto out;
1567
1568 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1569 goto out;
1570
1571 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1572 goto out;
1573
1574 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1575 goto out;
1576
1577 if (!ast || !lksb)
1578 goto out;
1579
1580 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1581 goto out;
1582
1583 /* parent/child locks not yet supported */
1584 if (parent_lkid)
1585 goto out;
1586
1587 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1588 goto out;
1589
1590	/* these args will be copied to the lkb in validate_lock_args;
1591 it cannot be done now because when converting locks, fields in
1592 an active lkb cannot be modified before locking the rsb */
1593
1594 args->flags = flags;
1595 args->astaddr = ast;
1596 args->astparam = (long) astarg;
1597 args->bastaddr = bast;
1598 args->mode = mode;
1599 args->lksb = lksb;
1600 rv = 0;
1601 out:
1602 return rv;
1603}
1604
1605static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1606{
1607 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1608 DLM_LKF_FORCEUNLOCK))
1609 return -EINVAL;
1610
1611 args->flags = flags;
1612 args->astparam = (long) astarg;
1613 return 0;
1614}
1615
1616static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1617 struct dlm_args *args)
1618{
1619 int rv = -EINVAL;
1620
1621 if (args->flags & DLM_LKF_CONVERT) {
1622 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1623 goto out;
1624
1625 if (args->flags & DLM_LKF_QUECVT &&
1626 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1627 goto out;
1628
1629 rv = -EBUSY;
1630 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1631 goto out;
1632
1633 if (lkb->lkb_wait_type)
1634 goto out;
1635 }
1636
1637 lkb->lkb_exflags = args->flags;
1638 lkb->lkb_sbflags = 0;
1639 lkb->lkb_astaddr = args->astaddr;
1640 lkb->lkb_astparam = args->astparam;
1641 lkb->lkb_bastaddr = args->bastaddr;
1642 lkb->lkb_rqmode = args->mode;
1643 lkb->lkb_lksb = args->lksb;
1644 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1645 lkb->lkb_ownpid = (int) current->pid;
1646 rv = 0;
1647 out:
1648 return rv;
1649}
1650
1651static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1652{
1653 int rv = -EINVAL;
1654
1655 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1656 goto out;
1657
1658 if (args->flags & DLM_LKF_FORCEUNLOCK)
1659 goto out_ok;
1660
1661 if (args->flags & DLM_LKF_CANCEL &&
1662 lkb->lkb_status == DLM_LKSTS_GRANTED)
1663 goto out;
1664
1665 if (!(args->flags & DLM_LKF_CANCEL) &&
1666 lkb->lkb_status != DLM_LKSTS_GRANTED)
1667 goto out;
1668
1669 rv = -EBUSY;
1670 if (lkb->lkb_wait_type)
1671 goto out;
1672
1673 out_ok:
1674 lkb->lkb_exflags = args->flags;
1675 lkb->lkb_sbflags = 0;
1676 lkb->lkb_astparam = args->astparam;
1677
1678 rv = 0;
1679 out:
1680 return rv;
1681}
1682
1683/*
1684 * Four stage 4 varieties:
1685 * do_request(), do_convert(), do_unlock(), do_cancel()
1686 * These are called from the central locking logic, on the node
1687 * that masters the given lock.
1688 */
1689
1690static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 if (can_be_granted(r, lkb, 1)) {
1695 grant_lock(r, lkb);
1696 queue_cast(r, lkb, 0);
1697 goto out;
1698 }
1699
1700 if (can_be_queued(lkb)) {
1701 error = -EINPROGRESS;
1702 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1703 send_blocking_asts(r, lkb);
1704 goto out;
1705 }
1706
1707 error = -EAGAIN;
1708 if (force_blocking_asts(lkb))
1709 send_blocking_asts_all(r, lkb);
1710 queue_cast(r, lkb, -EAGAIN);
1711
1712 out:
1713 return error;
1714}
1715
1716static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1717{
1718 int error = 0;
1719
1720 /* changing an existing lock may allow others to be granted */
1721
1722 if (can_be_granted(r, lkb, 1)) {
1723 grant_lock(r, lkb);
1724 queue_cast(r, lkb, 0);
1725 grant_pending_locks(r);
1726 goto out;
1727 }
1728
1729 if (can_be_queued(lkb)) {
1730 if (is_demoted(lkb))
1731 grant_pending_locks(r);
1732 error = -EINPROGRESS;
1733 del_lkb(r, lkb);
1734 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1735 send_blocking_asts(r, lkb);
1736 goto out;
1737 }
1738
1739 error = -EAGAIN;
1740 if (force_blocking_asts(lkb))
1741 send_blocking_asts_all(r, lkb);
1742 queue_cast(r, lkb, -EAGAIN);
1743
1744 out:
1745 return error;
1746}
1747
1748static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1749{
1750 remove_lock(r, lkb);
1751 queue_cast(r, lkb, -DLM_EUNLOCK);
1752 grant_pending_locks(r);
1753 return -DLM_EUNLOCK;
1754}
1755
1756/* FIXME: if revert_lock() finds that the lkb is granted, we should
1757 skip the queue_cast(ECANCEL). It indicates that the request/convert
1758 completed (and queued a normal ast) just before the cancel; we don't
1759 want to clobber the sb_result for the normal ast with ECANCEL. */
1760
1761static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1762{
1763 revert_lock(r, lkb);
1764 queue_cast(r, lkb, -DLM_ECANCEL);
1765 grant_pending_locks(r);
1766 return -DLM_ECANCEL;
1767}
1768
1769/*
1770 * Four stage 3 varieties:
1771 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1772 */
1773
1774/* add a new lkb to a possibly new rsb, called by requesting process */
1775
1776static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1777{
1778 int error;
1779
1780 /* set_master: sets lkb nodeid from r */
1781
1782 error = set_master(r, lkb);
1783 if (error < 0)
1784 goto out;
1785 if (error) {
1786 error = 0;
1787 goto out;
1788 }
1789
1790 if (is_remote(r))
1791 /* receive_request() calls do_request() on remote node */
1792 error = send_request(r, lkb);
1793 else
1794 error = do_request(r, lkb);
1795 out:
1796 return error;
1797}
1798
1799/* change some property of an existing lkb, e.g. mode */
1800
1801static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1802{
1803 int error;
1804
1805 if (is_remote(r))
1806 /* receive_convert() calls do_convert() on remote node */
1807 error = send_convert(r, lkb);
1808 else
1809 error = do_convert(r, lkb);
1810
1811 return error;
1812}
1813
1814/* remove an existing lkb from the granted queue */
1815
1816static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1817{
1818 int error;
1819
1820 if (is_remote(r))
1821 /* receive_unlock() calls do_unlock() on remote node */
1822 error = send_unlock(r, lkb);
1823 else
1824 error = do_unlock(r, lkb);
1825
1826 return error;
1827}
1828
1829/* remove an existing lkb from the convert or wait queue */
1830
1831static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1832{
1833 int error;
1834
1835 if (is_remote(r))
1836 /* receive_cancel() calls do_cancel() on remote node */
1837 error = send_cancel(r, lkb);
1838 else
1839 error = do_cancel(r, lkb);
1840
1841 return error;
1842}
1843
1844/*
1845 * Four stage 2 varieties:
1846 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1847 */
1848
1849static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1850 int len, struct dlm_args *args)
1851{
1852 struct dlm_rsb *r;
1853 int error;
1854
1855 error = validate_lock_args(ls, lkb, args);
1856 if (error)
1857 goto out;
1858
1859 error = find_rsb(ls, name, len, R_CREATE, &r);
1860 if (error)
1861 goto out;
1862
1863 lock_rsb(r);
1864
1865 attach_lkb(r, lkb);
1866 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1867
1868 error = _request_lock(r, lkb);
1869
1870 unlock_rsb(r);
1871 put_rsb(r);
1872
1873 out:
1874 return error;
1875}
1876
1877static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1878 struct dlm_args *args)
1879{
1880 struct dlm_rsb *r;
1881 int error;
1882
1883 r = lkb->lkb_resource;
1884
1885 hold_rsb(r);
1886 lock_rsb(r);
1887
1888 error = validate_lock_args(ls, lkb, args);
1889 if (error)
1890 goto out;
1891
1892 error = _convert_lock(r, lkb);
1893 out:
1894 unlock_rsb(r);
1895 put_rsb(r);
1896 return error;
1897}
1898
1899static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1900 struct dlm_args *args)
1901{
1902 struct dlm_rsb *r;
1903 int error;
1904
1905 r = lkb->lkb_resource;
1906
1907 hold_rsb(r);
1908 lock_rsb(r);
1909
1910 error = validate_unlock_args(lkb, args);
1911 if (error)
1912 goto out;
1913
1914 error = _unlock_lock(r, lkb);
1915 out:
1916 unlock_rsb(r);
1917 put_rsb(r);
1918 return error;
1919}
1920
1921static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1922 struct dlm_args *args)
1923{
1924 struct dlm_rsb *r;
1925 int error;
1926
1927 r = lkb->lkb_resource;
1928
1929 hold_rsb(r);
1930 lock_rsb(r);
1931
1932 error = validate_unlock_args(lkb, args);
1933 if (error)
1934 goto out;
1935
1936 error = _cancel_lock(r, lkb);
1937 out:
1938 unlock_rsb(r);
1939 put_rsb(r);
1940 return error;
1941}
1942
1943/*
1944 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1945 */
1946
1947int dlm_lock(dlm_lockspace_t *lockspace,
1948 int mode,
1949 struct dlm_lksb *lksb,
1950 uint32_t flags,
1951 void *name,
1952 unsigned int namelen,
1953 uint32_t parent_lkid,
1954 void (*ast) (void *astarg),
1955 void *astarg,
1956 void (*bast) (void *astarg, int mode))
1957{
1958 struct dlm_ls *ls;
1959 struct dlm_lkb *lkb;
1960 struct dlm_args args;
1961 int error, convert = flags & DLM_LKF_CONVERT;
1962
1963 ls = dlm_find_lockspace_local(lockspace);
1964 if (!ls)
1965 return -EINVAL;
1966
1967 lock_recovery(ls);
1968
1969 if (convert)
1970 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1971 else
1972 error = create_lkb(ls, &lkb);
1973
1974 if (error)
1975 goto out;
1976
1977 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1978 astarg, bast, &args);
1979 if (error)
1980 goto out_put;
1981
1982 if (convert)
1983 error = convert_lock(ls, lkb, &args);
1984 else
1985 error = request_lock(ls, lkb, name, namelen, &args);
1986
1987 if (error == -EINPROGRESS)
1988 error = 0;
1989 out_put:
1990 if (convert || error)
1991 __put_lkb(ls, lkb);
1992 if (error == -EAGAIN)
1993 error = 0;
1994 out:
1995 unlock_recovery(ls);
1996 dlm_put_lockspace(ls);
1997 return error;
1998}
1999
2000int dlm_unlock(dlm_lockspace_t *lockspace,
2001 uint32_t lkid,
2002 uint32_t flags,
2003 struct dlm_lksb *lksb,
2004 void *astarg)
2005{
2006 struct dlm_ls *ls;
2007 struct dlm_lkb *lkb;
2008 struct dlm_args args;
2009 int error;
2010
2011 ls = dlm_find_lockspace_local(lockspace);
2012 if (!ls)
2013 return -EINVAL;
2014
2015 lock_recovery(ls);
2016
2017 error = find_lkb(ls, lkid, &lkb);
2018 if (error)
2019 goto out;
2020
2021 error = set_unlock_args(flags, astarg, &args);
2022 if (error)
2023 goto out_put;
2024
2025 if (flags & DLM_LKF_CANCEL)
2026 error = cancel_lock(ls, lkb, &args);
2027 else
2028 error = unlock_lock(ls, lkb, &args);
2029
2030 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2031 error = 0;
2032 out_put:
2033 dlm_put_lkb(lkb);
2034 out:
2035 unlock_recovery(ls);
2036 dlm_put_lockspace(ls);
2037 return error;
2038}
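
/* Example usage (editor's sketch, not part of this file): a kernel
   caller taking and dropping an EX lock.  The lockspace "ls" and the
   callbacks my_ast()/my_bast() are assumed to be provided by the caller
   (e.g. from dlm_new_lockspace()); error handling is omitted, and the
   final status of each call arrives asynchronously in lksb.sb_status
   via the completion ast.

	struct dlm_lksb lksb;
	int error;

	memset(&lksb, 0, sizeof(lksb));
	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "resone", 6, 0,
			 my_ast, &lksb, my_bast);
	... wait for my_ast(), then check lksb.sb_status ...
	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);

   Passing DLM_LKF_CANCEL in the dlm_unlock() flags would instead cancel
   a request still sitting on the convert or wait queue. */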
2039
2040/*
2041 * send/receive routines for remote operations and replies
2042 *
2043 * send_args
2044 * send_common
2045 * send_request receive_request
2046 * send_convert receive_convert
2047 * send_unlock receive_unlock
2048 * send_cancel receive_cancel
2049 * send_grant receive_grant
2050 * send_bast receive_bast
2051 * send_lookup receive_lookup
2052 * send_remove receive_remove
2053 *
2054 * send_common_reply
2055 * receive_request_reply send_request_reply
2056 * receive_convert_reply send_convert_reply
2057 * receive_unlock_reply send_unlock_reply
2058 * receive_cancel_reply send_cancel_reply
2059 * receive_lookup_reply send_lookup_reply
2060 */
2061
2062static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2063 int to_nodeid, int mstype,
2064 struct dlm_message **ms_ret,
2065 struct dlm_mhandle **mh_ret)
2066{
2067 struct dlm_message *ms;
2068 struct dlm_mhandle *mh;
2069 char *mb;
2070 int mb_len = sizeof(struct dlm_message);
2071
2072 switch (mstype) {
2073 case DLM_MSG_REQUEST:
2074 case DLM_MSG_LOOKUP:
2075 case DLM_MSG_REMOVE:
2076 mb_len += r->res_length;
2077 break;
2078 case DLM_MSG_CONVERT:
2079 case DLM_MSG_UNLOCK:
2080 case DLM_MSG_REQUEST_REPLY:
2081 case DLM_MSG_CONVERT_REPLY:
2082 case DLM_MSG_GRANT:
2083 if (lkb && lkb->lkb_lvbptr)
2084 mb_len += r->res_ls->ls_lvblen;
2085 break;
2086 }
2087
2088 /* get_buffer gives us a message handle (mh) that we need to
2089 pass into lowcomms_commit and a message buffer (mb) that we
2090 write our data into */
2091
2092 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2093 if (!mh)
2094 return -ENOBUFS;
2095
2096 memset(mb, 0, mb_len);
2097
2098 ms = (struct dlm_message *) mb;
2099
2100 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2101 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2102 ms->m_header.h_nodeid = dlm_our_nodeid();
2103 ms->m_header.h_length = mb_len;
2104 ms->m_header.h_cmd = DLM_MSG;
2105
2106 ms->m_type = mstype;
2107
2108 *mh_ret = mh;
2109 *ms_ret = ms;
2110 return 0;
2111}
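
/* Illustration (editor's note): for a DLM_MSG_REQUEST on a resource
   whose res_length is 6, the buffer is sized as
   sizeof(struct dlm_message) + 6 and send_args() copies the name into
   m_extra; receive_extralen() below recovers that 6 on the receiving
   node by subtracting the fixed message size from h_length. */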
2112
2113/* further lowcomms enhancements or alternate implementations may make
2114 the return value from this function useful at some point */
2115
2116static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2117{
2118 dlm_message_out(ms);
2119 dlm_lowcomms_commit_buffer(mh);
2120 return 0;
2121}
2122
2123static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2124 struct dlm_message *ms)
2125{
2126 ms->m_nodeid = lkb->lkb_nodeid;
2127 ms->m_pid = lkb->lkb_ownpid;
2128 ms->m_lkid = lkb->lkb_id;
2129 ms->m_remid = lkb->lkb_remid;
2130 ms->m_exflags = lkb->lkb_exflags;
2131 ms->m_sbflags = lkb->lkb_sbflags;
2132 ms->m_flags = lkb->lkb_flags;
2133 ms->m_lvbseq = lkb->lkb_lvbseq;
2134 ms->m_status = lkb->lkb_status;
2135 ms->m_grmode = lkb->lkb_grmode;
2136 ms->m_rqmode = lkb->lkb_rqmode;
2137 ms->m_hash = r->res_hash;
2138
2139 /* m_result and m_bastmode are set from function args,
2140 not from lkb fields */
2141
2142 if (lkb->lkb_bastaddr)
2143 ms->m_asts |= AST_BAST;
2144 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP;
2146
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2148 memcpy(ms->m_extra, r->res_name, r->res_length);
2149
2150 else if (lkb->lkb_lvbptr)
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2152
2153}
2154
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2156{
2157 struct dlm_message *ms;
2158 struct dlm_mhandle *mh;
2159 int to_nodeid, error;
2160
2161 add_to_waiters(lkb, mstype);
2162
2163 to_nodeid = r->res_nodeid;
2164
2165 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2166 if (error)
2167 goto fail;
2168
2169 send_args(r, lkb, ms);
2170
2171 error = send_message(mh, ms);
2172 if (error)
2173 goto fail;
2174 return 0;
2175
2176 fail:
2177 remove_from_waiters(lkb);
2178 return error;
2179}
2180
2181static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2182{
2183 return send_common(r, lkb, DLM_MSG_REQUEST);
2184}
2185
2186static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 int error;
2189
2190 error = send_common(r, lkb, DLM_MSG_CONVERT);
2191
2192 /* down conversions go without a reply from the master */
2193 if (!error && down_conversion(lkb)) {
2194 remove_from_waiters(lkb);
2195 r->res_ls->ls_stub_ms.m_result = 0;
2196 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2197 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2198 }
2199
2200 return error;
2201}
2202
2203/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2204 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2205 that the master is still correct. */
2206
2207static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2208{
2209 return send_common(r, lkb, DLM_MSG_UNLOCK);
2210}
2211
2212static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2213{
2214 return send_common(r, lkb, DLM_MSG_CANCEL);
2215}
2216
2217static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_result = 0;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 to_nodeid = lkb->lkb_nodeid;
2245
2246 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2247 if (error)
2248 goto out;
2249
2250 send_args(r, lkb, ms);
2251
2252 ms->m_bastmode = mode;
2253
2254 error = send_message(mh, ms);
2255 out:
2256 return error;
2257}
2258
2259static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2260{
2261 struct dlm_message *ms;
2262 struct dlm_mhandle *mh;
2263 int to_nodeid, error;
2264
2265 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2266
2267 to_nodeid = dlm_dir_nodeid(r);
2268
2269 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2270 if (error)
2271 goto fail;
2272
2273 send_args(r, lkb, ms);
2274
2275 error = send_message(mh, ms);
2276 if (error)
2277 goto fail;
2278 return 0;
2279
2280 fail:
2281 remove_from_waiters(lkb);
2282 return error;
2283}
2284
2285static int send_remove(struct dlm_rsb *r)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = dlm_dir_nodeid(r);
2292
2293 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 memcpy(ms->m_extra, r->res_name, r->res_length);
2298 ms->m_hash = r->res_hash;
2299
2300 error = send_message(mh, ms);
2301 out:
2302 return error;
2303}
2304
2305static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2306 int mstype, int rv)
2307{
2308 struct dlm_message *ms;
2309 struct dlm_mhandle *mh;
2310 int to_nodeid, error;
2311
2312 to_nodeid = lkb->lkb_nodeid;
2313
2314 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2315 if (error)
2316 goto out;
2317
2318 send_args(r, lkb, ms);
2319
2320 ms->m_result = rv;
2321
2322 error = send_message(mh, ms);
2323 out:
2324 return error;
2325}
2326
2327static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2328{
2329 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2330}
2331
2332static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2333{
2334 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2335}
2336
2337static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2338{
2339 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2340}
2341
2342static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2343{
2344 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2345}
2346
2347static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2348 int ret_nodeid, int rv)
2349{
2350 struct dlm_rsb *r = &ls->ls_stub_rsb;
2351 struct dlm_message *ms;
2352 struct dlm_mhandle *mh;
2353 int error, nodeid = ms_in->m_header.h_nodeid;
2354
2355 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2356 if (error)
2357 goto out;
2358
2359 ms->m_lkid = ms_in->m_lkid;
2360 ms->m_result = rv;
2361 ms->m_nodeid = ret_nodeid;
2362
2363 error = send_message(mh, ms);
2364 out:
2365 return error;
2366}
2367
2368/* which args we save from a received message depends heavily on the type
2369 of message, unlike the send side where we can safely send everything about
2370 the lkb for any type of message */
2371
2372static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2373{
2374 lkb->lkb_exflags = ms->m_exflags;
2375 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2376 (ms->m_flags & 0x0000FFFF);
2377}
2378
2379static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2380{
2381 lkb->lkb_sbflags = ms->m_sbflags;
2382 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2383 (ms->m_flags & 0x0000FFFF);
2384}
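
/* Illustration (editor's note): only the low 16 bits of lkb_flags are
   meaningful on the wire; internal flags such as DLM_IFL_MSTCPY live in
   the high word and must survive a received message, hence the merge
   both helpers above perform:

	lkb_flags = (local & 0xFFFF0000) | (remote & 0x0000FFFF);
*/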
2385
2386static int receive_extralen(struct dlm_message *ms)
2387{
2388 return (ms->m_header.h_length - sizeof(struct dlm_message));
2389}
2390
2391static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2392 struct dlm_message *ms)
2393{
2394 int len;
2395
2396 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2397 if (!lkb->lkb_lvbptr)
2398 lkb->lkb_lvbptr = allocate_lvb(ls);
2399 if (!lkb->lkb_lvbptr)
2400 return -ENOMEM;
2401 len = receive_extralen(ms);
2402 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2403 }
2404 return 0;
2405}
2406
2407static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2408 struct dlm_message *ms)
2409{
2410 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2411 lkb->lkb_ownpid = ms->m_pid;
2412 lkb->lkb_remid = ms->m_lkid;
2413 lkb->lkb_grmode = DLM_LOCK_IV;
2414 lkb->lkb_rqmode = ms->m_rqmode;
2415 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2416 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2417
2418 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2419
2420 if (receive_lvb(ls, lkb, ms))
2421 return -ENOMEM;
2422
2423 return 0;
2424}
2425
2426static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2427 struct dlm_message *ms)
2428{
2429 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2430 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2431 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2432 lkb->lkb_id, lkb->lkb_remid);
2433 return -EINVAL;
2434 }
2435
2436 if (!is_master_copy(lkb))
2437 return -EINVAL;
2438
2439 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2440 return -EBUSY;
2441
2442 if (receive_lvb(ls, lkb, ms))
2443 return -ENOMEM;
2444
2445 lkb->lkb_rqmode = ms->m_rqmode;
2446 lkb->lkb_lvbseq = ms->m_lvbseq;
2447
2448 return 0;
2449}
2450
2451static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2452 struct dlm_message *ms)
2453{
2454 if (!is_master_copy(lkb))
2455 return -EINVAL;
2456 if (receive_lvb(ls, lkb, ms))
2457 return -ENOMEM;
2458 return 0;
2459}
2460
2461/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2462 uses to send a reply and that the remote end uses to process the reply. */
2463
2464static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2465{
2466 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2467 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2468 lkb->lkb_remid = ms->m_lkid;
2469}
2470
2471static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2472{
2473 struct dlm_lkb *lkb;
2474 struct dlm_rsb *r;
2475 int error, namelen;
2476
2477 error = create_lkb(ls, &lkb);
2478 if (error)
2479 goto fail;
2480
2481 receive_flags(lkb, ms);
2482 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2483 error = receive_request_args(ls, lkb, ms);
2484 if (error) {
2485 __put_lkb(ls, lkb);
2486 goto fail;
2487 }
2488
2489 namelen = receive_extralen(ms);
2490
2491 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2492 if (error) {
2493 __put_lkb(ls, lkb);
2494 goto fail;
2495 }
2496
2497 lock_rsb(r);
2498
2499 attach_lkb(r, lkb);
2500 error = do_request(r, lkb);
2501 send_request_reply(r, lkb, error);
2502
2503 unlock_rsb(r);
2504 put_rsb(r);
2505
2506 if (error == -EINPROGRESS)
2507 error = 0;
2508 if (error)
2509 dlm_put_lkb(lkb);
2510 return;
2511
2512 fail:
2513 setup_stub_lkb(ls, ms);
2514 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2515}
2516
2517static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2518{
2519 struct dlm_lkb *lkb;
2520 struct dlm_rsb *r;
2521 int error, reply = 1;
2522
2523 error = find_lkb(ls, ms->m_remid, &lkb);
2524 if (error)
2525 goto fail;
2526
2527 r = lkb->lkb_resource;
2528
2529 hold_rsb(r);
2530 lock_rsb(r);
2531
2532 receive_flags(lkb, ms);
2533 error = receive_convert_args(ls, lkb, ms);
2534 if (error)
2535 goto out;
2536 reply = !down_conversion(lkb);
2537
2538 error = do_convert(r, lkb);
2539 out:
2540 if (reply)
2541 send_convert_reply(r, lkb, error);
2542
2543 unlock_rsb(r);
2544 put_rsb(r);
2545 dlm_put_lkb(lkb);
2546 return;
2547
2548 fail:
2549 setup_stub_lkb(ls, ms);
2550 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2551}
2552
2553static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2554{
2555 struct dlm_lkb *lkb;
2556 struct dlm_rsb *r;
2557 int error;
2558
2559 error = find_lkb(ls, ms->m_remid, &lkb);
2560 if (error)
2561 goto fail;
2562
2563 r = lkb->lkb_resource;
2564
2565 hold_rsb(r);
2566 lock_rsb(r);
2567
2568 receive_flags(lkb, ms);
2569 error = receive_unlock_args(ls, lkb, ms);
2570 if (error)
2571 goto out;
2572
2573 error = do_unlock(r, lkb);
2574 out:
2575 send_unlock_reply(r, lkb, error);
2576
2577 unlock_rsb(r);
2578 put_rsb(r);
2579 dlm_put_lkb(lkb);
2580 return;
2581
2582 fail:
2583 setup_stub_lkb(ls, ms);
2584 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2585}
2586
2587static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2588{
2589 struct dlm_lkb *lkb;
2590 struct dlm_rsb *r;
2591 int error;
2592
2593 error = find_lkb(ls, ms->m_remid, &lkb);
2594 if (error)
2595 goto fail;
2596
2597 receive_flags(lkb, ms);
2598
2599 r = lkb->lkb_resource;
2600
2601 hold_rsb(r);
2602 lock_rsb(r);
2603
2604 error = do_cancel(r, lkb);
2605 send_cancel_reply(r, lkb, error);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610 return;
2611
2612 fail:
2613 setup_stub_lkb(ls, ms);
2614 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2615}
2616
2617static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2618{
2619 struct dlm_lkb *lkb;
2620 struct dlm_rsb *r;
2621 int error;
2622
2623 error = find_lkb(ls, ms->m_remid, &lkb);
2624 if (error) {
2625 log_error(ls, "receive_grant no lkb");
2626 return;
2627 }
2628 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2629
2630 r = lkb->lkb_resource;
2631
2632 hold_rsb(r);
2633 lock_rsb(r);
2634
2635 receive_flags_reply(lkb, ms);
2636 grant_lock_pc(r, lkb, ms);
2637 queue_cast(r, lkb, 0);
2638
2639 unlock_rsb(r);
2640 put_rsb(r);
2641 dlm_put_lkb(lkb);
2642}
2643
2644static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2645{
2646 struct dlm_lkb *lkb;
2647 struct dlm_rsb *r;
2648 int error;
2649
2650 error = find_lkb(ls, ms->m_remid, &lkb);
2651 if (error) {
2652 log_error(ls, "receive_bast no lkb");
2653 return;
2654 }
2655 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2656
2657 r = lkb->lkb_resource;
2658
2659 hold_rsb(r);
2660 lock_rsb(r);
2661
2662 queue_bast(r, lkb, ms->m_bastmode);
2663
2664 unlock_rsb(r);
2665 put_rsb(r);
2666 dlm_put_lkb(lkb);
2667}
2668
2669static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2670{
2671 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2672
2673 from_nodeid = ms->m_header.h_nodeid;
2674 our_nodeid = dlm_our_nodeid();
2675
2676 len = receive_extralen(ms);
2677
2678 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2679 if (dir_nodeid != our_nodeid) {
2680 log_error(ls, "lookup dir_nodeid %d from %d",
2681 dir_nodeid, from_nodeid);
2682 error = -EINVAL;
2683 ret_nodeid = -1;
2684 goto out;
2685 }
2686
2687 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2688
2689 /* Optimization: we're master so treat lookup as a request */
2690 if (!error && ret_nodeid == our_nodeid) {
2691 receive_request(ls, ms);
2692 return;
2693 }
2694 out:
2695 send_lookup_reply(ls, ms, ret_nodeid, error);
2696}
2697
2698static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2699{
2700 int len, dir_nodeid, from_nodeid;
2701
2702 from_nodeid = ms->m_header.h_nodeid;
2703
2704 len = receive_extralen(ms);
2705
2706 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2707 if (dir_nodeid != dlm_our_nodeid()) {
2708 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2709 dir_nodeid, from_nodeid);
2710 return;
2711 }
2712
2713 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2714}
2715
2716static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2717{
2718 struct dlm_lkb *lkb;
2719 struct dlm_rsb *r;
2720 int error, mstype;
2721
2722 error = find_lkb(ls, ms->m_remid, &lkb);
2723 if (error) {
2724 log_error(ls, "receive_request_reply no lkb");
2725 return;
2726 }
2727 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2728
2729 mstype = lkb->lkb_wait_type;
2730 error = remove_from_waiters(lkb);
2731 if (error) {
2732 log_error(ls, "receive_request_reply not on waiters");
2733 goto out;
2734 }
2735
2736 /* this is the value returned from do_request() on the master */
2737 error = ms->m_result;
2738
2739 r = lkb->lkb_resource;
2740 hold_rsb(r);
2741 lock_rsb(r);
2742
2743 /* Optimization: the dir node was also the master, so it took our
2744 lookup as a request and sent request reply instead of lookup reply */
2745 if (mstype == DLM_MSG_LOOKUP) {
2746 r->res_nodeid = ms->m_header.h_nodeid;
2747 lkb->lkb_nodeid = r->res_nodeid;
2748 }
2749
2750 switch (error) {
2751 case -EAGAIN:
2752 /* request would block (be queued) on remote master;
2753 the unhold undoes the original ref from create_lkb()
2754 so it leads to the lkb being freed */
2755 queue_cast(r, lkb, -EAGAIN);
2756 confirm_master(r, -EAGAIN);
2757 unhold_lkb(lkb);
2758 break;
2759
2760 case -EINPROGRESS:
2761 case 0:
2762 /* request was queued or granted on remote master */
2763 receive_flags_reply(lkb, ms);
2764 lkb->lkb_remid = ms->m_lkid;
2765 if (error)
2766 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2767 else {
2768 grant_lock_pc(r, lkb, ms);
2769 queue_cast(r, lkb, 0);
2770 }
2771 confirm_master(r, error);
2772 break;
2773
2774 case -EBADR:
2775 case -ENOTBLK:
2776 /* find_rsb failed to find rsb or rsb wasn't master */
2777 r->res_nodeid = -1;
2778 lkb->lkb_nodeid = -1;
2779 _request_lock(r, lkb);
2780 break;
2781
2782 default:
2783 log_error(ls, "receive_request_reply error %d", error);
2784 }
2785
2786 unlock_rsb(r);
2787 put_rsb(r);
2788 out:
2789 dlm_put_lkb(lkb);
2790}
2791
2792static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2793 struct dlm_message *ms)
2794{
2795 int error = ms->m_result;
2796
2797 /* this is the value returned from do_convert() on the master */
2798
2799 switch (error) {
2800 case -EAGAIN:
2801 /* convert would block (be queued) on remote master */
2802 queue_cast(r, lkb, -EAGAIN);
2803 break;
2804
2805 case -EINPROGRESS:
2806 /* convert was queued on remote master */
2807 del_lkb(r, lkb);
2808 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2809 break;
2810
2811 case 0:
2812 /* convert was granted on remote master */
2813 receive_flags_reply(lkb, ms);
2814 grant_lock_pc(r, lkb, ms);
2815 queue_cast(r, lkb, 0);
2816 break;
2817
2818 default:
2819 log_error(r->res_ls, "receive_convert_reply error %d", error);
2820 }
2821}
2822
2823static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2824{
2825 struct dlm_rsb *r = lkb->lkb_resource;
2826
2827 hold_rsb(r);
2828 lock_rsb(r);
2829
2830 __receive_convert_reply(r, lkb, ms);
2831
2832 unlock_rsb(r);
2833 put_rsb(r);
2834}
2835
2836static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2837{
2838 struct dlm_lkb *lkb;
2839 int error;
2840
2841 error = find_lkb(ls, ms->m_remid, &lkb);
2842 if (error) {
2843 log_error(ls, "receive_convert_reply no lkb");
2844 return;
2845 }
2846 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2847
2848 error = remove_from_waiters(lkb);
2849 if (error) {
2850 log_error(ls, "receive_convert_reply not on waiters");
2851 goto out;
2852 }
2853
2854 _receive_convert_reply(lkb, ms);
2855 out:
2856 dlm_put_lkb(lkb);
2857}
2858
2859static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2860{
2861 struct dlm_rsb *r = lkb->lkb_resource;
2862 int error = ms->m_result;
2863
2864 hold_rsb(r);
2865 lock_rsb(r);
2866
2867 /* this is the value returned from do_unlock() on the master */
2868
2869 switch (error) {
2870 case -DLM_EUNLOCK:
2871 receive_flags_reply(lkb, ms);
2872 remove_lock_pc(r, lkb);
2873 queue_cast(r, lkb, -DLM_EUNLOCK);
2874 break;
2875 default:
2876 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2877 }
2878
2879 unlock_rsb(r);
2880 put_rsb(r);
2881}
2882
2883static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2884{
2885 struct dlm_lkb *lkb;
2886 int error;
2887
2888 error = find_lkb(ls, ms->m_remid, &lkb);
2889 if (error) {
2890 log_error(ls, "receive_unlock_reply no lkb");
2891 return;
2892 }
2893 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2894
2895 error = remove_from_waiters(lkb);
2896 if (error) {
2897 log_error(ls, "receive_unlock_reply not on waiters");
2898 goto out;
2899 }
2900
2901 _receive_unlock_reply(lkb, ms);
2902 out:
2903 dlm_put_lkb(lkb);
2904}
2905
2906static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2907{
2908 struct dlm_rsb *r = lkb->lkb_resource;
2909 int error = ms->m_result;
2910
2911 hold_rsb(r);
2912 lock_rsb(r);
2913
2914 /* this is the value returned from do_cancel() on the master */
2915
2916 switch (error) {
2917 case -DLM_ECANCEL:
2918 receive_flags_reply(lkb, ms);
2919 revert_lock_pc(r, lkb);
2920 queue_cast(r, lkb, -DLM_ECANCEL);
2921 break;
2922 default:
2923 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2924 }
2925
2926 unlock_rsb(r);
2927 put_rsb(r);
2928}
2929
2930static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2931{
2932 struct dlm_lkb *lkb;
2933 int error;
2934
2935 error = find_lkb(ls, ms->m_remid, &lkb);
2936 if (error) {
2937 log_error(ls, "receive_cancel_reply no lkb");
2938 return;
2939 }
2940 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2941
2942 error = remove_from_waiters(lkb);
2943 if (error) {
2944 log_error(ls, "receive_cancel_reply not on waiters");
2945 goto out;
2946 }
2947
2948 _receive_cancel_reply(lkb, ms);
2949 out:
2950 dlm_put_lkb(lkb);
2951}
2952
2953static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2954{
2955 struct dlm_lkb *lkb;
2956 struct dlm_rsb *r;
2957 int error, ret_nodeid;
2958
2959 error = find_lkb(ls, ms->m_lkid, &lkb);
2960 if (error) {
2961 log_error(ls, "receive_lookup_reply no lkb");
2962 return;
2963 }
2964
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_lookup_reply not on waiters");
2968 goto out;
2969 }
2970
2971 /* this is the value returned by dlm_dir_lookup on dir node
2972 FIXME: will a non-zero error ever be returned? */
2973 error = ms->m_result;
2974
2975 r = lkb->lkb_resource;
2976 hold_rsb(r);
2977 lock_rsb(r);
2978
2979 ret_nodeid = ms->m_nodeid;
2980 if (ret_nodeid == dlm_our_nodeid()) {
2981 r->res_nodeid = 0;
2982 ret_nodeid = 0;
2983 r->res_first_lkid = 0;
2984 } else {
2985 /* set_master() will copy res_nodeid to lkb_nodeid */
2986 r->res_nodeid = ret_nodeid;
2987 }
2988
2989 _request_lock(r, lkb);
2990
2991 if (!ret_nodeid)
2992 process_lookup_list(r);
2993
2994 unlock_rsb(r);
2995 put_rsb(r);
2996 out:
2997 dlm_put_lkb(lkb);
2998}
2999
3000int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3001{
3002 struct dlm_message *ms = (struct dlm_message *) hd;
3003 struct dlm_ls *ls;
3004 int error;
3005
3006 if (!recovery)
3007 dlm_message_in(ms);
3008
3009 ls = dlm_find_lockspace_global(hd->h_lockspace);
3010 if (!ls) {
3011 log_print("drop message %d from %d for unknown lockspace %d",
3012 ms->m_type, nodeid, hd->h_lockspace);
3013 return -EINVAL;
3014 }
3015
3016 /* recovery may have just ended leaving a bunch of backed-up requests
3017 in the requestqueue; wait while dlm_recoverd clears them */
3018
3019 if (!recovery)
3020 dlm_wait_requestqueue(ls);
3021
3022 /* recovery may have just started while there were a bunch of
3023 in-flight requests -- save them in requestqueue to be processed
3024 after recovery. we can't let dlm_recvd block on the recovery
3025 lock. if dlm_recoverd is calling this function to clear the
3026 requestqueue, it needs to be interrupted (-EINTR) if another
3027 recovery operation is starting. */
3028
3029 while (1) {
3030 if (dlm_locking_stopped(ls)) {
3031 if (!recovery)
3032 dlm_add_requestqueue(ls, nodeid, hd);
3033 error = -EINTR;
3034 goto out;
3035 }
3036
3037 if (lock_recovery_try(ls))
3038 break;
3039 schedule();
3040 }
3041
3042 switch (ms->m_type) {
3043
3044 /* messages sent to a master node */
3045
3046 case DLM_MSG_REQUEST:
3047 receive_request(ls, ms);
3048 break;
3049
3050 case DLM_MSG_CONVERT:
3051 receive_convert(ls, ms);
3052 break;
3053
3054 case DLM_MSG_UNLOCK:
3055 receive_unlock(ls, ms);
3056 break;
3057
3058 case DLM_MSG_CANCEL:
3059 receive_cancel(ls, ms);
3060 break;
3061
3062 /* messages sent from a master node (replies to above) */
3063
3064 case DLM_MSG_REQUEST_REPLY:
3065 receive_request_reply(ls, ms);
3066 break;
3067
3068 case DLM_MSG_CONVERT_REPLY:
3069 receive_convert_reply(ls, ms);
3070 break;
3071
3072 case DLM_MSG_UNLOCK_REPLY:
3073 receive_unlock_reply(ls, ms);
3074 break;
3075
3076 case DLM_MSG_CANCEL_REPLY:
3077 receive_cancel_reply(ls, ms);
3078 break;
3079
3080 /* messages sent from a master node (only two types of async msg) */
3081
3082 case DLM_MSG_GRANT:
3083 receive_grant(ls, ms);
3084 break;
3085
3086 case DLM_MSG_BAST:
3087 receive_bast(ls, ms);
3088 break;
3089
3090 /* messages sent to a dir node */
3091
3092 case DLM_MSG_LOOKUP:
3093 receive_lookup(ls, ms);
3094 break;
3095
3096 case DLM_MSG_REMOVE:
3097 receive_remove(ls, ms);
3098 break;
3099
3100 /* messages sent from a dir node (remove has no reply) */
3101
3102 case DLM_MSG_LOOKUP_REPLY:
3103 receive_lookup_reply(ls, ms);
3104 break;
3105
3106 default:
3107 log_error(ls, "unknown message type %d", ms->m_type);
3108 }
3109
3110 unlock_recovery(ls);
3111 out:
3112 dlm_put_lockspace(ls);
3113 dlm_astd_wake();
3114 return 0;
3115}
3116
3117
3118/*
3119 * Recovery related
3120 */
3121
3122static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3123{
3124 if (middle_conversion(lkb)) {
3125 hold_lkb(lkb);
3126 ls->ls_stub_ms.m_result = -EINPROGRESS;
3127 _remove_from_waiters(lkb);
3128 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3129
3130 /* Same special case as in receive_rcom_lock_args() */
3131 lkb->lkb_grmode = DLM_LOCK_IV;
3132 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3133 unhold_lkb(lkb);
3134
3135 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3136 lkb->lkb_flags |= DLM_IFL_RESEND;
3137 }
3138
3139 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3140 conversions are async; there's no reply from the remote master */
3141}
3142
3143/* A waiting lkb needs recovery if the master node has failed, or
3144 the master node is changing (only when no directory is used) */
3145
3146static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3147{
3148 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3149 return 1;
3150
3151 if (!dlm_no_directory(ls))
3152 return 0;
3153
3154 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3155 return 1;
3156
3157 return 0;
3158}
3159
3160/* Recovery for locks that are waiting for replies from nodes that are now
3161 gone. We can just complete unlocks and cancels by faking a reply from the
3162 dead node. Requests and up-conversions we flag to be resent after
3163 recovery. Down-conversions can just be completed with a fake reply like
3164 unlocks. Conversions between PR and CW need special attention. */
3165
3166void dlm_recover_waiters_pre(struct dlm_ls *ls)
3167{
3168 struct dlm_lkb *lkb, *safe;
3169
3170 mutex_lock(&ls->ls_waiters_mutex);
3171
3172 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3173 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3174 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3175
3176 /* all outstanding lookups, regardless of destination will be
3177 resent after recovery is done */
3178
3179 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3180 lkb->lkb_flags |= DLM_IFL_RESEND;
3181 continue;
3182 }
3183
3184 if (!waiter_needs_recovery(ls, lkb))
3185 continue;
3186
3187 switch (lkb->lkb_wait_type) {
3188
3189 case DLM_MSG_REQUEST:
3190 lkb->lkb_flags |= DLM_IFL_RESEND;
3191 break;
3192
3193 case DLM_MSG_CONVERT:
3194 recover_convert_waiter(ls, lkb);
3195 break;
3196
3197 case DLM_MSG_UNLOCK:
3198 hold_lkb(lkb);
3199 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3200 _remove_from_waiters(lkb);
3201 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3202 dlm_put_lkb(lkb);
3203 break;
3204
3205 case DLM_MSG_CANCEL:
3206 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3208 _remove_from_waiters(lkb);
3209 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb);
3211 break;
3212
3213 default:
3214 log_error(ls, "invalid lkb wait_type %d",
3215 lkb->lkb_wait_type);
3216 }
3217 schedule();
3218 }
3219 mutex_unlock(&ls->ls_waiters_mutex);
3220}
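
/* Illustration (editor's note): for an UNLOCK or CANCEL whose master has
   died, the loop above fakes the reply the dead node would have sent: it
   fills ls_stub_ms.m_result with -DLM_EUNLOCK or -DLM_ECANCEL and feeds
   the stub message to _receive_unlock_reply() or _receive_cancel_reply(),
   exactly as if it had arrived off the wire.  Requests and up-conversions
   cannot be completed this way, so they are flagged DLM_IFL_RESEND and
   reissued by dlm_recover_waiters_post(). */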
3221
3222static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3223{
3224 struct dlm_lkb *lkb;
3225 int rv = 0;
3226
3227 mutex_lock(&ls->ls_waiters_mutex);
3228 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3229 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3230 rv = lkb->lkb_wait_type;
3231 _remove_from_waiters(lkb);
3232 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3233 break;
3234 }
3235 }
3236 mutex_unlock(&ls->ls_waiters_mutex);
3237
3238 if (!rv)
3239 lkb = NULL;
3240 *lkb_ret = lkb;
3241 return rv;
3242}
3243
3244/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3245 master or dir-node for r. Processing the lkb may result in it being placed
3246 back on waiters. */
3247
3248int dlm_recover_waiters_post(struct dlm_ls *ls)
3249{
3250 struct dlm_lkb *lkb;
3251 struct dlm_rsb *r;
3252 int error = 0, mstype;
3253
3254 while (1) {
3255 if (dlm_locking_stopped(ls)) {
3256 log_debug(ls, "recover_waiters_post aborted");
3257 error = -EINTR;
3258 break;
3259 }
3260
3261 mstype = remove_resend_waiter(ls, &lkb);
3262 if (!mstype)
3263 break;
3264
3265 r = lkb->lkb_resource;
3266
3267 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3268 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3269
3270 switch (mstype) {
3271
3272 case DLM_MSG_LOOKUP:
3273 hold_rsb(r);
3274 lock_rsb(r);
3275 _request_lock(r, lkb);
3276 if (is_master(r))
3277 confirm_master(r, 0);
3278 unlock_rsb(r);
3279 put_rsb(r);
3280 break;
3281
3282 case DLM_MSG_REQUEST:
3283 hold_rsb(r);
3284 lock_rsb(r);
3285 _request_lock(r, lkb);
3286 if (is_master(r))
3287 confirm_master(r, 0);
3288 unlock_rsb(r);
3289 put_rsb(r);
3290 break;
3291
3292 case DLM_MSG_CONVERT:
3293 hold_rsb(r);
3294 lock_rsb(r);
3295 _convert_lock(r, lkb);
3296 unlock_rsb(r);
3297 put_rsb(r);
3298 break;
3299
3300 default:
3301 log_error(ls, "recover_waiters_post type %d", mstype);
3302 }
3303 }
3304
3305 return error;
3306}
3307
3308static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3309 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3310{
3311 struct dlm_ls *ls = r->res_ls;
3312 struct dlm_lkb *lkb, *safe;
3313
3314 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3315 if (test(ls, lkb)) {
3316 rsb_set_flag(r, RSB_LOCKS_PURGED);
3317 del_lkb(r, lkb);
3318 /* this put should free the lkb */
3319 if (!dlm_put_lkb(lkb))
3320 log_error(ls, "purged lkb not released");
3321 }
3322 }
3323}
3324
3325static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3326{
3327 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3328}
3329
3330static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3331{
3332 return is_master_copy(lkb);
3333}
3334
3335static void purge_dead_locks(struct dlm_rsb *r)
3336{
3337 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3338 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3339 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3340}
3341
3342void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3343{
3344 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3345 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3346 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3347}
3348
3349/* Get rid of locks held by nodes that are gone. */
3350
3351int dlm_purge_locks(struct dlm_ls *ls)
3352{
3353 struct dlm_rsb *r;
3354
3355 log_debug(ls, "dlm_purge_locks");
3356
3357 down_write(&ls->ls_root_sem);
3358 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3359 hold_rsb(r);
3360 lock_rsb(r);
3361 if (is_master(r))
3362 purge_dead_locks(r);
3363 unlock_rsb(r);
3364 unhold_rsb(r);
3365
3366 schedule();
3367 }
3368 up_write(&ls->ls_root_sem);
3369
3370 return 0;
3371}
3372
3373static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3374{
3375 struct dlm_rsb *r, *r_ret = NULL;
3376
3377 read_lock(&ls->ls_rsbtbl[bucket].lock);
3378 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3379 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3380 continue;
3381 hold_rsb(r);
3382 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3383 r_ret = r;
3384 break;
3385 }
3386 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3387 return r_ret;
3388}
3389
3390void dlm_grant_after_purge(struct dlm_ls *ls)
3391{
3392 struct dlm_rsb *r;
3393 int bucket = 0;
3394
3395 while (1) {
3396 r = find_purged_rsb(ls, bucket);
3397 if (!r) {
3398 if (bucket == ls->ls_rsbtbl_size - 1)
3399 break;
3400 bucket++;
3401 continue;
3402 }
3403 lock_rsb(r);
3404 if (is_master(r)) {
3405 grant_pending_locks(r);
3406 confirm_master(r, 0);
3407 }
3408 unlock_rsb(r);
3409 put_rsb(r);
3410 schedule();
3411 }
3412}
3413
3414static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3415 uint32_t remid)
3416{
3417 struct dlm_lkb *lkb;
3418
3419 list_for_each_entry(lkb, head, lkb_statequeue) {
3420 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3421 return lkb;
3422 }
3423 return NULL;
3424}
3425
3426static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3427 uint32_t remid)
3428{
3429 struct dlm_lkb *lkb;
3430
3431 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3432 if (lkb)
3433 return lkb;
3434 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3435 if (lkb)
3436 return lkb;
3437 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3438 if (lkb)
3439 return lkb;
3440 return NULL;
3441}
3442
3443static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3444 struct dlm_rsb *r, struct dlm_rcom *rc)
3445{
3446 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3447 int lvblen;
3448
3449 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3450 lkb->lkb_ownpid = rl->rl_ownpid;
3451 lkb->lkb_remid = rl->rl_lkid;
3452 lkb->lkb_exflags = rl->rl_exflags;
3453 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3454 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3455 lkb->lkb_lvbseq = rl->rl_lvbseq;
3456 lkb->lkb_rqmode = rl->rl_rqmode;
3457 lkb->lkb_grmode = rl->rl_grmode;
3458	/* don't set lkb_status because add_lkb wants to set it itself */
3459
3460 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3461 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3462
3463 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3464 lkb->lkb_lvbptr = allocate_lvb(ls);
3465 if (!lkb->lkb_lvbptr)
3466 return -ENOMEM;
3467 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3468 sizeof(struct rcom_lock);
3469 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3470 }
3471
3472 /* Conversions between PR and CW (middle modes) need special handling.
3473 The real granted mode of these converting locks cannot be determined
3474 until all locks have been rebuilt on the rsb (recover_conversion) */
3475
3476 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3477 rl->rl_status = DLM_LKSTS_CONVERT;
3478 lkb->lkb_grmode = DLM_LOCK_IV;
3479 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3480 }
3481
3482 return 0;
3483}
3484
3485/* This lkb may have been recovered in a previous aborted recovery so we need
3486 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3487 If so we just send back a standard reply. If not, we create a new lkb with
3488 the given values and send back our lkid. We send back our lkid by sending
3489 back the rcom_lock struct we got but with the remid field filled in. */
3490
3491int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3492{
3493 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3494 struct dlm_rsb *r;
3495 struct dlm_lkb *lkb;
3496 int error;
3497
3498 if (rl->rl_parent_lkid) {
3499 error = -EOPNOTSUPP;
3500 goto out;
3501 }
3502
3503 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3504 if (error)
3505 goto out;
3506
3507 lock_rsb(r);
3508
3509 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3510 if (lkb) {
3511 error = -EEXIST;
3512 goto out_remid;
3513 }
3514
3515 error = create_lkb(ls, &lkb);
3516 if (error)
3517 goto out_unlock;
3518
3519 error = receive_rcom_lock_args(ls, lkb, r, rc);
3520 if (error) {
3521 __put_lkb(ls, lkb);
3522 goto out_unlock;
3523 }
3524
3525 attach_lkb(r, lkb);
3526 add_lkb(r, lkb, rl->rl_status);
3527 error = 0;
3528
3529 out_remid:
3530 /* this is the new value returned to the lock holder for
3531 saving in its process-copy lkb */
3532 rl->rl_remid = lkb->lkb_id;
3533
3534 out_unlock:
3535 unlock_rsb(r);
3536 put_rsb(r);
3537 out:
3538 if (error)
3539 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3540 rl->rl_result = error;
3541 return error;
3542}
3543
3544int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3545{
3546 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3547 struct dlm_rsb *r;
3548 struct dlm_lkb *lkb;
3549 int error;
3550
3551 error = find_lkb(ls, rl->rl_lkid, &lkb);
3552 if (error) {
3553 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3554 return error;
3555 }
3556
3557 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3558
3559 error = rl->rl_result;
3560
3561 r = lkb->lkb_resource;
3562 hold_rsb(r);
3563 lock_rsb(r);
3564
3565 switch (error) {
3566 case -EEXIST:
3567 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3568 /* fall through */
3569 case 0:
3570 lkb->lkb_remid = rl->rl_remid;
3571 break;
3572 default:
3573 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3574 error, lkb->lkb_id);
3575 }
3576
3577 /* an ack for dlm_recover_locks() which waits for replies from
3578 all the locks it sends to new masters */
3579 dlm_recovered_lock(r);
3580
3581 unlock_rsb(r);
3582 put_rsb(r);
3583 dlm_put_lkb(lkb);
3584
3585 return 0;
3586}
3587
3588int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3589 int mode, uint32_t flags, void *name, unsigned int namelen,
3590 uint32_t parent_lkid)
3591{
3592 struct dlm_lkb *lkb;
3593 struct dlm_args args;
3594 int error;
3595
3596 lock_recovery(ls);
3597
3598 error = create_lkb(ls, &lkb);
3599 if (error) {
3600 kfree(ua);
3601 goto out;
3602 }
3603
3604 if (flags & DLM_LKF_VALBLK) {
3605 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3606 if (!ua->lksb.sb_lvbptr) {
3607 kfree(ua);
3608 __put_lkb(ls, lkb);
3609 error = -ENOMEM;
3610 goto out;
3611 }
3612 }
3613
3614 /* After ua is attached to lkb it will be freed by free_lkb().
3615 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3616 lock and that lkb_astparam is the dlm_user_args structure. */
3617
3618 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3619 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
3620 lkb->lkb_flags |= DLM_IFL_USER;
3621 ua->old_mode = DLM_LOCK_IV;
3622
3623 if (error) {
3624 __put_lkb(ls, lkb);
3625 goto out;
3626 }
3627
3628 error = request_lock(ls, lkb, name, namelen, &args);
3629
3630 switch (error) {
3631 case 0:
3632 break;
3633 case -EINPROGRESS:
3634 error = 0;
3635 break;
3636 case -EAGAIN:
3637 error = 0;
3638 /* fall through */
3639 default:
3640 __put_lkb(ls, lkb);
3641 goto out;
3642 }
3643
3644 /* add this new lkb to the per-process list of locks */
3645 spin_lock(&ua->proc->locks_spin);
3646 kref_get(&lkb->lkb_ref);
3647 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3648 spin_unlock(&ua->proc->locks_spin);
3649 out:
3650 unlock_recovery(ls);
3651 return error;
3652}
3653
3654int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3655 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3656{
3657 struct dlm_lkb *lkb;
3658 struct dlm_args args;
3659 struct dlm_user_args *ua;
3660 int error;
3661
3662 lock_recovery(ls);
3663
3664 error = find_lkb(ls, lkid, &lkb);
3665 if (error)
3666 goto out;
3667
3668 /* user can change the params on its lock when it converts it, or
3669 add an lvb that didn't exist before */
3670
3671 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3672
3673 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3674 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3675 if (!ua->lksb.sb_lvbptr) {
3676 error = -ENOMEM;
3677 goto out_put;
3678 }
3679 }
3680 if (lvb_in && ua->lksb.sb_lvbptr)
3681 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3682
3683 ua->castparam = ua_tmp->castparam;
3684 ua->castaddr = ua_tmp->castaddr;
3685 ua->bastparam = ua_tmp->bastparam;
3686 ua->bastaddr = ua_tmp->bastaddr;
3687 ua->user_lksb = ua_tmp->user_lksb;
3688 ua->old_mode = lkb->lkb_grmode;
3689
3690 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
3691 ua, DLM_FAKE_USER_AST, &args);
3692 if (error)
3693 goto out_put;
3694
3695 error = convert_lock(ls, lkb, &args);
3696
3697 if (error == -EINPROGRESS || error == -EAGAIN)
3698 error = 0;
3699 out_put:
3700 dlm_put_lkb(lkb);
3701 out:
3702 unlock_recovery(ls);
3703 kfree(ua_tmp);
3704 return error;
3705}
3706
3707int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3708 uint32_t flags, uint32_t lkid, char *lvb_in)
3709{
3710 struct dlm_lkb *lkb;
3711 struct dlm_args args;
3712 struct dlm_user_args *ua;
3713 int error;
3714
3715 lock_recovery(ls);
3716
3717 error = find_lkb(ls, lkid, &lkb);
3718 if (error)
3719 goto out;
3720
3721 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3722
3723 if (lvb_in && ua->lksb.sb_lvbptr)
3724 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3725 ua->castparam = ua_tmp->castparam;
3726 ua->user_lksb = ua_tmp->user_lksb;
3727
3728 error = set_unlock_args(flags, ua, &args);
3729 if (error)
3730 goto out_put;
3731
3732 error = unlock_lock(ls, lkb, &args);
3733
3734 if (error == -DLM_EUNLOCK)
3735 error = 0;
3736 if (error)
3737 goto out_put;
3738
3739 spin_lock(&ua->proc->locks_spin);
3740 list_del_init(&lkb->lkb_ownqueue);
3741 spin_unlock(&ua->proc->locks_spin);
3742
3743 /* this removes the reference for the proc->locks list added by
3744 dlm_user_request */
3745 unhold_lkb(lkb);
3746 out_put:
3747 dlm_put_lkb(lkb);
3748 out:
3749 unlock_recovery(ls);
3750 return error;
3751}
3752
3753int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3754 uint32_t flags, uint32_t lkid)
3755{
3756 struct dlm_lkb *lkb;
3757 struct dlm_args args;
3758 struct dlm_user_args *ua;
3759 int error;
3760
3761 lock_recovery(ls);
3762
3763 error = find_lkb(ls, lkid, &lkb);
3764 if (error)
3765 goto out;
3766
3767 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3768 ua->castparam = ua_tmp->castparam;
3769 ua->user_lksb = ua_tmp->user_lksb;
3770
3771 error = set_unlock_args(flags, ua, &args);
3772 if (error)
3773 goto out_put;
3774
3775 error = cancel_lock(ls, lkb, &args);
3776
3777 if (error == -DLM_ECANCEL)
3778 error = 0;
3779 if (error)
3780 goto out_put;
3781
3782 /* this lkb was removed from the WAITING queue */
3783 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3784 spin_lock(&ua->proc->locks_spin);
3785 list_del_init(&lkb->lkb_ownqueue);
3786 spin_unlock(&ua->proc->locks_spin);
3787 unhold_lkb(lkb);
3788 }
3789 out_put:
3790 dlm_put_lkb(lkb);
3791 out:
3792 unlock_recovery(ls);
3793 return error;
3794}
3795
3796static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3797{
3798 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3799
3800 if (ua->lksb.sb_lvbptr)
3801 kfree(ua->lksb.sb_lvbptr);
3802 kfree(ua);
3803 lkb->lkb_astparam = (long)NULL;
3804
3805	/* TODO: propagate to master if needed */
3806 return 0;
3807}
3808
3809/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3810 Regardless of what rsb queue the lock is on, it's removed and freed. */
3811
3812static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3813{
3814 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3815 struct dlm_args args;
3816 int error;
3817
3818 /* FIXME: we need to handle the case where the lkb is in limbo
3819 while the rsb is being looked up, currently we assert in
3820 _unlock_lock/is_remote because rsb nodeid is -1. */
3821
3822 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3823
3824 error = unlock_lock(ls, lkb, &args);
3825 if (error == -DLM_EUNLOCK)
3826 error = 0;
3827 return error;
3828}
3829
3830/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3831 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3832 which we clear here. */
3833
3834/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3835   list, and no more device_writes should add lkbs to proc->locks list; so we
3836   shouldn't need to take asts_spin or locks_spin here. This assumes that
3837   device reads/writes/closes are serialized -- FIXME: we may need to serialize
3838   them ourselves. */
3839
3840void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3841{
3842 struct dlm_lkb *lkb, *safe;
3843
3844 lock_recovery(ls);
3845 mutex_lock(&ls->ls_clear_proc_locks);
3846
3847 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3848 if (lkb->lkb_ast_type) {
3849 list_del(&lkb->lkb_astqueue);
3850 unhold_lkb(lkb);
3851 }
3852
3853 list_del_init(&lkb->lkb_ownqueue);
3854
3855 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3856 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3857 orphan_proc_lock(ls, lkb);
3858 } else {
3859 lkb->lkb_flags |= DLM_IFL_DEAD;
3860 unlock_proc_lock(ls, lkb);
3861 }
3862
3863 /* this removes the reference for the proc->locks list
3864 added by dlm_user_request, it may result in the lkb
3865 being freed */
3866
3867 dlm_put_lkb(lkb);
3868 }
3869 mutex_unlock(&ls->ls_clear_proc_locks);
3870 unlock_recovery(ls);
3871}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
20int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret);
23void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls);
27
28int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
30void dlm_grant_after_purge(struct dlm_ls *ls);
31int dlm_recover_waiters_post(struct dlm_ls *ls);
32void dlm_recover_waiters_pre(struct dlm_ls *ls);
33int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45
46static inline int is_master(struct dlm_rsb *r)
47{
48 return !r->res_nodeid;
49}
50
51static inline void lock_rsb(struct dlm_rsb *r)
52{
53 mutex_lock(&r->res_mutex);
54}
55
56static inline void unlock_rsb(struct dlm_rsb *r)
57{
58 mutex_unlock(&r->res_mutex);
59}
60
61#endif
62
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct *scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
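/* Illustrative note (a sketch of the userspace side, not code from this
   patch): cluster management drives recovery by writing "0" or "1" to
   /sys/kernel/dlm/<lockspace>/control, which reaches dlm_ls_stop() or
   dlm_ls_start() via the handler above; the path assumes the "dlm" kset
   registered below under kernel_subsys. */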
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return snprintf(buf, PAGE_SIZE, "%x\n", status);
82}
83
84static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
85{
86 return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
87}
88
89struct dlm_attr {
90 struct attribute attr;
91 ssize_t (*show)(struct dlm_ls *, char *);
92 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
93};
94
95static struct dlm_attr dlm_attr_control = {
96 .attr = {.name = "control", .mode = S_IWUSR},
97 .store = dlm_control_store
98};
99
100static struct dlm_attr dlm_attr_event = {
101 .attr = {.name = "event_done", .mode = S_IWUSR},
102 .store = dlm_event_store
103};
104
105static struct dlm_attr dlm_attr_id = {
106 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
107 .show = dlm_id_show,
108 .store = dlm_id_store
109};
110
111static struct dlm_attr dlm_attr_recover_status = {
112 .attr = {.name = "recover_status", .mode = S_IRUGO},
113 .show = dlm_recover_status_show
114};
115
116static struct dlm_attr dlm_attr_recover_nodeid = {
117 .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
118 .show = dlm_recover_nodeid_show
119};
120
121static struct attribute *dlm_attrs[] = {
122 &dlm_attr_control.attr,
123 &dlm_attr_event.attr,
124 &dlm_attr_id.attr,
125 &dlm_attr_recover_status.attr,
126 &dlm_attr_recover_nodeid.attr,
127 NULL,
128};
129
130static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
131 char *buf)
132{
133 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
134 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
135 return a->show ? a->show(ls, buf) : 0;
136}
137
138static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
139 const char *buf, size_t len)
140{
141 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
142 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
143 return a->store ? a->store(ls, buf, len) : len;
144}
145
146static struct sysfs_ops dlm_attr_ops = {
147 .show = dlm_attr_show,
148 .store = dlm_attr_store,
149};
150
151static struct kobj_type dlm_ktype = {
152 .default_attrs = dlm_attrs,
153 .sysfs_ops = &dlm_attr_ops,
154};
155
156static struct kset dlm_kset = {
157 .subsys = &kernel_subsys,
158 .kobj = {.name = "dlm",},
159 .ktype = &dlm_ktype,
160};
161
162static int kobject_setup(struct dlm_ls *ls)
163{
164 char lsname[DLM_LOCKSPACE_LEN];
165 int error;
166
167 memset(lsname, 0, DLM_LOCKSPACE_LEN);
168 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
169
170 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
171 if (error)
172 return error;
173
174 ls->ls_kobj.kset = &dlm_kset;
175 ls->ls_kobj.ktype = &dlm_ktype;
176 return 0;
177}
178
179static int do_uevent(struct dlm_ls *ls, int in)
180{
181 int error;
182
183 if (in)
184 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
185 else
186 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
187
188 error = wait_event_interruptible(ls->ls_uevent_wait,
189 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
190 if (error)
191 goto out;
192
193 error = ls->ls_uevent_result;
194 out:
195 return error;
196}
197
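/* do_uevent() pairs with dlm_event_store() above: the ONLINE/OFFLINE uevent
   asks userspace to join or leave the lockspace, and userspace reports back
   by writing the result to the "event_done" attribute, which sets
   LSFL_UEVENT_WAIT and wakes the wait_event_interruptible() above. */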
198
199int dlm_lockspace_init(void)
200{
201 int error;
202
203 ls_count = 0;
204 mutex_init(&ls_lock);
205 INIT_LIST_HEAD(&lslist);
206 spin_lock_init(&lslist_lock);
207
208 error = kset_register(&dlm_kset);
209 if (error)
210 printk("dlm_lockspace_init: cannot register kset %d\n", error);
211 return error;
212}
213
214void dlm_lockspace_exit(void)
215{
216 kset_unregister(&dlm_kset);
217}
218
219static int dlm_scand(void *data)
220{
221 struct dlm_ls *ls;
222
223 while (!kthread_should_stop()) {
224 list_for_each_entry(ls, &lslist, ls_list)
225 dlm_scan_rsbs(ls);
226 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
227 }
228 return 0;
229}
230
231static int dlm_scand_start(void)
232{
233 struct task_struct *p;
234 int error = 0;
235
236 p = kthread_run(dlm_scand, NULL, "dlm_scand");
237 if (IS_ERR(p))
238 error = PTR_ERR(p);
239 else
240 scand_task = p;
241 return error;
242}
243
244static void dlm_scand_stop(void)
245{
246 kthread_stop(scand_task);
247}
248
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{
268 struct dlm_ls *ls;
269
270 spin_lock(&lslist_lock);
271
272 list_for_each_entry(ls, &lslist, ls_list) {
273 if (ls->ls_global_id == id) {
274 ls->ls_count++;
275 goto out;
276 }
277 }
278 ls = NULL;
279 out:
280 spin_unlock(&lslist_lock);
281 return ls;
282}
283
284struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
285{
286 struct dlm_ls *ls;
287
288 spin_lock(&lslist_lock);
289 list_for_each_entry(ls, &lslist, ls_list) {
290 if (ls->ls_local_handle == lockspace) {
291 ls->ls_count++;
292 goto out;
293 }
294 }
295 ls = NULL;
296 out:
297 spin_unlock(&lslist_lock);
298 return ls;
299}
300
301struct dlm_ls *dlm_find_lockspace_device(int minor)
302{
303 struct dlm_ls *ls;
304
305 spin_lock(&lslist_lock);
306 list_for_each_entry(ls, &lslist, ls_list) {
307 if (ls->ls_device.minor == minor) {
308 ls->ls_count++;
309 goto out;
310 }
311 }
312 ls = NULL;
313 out:
314 spin_unlock(&lslist_lock);
315 return ls;
316}
317
318void dlm_put_lockspace(struct dlm_ls *ls)
319{
320 spin_lock(&lslist_lock);
321 ls->ls_count--;
322 spin_unlock(&lslist_lock);
323}
324
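/* Each dlm_find_lockspace_*() above takes a reference by bumping ls_count;
   dlm_put_lockspace() drops it. remove_lockspace() below polls until all
   references are gone before unlinking the lockspace from lslist. */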
325static void remove_lockspace(struct dlm_ls *ls)
326{
327 for (;;) {
328 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) {
330 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock);
332 return;
333 }
334 spin_unlock(&lslist_lock);
335 ssleep(1);
336 }
337}
338
339static int threads_start(void)
340{
341 int error;
342
343	/* Thread which processes lock requests for all lockspaces */
344 error = dlm_astd_start();
345 if (error) {
346 log_print("cannot start dlm_astd thread %d", error);
347 goto fail;
348 }
349
350 error = dlm_scand_start();
351 if (error) {
352 log_print("cannot start dlm_scand thread %d", error);
353 goto astd_fail;
354 }
355
356	/* Thread for sending/receiving messages for all lockspaces */
357 error = dlm_lowcomms_start();
358 if (error) {
359 log_print("cannot start dlm lowcomms %d", error);
360 goto scand_fail;
361 }
362
363 return 0;
364
365 scand_fail:
366 dlm_scand_stop();
367 astd_fail:
368 dlm_astd_stop();
369 fail:
370 return error;
371}
372
373static void threads_stop(void)
374{
375 dlm_scand_stop();
376 dlm_lowcomms_stop();
377 dlm_astd_stop();
378}
379
380static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen)
382{
383 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM;
385
386 if (namelen > DLM_LOCKSPACE_LEN)
387 return -EINVAL;
388
389 if (!lvblen || (lvblen % 8))
390 return -EINVAL;
391
392 if (!try_module_get(THIS_MODULE))
393 return -EINVAL;
394
395 ls = dlm_find_lockspace_name(name, namelen);
396 if (ls) {
397 *lockspace = ls;
398 module_put(THIS_MODULE);
399 return -EEXIST;
400 }
401
402 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
403 if (!ls)
404 goto out;
405 memcpy(ls->ls_name, name, namelen);
406 ls->ls_namelen = namelen;
407 ls->ls_exflags = flags;
408 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0;
410 ls->ls_flags = 0;
411
412 size = dlm_config.rsbtbl_size;
413 ls->ls_rsbtbl_size = size;
414
415 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
416 if (!ls->ls_rsbtbl)
417 goto out_lsfree;
418 for (i = 0; i < size; i++) {
419 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
420 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
421 rwlock_init(&ls->ls_rsbtbl[i].lock);
422 }
423
424 size = dlm_config.lkbtbl_size;
425 ls->ls_lkbtbl_size = size;
426
427 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
428 if (!ls->ls_lkbtbl)
429 goto out_rsbfree;
430 for (i = 0; i < size; i++) {
431 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
432 rwlock_init(&ls->ls_lkbtbl[i].lock);
433 ls->ls_lkbtbl[i].counter = 1;
434 }
435
436 size = dlm_config.dirtbl_size;
437 ls->ls_dirtbl_size = size;
438
439 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
440 if (!ls->ls_dirtbl)
441 goto out_lkbfree;
442 for (i = 0; i < size; i++) {
443 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
444 rwlock_init(&ls->ls_dirtbl[i].lock);
445 }
446
447 INIT_LIST_HEAD(&ls->ls_waiters);
448 mutex_init(&ls->ls_waiters_mutex);
449
450 INIT_LIST_HEAD(&ls->ls_nodes);
451 INIT_LIST_HEAD(&ls->ls_nodes_gone);
452 ls->ls_num_nodes = 0;
453 ls->ls_low_nodeid = 0;
454 ls->ls_total_weight = 0;
455 ls->ls_node_array = NULL;
456
457 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
458 ls->ls_stub_rsb.res_ls = ls;
459
460 ls->ls_debug_rsb_dentry = NULL;
461 ls->ls_debug_waiters_dentry = NULL;
462
463 init_waitqueue_head(&ls->ls_uevent_wait);
464 ls->ls_uevent_result = 0;
465
466 ls->ls_recoverd_task = NULL;
467 mutex_init(&ls->ls_recoverd_active);
468 spin_lock_init(&ls->ls_recover_lock);
469 ls->ls_recover_status = 0;
470 ls->ls_recover_seq = 0;
471 ls->ls_recover_args = NULL;
472 init_rwsem(&ls->ls_in_recovery);
473 INIT_LIST_HEAD(&ls->ls_requestqueue);
474 mutex_init(&ls->ls_requestqueue_mutex);
475 mutex_init(&ls->ls_clear_proc_locks);
476
477 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
478 if (!ls->ls_recover_buf)
479 goto out_dirfree;
480
481 INIT_LIST_HEAD(&ls->ls_recover_list);
482 spin_lock_init(&ls->ls_recover_list_lock);
483 ls->ls_recover_list_count = 0;
484 ls->ls_local_handle = ls;
485 init_waitqueue_head(&ls->ls_wait_general);
486 INIT_LIST_HEAD(&ls->ls_root_list);
487 init_rwsem(&ls->ls_root_sem);
488
489 down_write(&ls->ls_in_recovery);
490
491 spin_lock(&lslist_lock);
492 list_add(&ls->ls_list, &lslist);
493 spin_unlock(&lslist_lock);
494
495 /* needs to find ls in lslist */
496 error = dlm_recoverd_start(ls);
497 if (error) {
498 log_error(ls, "can't start dlm_recoverd %d", error);
499 goto out_rcomfree;
500 }
501
502 dlm_create_debug_file(ls);
503
504 error = kobject_setup(ls);
505 if (error)
506 goto out_del;
507
508 error = kobject_register(&ls->ls_kobj);
509 if (error)
510 goto out_del;
511
512 error = do_uevent(ls, 1);
513 if (error)
514 goto out_unreg;
515
516 *lockspace = ls;
517 return 0;
518
519 out_unreg:
520 kobject_unregister(&ls->ls_kobj);
521 out_del:
522 dlm_delete_debug_file(ls);
523 dlm_recoverd_stop(ls);
524 out_rcomfree:
525 spin_lock(&lslist_lock);
526 list_del(&ls->ls_list);
527 spin_unlock(&lslist_lock);
528 kfree(ls->ls_recover_buf);
529 out_dirfree:
530 kfree(ls->ls_dirtbl);
531 out_lkbfree:
532 kfree(ls->ls_lkbtbl);
533 out_rsbfree:
534 kfree(ls->ls_rsbtbl);
535 out_lsfree:
536 kfree(ls);
537 out:
538 module_put(THIS_MODULE);
539 return error;
540}
541
542int dlm_new_lockspace(char *name, int namelen, void **lockspace,
543 uint32_t flags, int lvblen)
544{
545 int error = 0;
546
547 mutex_lock(&ls_lock);
548 if (!ls_count)
549 error = threads_start();
550 if (error)
551 goto out;
552
553 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
554 if (!error)
555 ls_count++;
556 out:
557 mutex_unlock(&ls_lock);
558 return error;
559}
560
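/* Illustrative usage (a sketch, not code from this patch): a kernel user
   such as a cluster filesystem might create its lockspace with

	dlm_lockspace_t *ls;
	error = dlm_new_lockspace("myfs", strlen("myfs"), (void **)&ls, 0, 32);

   where "myfs" is a made-up name and lvblen (32 here) must be a nonzero
   multiple of 8, as checked in new_lockspace() above. */
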
561/* Return 1 if the lockspace still has active remote locks,
562 * 2 if the lockspace still has active local locks.
563 */
564static int lockspace_busy(struct dlm_ls *ls)
565{
566 int i, lkb_found = 0;
567 struct dlm_lkb *lkb;
568
569 /* NOTE: We check the lockidtbl here rather than the resource table.
570 This is because there may be LKBs queued as ASTs that have been
571 unlinked from their RSBs and are pending deletion once the AST has
572 been delivered */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 read_lock(&ls->ls_lkbtbl[i].lock);
576 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
577 lkb_found = 1;
578 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
579 lkb_idtbl_list) {
580 if (!lkb->lkb_nodeid) {
581 read_unlock(&ls->ls_lkbtbl[i].lock);
582 return 2;
583 }
584 }
585 }
586 read_unlock(&ls->ls_lkbtbl[i].lock);
587 }
588 return lkb_found;
589}
590
591static int release_lockspace(struct dlm_ls *ls, int force)
592{
593 struct dlm_lkb *lkb;
594 struct dlm_rsb *rsb;
595 struct list_head *head;
596 int i;
597 int busy = lockspace_busy(ls);
598
599 if (busy > force)
600 return -EBUSY;
601
602 if (force < 3)
603 do_uevent(ls, 0);
604
605 dlm_recoverd_stop(ls);
606
607 remove_lockspace(ls);
608
609 dlm_delete_debug_file(ls);
610
611 dlm_astd_suspend();
612
613 kfree(ls->ls_recover_buf);
614
615 /*
616 * Free direntry structs.
617 */
618
619 dlm_dir_clear(ls);
620 kfree(ls->ls_dirtbl);
621
622 /*
623 * Free all lkb's on lkbtbl[] lists.
624 */
625
626 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
627 head = &ls->ls_lkbtbl[i].list;
628 while (!list_empty(head)) {
629 lkb = list_entry(head->next, struct dlm_lkb,
630 lkb_idtbl_list);
631
632 list_del(&lkb->lkb_idtbl_list);
633
634 dlm_del_ast(lkb);
635
636 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
637 free_lvb(lkb->lkb_lvbptr);
638
639 free_lkb(lkb);
640 }
641 }
642 dlm_astd_resume();
643
644 kfree(ls->ls_lkbtbl);
645
646 /*
647 * Free all rsb's on rsbtbl[] lists
648 */
649
650 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
651 head = &ls->ls_rsbtbl[i].list;
652 while (!list_empty(head)) {
653 rsb = list_entry(head->next, struct dlm_rsb,
654 res_hashchain);
655
656 list_del(&rsb->res_hashchain);
657 free_rsb(rsb);
658 }
659
660 head = &ls->ls_rsbtbl[i].toss;
661 while (!list_empty(head)) {
662 rsb = list_entry(head->next, struct dlm_rsb,
663 res_hashchain);
664 list_del(&rsb->res_hashchain);
665 free_rsb(rsb);
666 }
667 }
668
669 kfree(ls->ls_rsbtbl);
670
671 /*
672 * Free structures on any other lists
673 */
674
675 kfree(ls->ls_recover_args);
676 dlm_clear_free_entries(ls);
677 dlm_clear_members(ls);
678 dlm_clear_members_gone(ls);
679 kfree(ls->ls_node_array);
680 kobject_unregister(&ls->ls_kobj);
681 kfree(ls);
682
683 mutex_lock(&ls_lock);
684 ls_count--;
685 if (!ls_count)
686 threads_stop();
687 mutex_unlock(&ls_lock);
688
689 module_put(THIS_MODULE);
690 return 0;
691}
692
693/*
694 * Called when a system has released all its locks and is not going to use the
695 * lockspace any longer. We free everything we're managing for this lockspace.
696 * Remaining nodes will go through the recovery process as if we'd died. The
697 * lockspace must continue to function as usual, participating in recoveries,
698 * until this returns.
699 *
700 * Force has 4 possible values:
701 * 0 - don't destroy lockspace if it has any LKBs
702 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
703 * 2 - destroy lockspace regardless of LKBs
704 * 3 - destroy lockspace as part of a forced shutdown
705 */
706
707int dlm_release_lockspace(void *lockspace, int force)
708{
709 struct dlm_ls *ls;
710
711 ls = dlm_find_lockspace_local(lockspace);
712 if (!ls)
713 return -EINVAL;
714 dlm_put_lockspace(ls);
715 return release_lockspace(ls, force);
716}
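
/* Illustrative shutdown sequence (a sketch under the force semantics
   documented above, not code from this patch):

	error = dlm_release_lockspace(ls, 0);
	if (error == -EBUSY)
		error = dlm_release_lockspace(ls, 2);

   force=0 fails with -EBUSY while any LKBs remain (lockspace_busy() > 0);
   force=2 tears the lockspace down regardless of remaining LKBs. */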
717
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..6da6b14d5a61
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1239 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's responsibility to resolve
26 * these into IP addresses or whatever it needs for inter-node
27 * communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
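/* Worked example of the ring arithmetic above: after CBUF_INIT(cb, 16)
   (base=0, len=0, mask=15) and CBUF_ADD(cb, 10), new data occupies offsets
   0..9 and CBUF_DATA(cb) == 10. CBUF_EAT(cb, 4) then leaves base=4, len=6,
   so the next write still lands at (4+6) & 15 == 10 while unread data now
   starts at offset 4. */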
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
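/* Map a nodeid to its nodeinfo, optionally allocating one. The idr acts as
   a sparse nodeid -> nodeinfo table; the second idr_find() under the write
   lock handles the race where another caller created the entry between our
   read-side lookup and taking nodeinfo_lock for write. */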
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
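
/* Note: a zero-length sendmsg with sinfo_flags MSG_EOF is how the SCTP
   one-to-many API requests a graceful shutdown of a single association,
   as done in send_shutdown() above. */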
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383			/* This seems to happen when we receive a connection
384			 * too early... or something... anyway, it happens, but
385			 * we always seem to get a real message too; see
386			 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461	/* We don't know which INIT failed, so clear the PENDING flags
462	 * on them all. If assoc_id is zero, it will then try
463	 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513	memset(&incmsg, 0, sizeof(incmsg));
514	memset(&msgname, 0, sizeof(msgname));
515
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522 msg.msg_iovlen = 1;
523
524	/* I don't see why this circular buffer stuff is necessary for SCTP,
525	 * which is a packet-based protocol, but the whole thing breaks under
526 * load without it! The overhead is minimal (and is in the TCP lowcomms
527 * anyway, of course) so I'll leave it in until I can figure out what's
528 * really happening.
529 */
530
531 /*
532 * iov[0] is the bit of the circular buffer between the current end
533 * point (cb.base + cb.len) and the end of the buffer.
534 */
535 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
536 iov[0].iov_base = page_address(sctp_con.rx_page) +
537 CBUF_DATA(&sctp_con.cb);
538 iov[1].iov_len = 0;
539
540 /*
541 * iov[1] is the bit of the circular buffer between the start of the
542 * buffer and the start of the currently used section (cb.base)
543 */
544 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
545 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
546 iov[1].iov_len = sctp_con.cb.base;
547 iov[1].iov_base = page_address(sctp_con.rx_page);
548 msg.msg_iovlen = 2;
549 }
550 len = iov[0].iov_len + iov[1].iov_len;
551
552 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
553 MSG_NOSIGNAL | MSG_DONTWAIT);
554 if (ret <= 0)
555 goto out_close;
556
557 msg.msg_control = incmsg;
558 msg.msg_controllen = sizeof(incmsg);
559 cmsg = CMSG_FIRSTHDR(&msg);
560 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
561
562 if (msg.msg_flags & MSG_NOTIFICATION) {
563 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
564 return 0;
565 }
566
567 /* Is this a new association ? */
568 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
569 if (ni) {
570 ni->assoc_id = sinfo->sinfo_assoc_id;
571 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
572
573 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
574 spin_lock_bh(&write_nodes_lock);
575 list_add_tail(&ni->write_list, &write_nodes);
576 spin_unlock_bh(&write_nodes_lock);
577 }
578 wake_up_process(send_task);
579 }
580 }
581
582 /* INIT sends a message with length of 1 - ignore it */
583 if (r == 1)
584 return 0;
585
586 CBUF_ADD(&sctp_con.cb, ret);
587 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
588 page_address(sctp_con.rx_page),
589 sctp_con.cb.base, sctp_con.cb.len,
590 PAGE_CACHE_SIZE);
591 if (ret < 0)
592 goto out_close;
593 CBUF_EAT(&sctp_con.cb, ret);
594
595 out:
596 ret = 0;
597 goto out_ret;
598
599 out_resched:
600 lowcomms_data_ready(sctp_con.sock->sk, 0);
601 ret = 0;
602 schedule();
603 goto out_ret;
604
605 out_close:
606 if (ret != -EAGAIN)
607 log_print("error reading from sctp socket: %d", ret);
608 out_ret:
609 return ret;
610}
611
612/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
613static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
614{
615 mm_segment_t fs;
616 int result = 0;
617
618 fs = get_fs();
619 set_fs(get_ds());
620 if (num == 1)
621 result = sctp_con.sock->ops->bind(sctp_con.sock,
622 (struct sockaddr *) addr, addr_len);
623 else
624 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
625 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
626 set_fs(fs);
627
628 if (result < 0)
629 log_print("Can't bind to port %d addr number %d",
630 dlm_config.tcp_port, num);
631
632 return result;
633}
634
635static void init_local(void)
636{
637 struct sockaddr_storage sas, *addr;
638 int i;
639
640 dlm_local_nodeid = dlm_our_nodeid();
641
642 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
643 if (dlm_our_addr(&sas, i))
644 break;
645
646 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
647 if (!addr)
648 break;
649 memcpy(addr, &sas, sizeof(*addr));
650 dlm_local_addr[dlm_local_count++] = addr;
651 }
652}
653
654/* Initialise SCTP socket and bind to all interfaces */
655static int init_sock(void)
656{
657 mm_segment_t fs;
658 struct socket *sock = NULL;
659 struct sockaddr_storage localaddr;
660 struct sctp_event_subscribe subscribe;
661 int result = -EINVAL, num = 1, i, addr_len;
662
663 if (!dlm_local_count) {
664 init_local();
665 if (!dlm_local_count) {
666 log_print("no local IP address has been set");
667 goto out;
668 }
669 }
670
671 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
672 IPPROTO_SCTP, &sock);
673 if (result < 0) {
674 log_print("Can't create comms socket, check SCTP is loaded");
675 goto out;
676 }
677
678 /* Listen for events */
679 memset(&subscribe, 0, sizeof(subscribe));
680 subscribe.sctp_data_io_event = 1;
681 subscribe.sctp_association_event = 1;
682 subscribe.sctp_send_failure_event = 1;
683 subscribe.sctp_shutdown_event = 1;
684 subscribe.sctp_partial_delivery_event = 1;
685
686 fs = get_fs();
687 set_fs(get_ds());
688 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
689 (char *)&subscribe, sizeof(subscribe));
690 set_fs(fs);
691
692 if (result < 0) {
693 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
694 result);
695 goto create_delsock;
696 }
697
698 /* Init con struct */
699 sock->sk->sk_user_data = &sctp_con;
700 sctp_con.sock = sock;
701 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
702
703 /* Bind to all interfaces. */
704 for (i = 0; i < dlm_local_count; i++) {
705 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
706 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
707
708 result = add_bind_addr(&localaddr, addr_len, num);
709 if (result)
710 goto create_delsock;
711 ++num;
712 }
713
714 result = sock->ops->listen(sock, 5);
715 if (result < 0) {
716 log_print("Can't set socket listening");
717 goto create_delsock;
718 }
719
720 return 0;
721
722 create_delsock:
723 sock_release(sock);
724 sctp_con.sock = NULL;
725 out:
726 return result;
727}
728
729
730static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
731{
732 struct writequeue_entry *entry;
733
734 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
735 if (!entry)
736 return NULL;
737
738 entry->page = alloc_page(allocation);
739 if (!entry->page) {
740 kfree(entry);
741 return NULL;
742 }
743
744 entry->offset = 0;
745 entry->len = 0;
746 entry->end = 0;
747 entry->users = 0;
748
749 return entry;
750}
751
752void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
753{
754 struct writequeue_entry *e;
755 int offset = 0;
756 int users = 0;
757 struct nodeinfo *ni;
758
759 if (!atomic_read(&accepting))
760 return NULL;
761
762 ni = nodeid2nodeinfo(nodeid, allocation);
763 if (!ni)
764 return NULL;
765
766 spin_lock(&ni->writequeue_lock);
767 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
768 if (((struct list_head *) e == &ni->writequeue) ||
769 (PAGE_CACHE_SIZE - e->end < len)) {
770 e = NULL;
771 } else {
772 offset = e->end;
773 e->end += len;
774 users = e->users++;
775 }
776 spin_unlock(&ni->writequeue_lock);
777
778 if (e) {
779 got_one:
780 if (users == 0)
781 kmap(e->page);
782 *ppc = page_address(e->page) + offset;
783 return e;
784 }
785
786 e = new_writequeue_entry(allocation);
787 if (e) {
788 spin_lock(&ni->writequeue_lock);
789 offset = e->end;
790 e->end += len;
791 e->ni = ni;
792 users = e->users++;
793 list_add_tail(&e->list, &ni->writequeue);
794 spin_unlock(&ni->writequeue_lock);
795 goto got_one;
796 }
797 return NULL;
798}
799
800void dlm_lowcomms_commit_buffer(void *arg)
801{
802 struct writequeue_entry *e = (struct writequeue_entry *) arg;
803 int users;
804 struct nodeinfo *ni = e->ni;
805
806 if (!atomic_read(&accepting))
807 return;
808
809 spin_lock(&ni->writequeue_lock);
810 users = --e->users;
811 if (users)
812 goto out;
813 e->len = e->end - e->offset;
814 kunmap(e->page);
815 spin_unlock(&ni->writequeue_lock);
816
817 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
818 spin_lock_bh(&write_nodes_lock);
819 list_add_tail(&ni->write_list, &write_nodes);
820 spin_unlock_bh(&write_nodes_lock);
821 wake_up_process(send_task);
822 }
823 return;
824
825 out:
826 spin_unlock(&ni->writequeue_lock);
827 return;
828}
829
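 829
dlm_lowcomms_get_buffer() and dlm_lowcomms_commit_buffer() form a two-phase
send API: reserve space in a per-node page, fill the message in place, then
commit it so dlm_sendd picks the node up. A minimal caller sketch (struct
my_msg is hypothetical; create_rcom()/send_rcom() in rcom.c follow exactly
this pattern):

	struct dlm_mhandle *mh;		/* opaque handle to a writequeue_entry */
	char *mb;

	mh = dlm_lowcomms_get_buffer(nodeid, sizeof(struct my_msg),
				     GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, sizeof(struct my_msg));
	/* ... fill in the message at mb ... */
	dlm_lowcomms_commit_buffer(mh);	/* queues the node for dlm_sendd */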
830static void free_entry(struct writequeue_entry *e)
831{
832 __free_page(e->page);
833 kfree(e);
834}
835
836/* Initiate an SCTP association. In theory we could just use sendmsg() on
837 the first IP address and it should work, but this allows us to set up the
838 association before sending any valuable data that we can't afford to lose.
839 It also keeps the send path clean as it can now always use the association ID */
840static void initiate_association(int nodeid)
841{
842 struct sockaddr_storage rem_addr;
843 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
844 struct msghdr outmessage;
845 struct cmsghdr *cmsg;
846 struct sctp_sndrcvinfo *sinfo;
847 int ret;
848 int addrlen;
849 char buf[1];
850 struct kvec iov[1];
851 struct nodeinfo *ni;
852
853 log_print("Initiating association with node %d", nodeid);
854
855 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
856 if (!ni)
857 return;
858
859 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
860 log_print("no address for nodeid %d", nodeid);
861 return;
862 }
863
864 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
865
866 outmessage.msg_name = &rem_addr;
867 outmessage.msg_namelen = addrlen;
868 outmessage.msg_control = outcmsg;
869 outmessage.msg_controllen = sizeof(outcmsg);
870 outmessage.msg_flags = MSG_EOR;
871
872 iov[0].iov_base = buf;
873 iov[0].iov_len = 1;
874
875 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
876 we can afford to lose */
877 cmsg = CMSG_FIRSTHDR(&outmessage);
878 cmsg->cmsg_level = IPPROTO_SCTP;
879 cmsg->cmsg_type = SCTP_SNDRCV;
880 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
881 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
882 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
883 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
884
885 outmessage.msg_controllen = cmsg->cmsg_len;
886 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
887 if (ret < 0) {
888 log_print("send INIT to node failed: %d", ret);
889 /* Try again later */
890 clear_bit(NI_INIT_PENDING, &ni->flags);
891 }
892}
893
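 893
A subtlety worth spelling out: the nodeid rides in the SCTP payload protocol
id. The sender stores cpu_to_le32(dlm_local_nodeid) in sinfo_ppid, and the
receive path applies cpu_to_le32() again before calling
dlm_process_incoming_buffer(). A sketch of why the round trip works
(assuming, as this code does, that all cluster nodes share one endianness):

	/* send:    sinfo->sinfo_ppid = cpu_to_le32(nodeid);
	 * receive: nodeid = cpu_to_le32(sinfo->sinfo_ppid);
	 *
	 * cpu_to_le32() is an involution (identity on little-endian,
	 * byte swap on big-endian), so applying it twice recovers the
	 * original host-order nodeid. */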
894/* Send a message */
895static int send_to_sock(struct nodeinfo *ni)
896{
897 int ret = 0;
898 struct writequeue_entry *e;
899 int len, offset;
900 struct msghdr outmsg;
901 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
902 struct cmsghdr *cmsg;
903 struct sctp_sndrcvinfo *sinfo;
904 struct kvec iov;
905
906 /* See if we need to init an association before we start
907 sending precious messages */
908 spin_lock(&ni->lock);
909 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
910 spin_unlock(&ni->lock);
911 initiate_association(ni->nodeid);
912 return 0;
913 }
914 spin_unlock(&ni->lock);
915
916 outmsg.msg_name = NULL; /* We use assoc_id */
917 outmsg.msg_namelen = 0;
918 outmsg.msg_control = outcmsg;
919 outmsg.msg_controllen = sizeof(outcmsg);
920 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
921
922 cmsg = CMSG_FIRSTHDR(&outmsg);
923 cmsg->cmsg_level = IPPROTO_SCTP;
924 cmsg->cmsg_type = SCTP_SNDRCV;
925 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
926 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
927 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
928 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
929 sinfo->sinfo_assoc_id = ni->assoc_id;
930 outmsg.msg_controllen = cmsg->cmsg_len;
931
932 spin_lock(&ni->writequeue_lock);
933 for (;;) {
934 if (list_empty(&ni->writequeue))
935 break;
936 e = list_entry(ni->writequeue.next, struct writequeue_entry,
937 list);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942 kmap(e->page);
943
944 ret = 0;
945 if (len) {
946 iov.iov_base = page_address(e->page)+offset;
947 iov.iov_len = len;
948
949 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
950 len);
951 if (ret == -EAGAIN) {
952 sctp_con.eagain_flag = 1;
953 goto out;
954 } else if (ret < 0)
955 goto send_error;
956 } else {
957 /* Don't starve people filling buffers */
958 schedule();
959 }
960
961 spin_lock(&ni->writequeue_lock);
962 e->offset += ret;
963 e->len -= ret;
964
965 if (e->len == 0 && e->users == 0) {
966 list_del(&e->list);
967 free_entry(e);
968 continue;
969 }
970 }
971 spin_unlock(&ni->writequeue_lock);
972 out:
973 return ret;
974
975 send_error:
976 log_print("Error sending to node %d %d", ni->nodeid, ret);
977 spin_lock(&ni->lock);
978 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
979 ni->assoc_id = 0;
980 spin_unlock(&ni->lock);
981 initiate_association(ni->nodeid);
982 } else
983 spin_unlock(&ni->lock);
984
985 return ret;
986}
987
988/* Try to send any messages that are pending */
989static void process_output_queue(void)
990{
991 struct list_head *list;
992 struct list_head *temp;
993
994 spin_lock_bh(&write_nodes_lock);
995 list_for_each_safe(list, temp, &write_nodes) {
996 struct nodeinfo *ni =
997 list_entry(list, struct nodeinfo, write_list);
998 clear_bit(NI_WRITE_PENDING, &ni->flags);
999 list_del(&ni->write_list);
1000
1001 spin_unlock_bh(&write_nodes_lock);
1002
1003 send_to_sock(ni);
1004 spin_lock_bh(&write_nodes_lock);
1005 }
1006 spin_unlock_bh(&write_nodes_lock);
1007}
1008
1009/* Called after we've had -EAGAIN and been woken up */
1010static void refill_write_queue(void)
1011{
1012 int i;
1013
1014 for (i=1; i<=max_nodeid; i++) {
1015 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1016
1017 if (ni) {
1018 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1019 spin_lock_bh(&write_nodes_lock);
1020 list_add_tail(&ni->write_list, &write_nodes);
1021 spin_unlock_bh(&write_nodes_lock);
1022 }
1023 }
1024 }
1025}
1026
1027static void clean_one_writequeue(struct nodeinfo *ni)
1028{
1029 struct list_head *list;
1030 struct list_head *temp;
1031
1032 spin_lock(&ni->writequeue_lock);
1033 list_for_each_safe(list, temp, &ni->writequeue) {
1034 struct writequeue_entry *e =
1035 list_entry(list, struct writequeue_entry, list);
1036 list_del(&e->list);
1037 free_entry(e);
1038 }
1039 spin_unlock(&ni->writequeue_lock);
1040}
1041
1042static void clean_writequeues(void)
1043{
1044 int i;
1045
1046 for (i=1; i<=max_nodeid; i++) {
1047 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1048 if (ni)
1049 clean_one_writequeue(ni);
1050 }
1051}
1052
1053
1054static void dealloc_nodeinfo(void)
1055{
1056 int i;
1057
1058 for (i=1; i<=max_nodeid; i++) {
1059 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1060 if (ni) {
1061 idr_remove(&nodeinfo_idr, i);
1062 kfree(ni);
1063 }
1064 }
1065}
1066
1067int dlm_lowcomms_close(int nodeid)
1068{
1069 struct nodeinfo *ni;
1070
1071 ni = nodeid2nodeinfo(nodeid, 0);
1072 if (!ni)
1073 return -1;
1074
1075 spin_lock(&ni->lock);
1076 if (ni->assoc_id) {
1077 ni->assoc_id = 0;
1078 /* Don't send shutdown here, sctp will just queue it
1079 till the node comes back up! */
1080 }
1081 spin_unlock(&ni->lock);
1082
1083 clean_one_writequeue(ni);
1084 clear_bit(NI_INIT_PENDING, &ni->flags);
1085 return 0;
1086}
1087
1088static int write_list_empty(void)
1089{
1090 int status;
1091
1092 spin_lock_bh(&write_nodes_lock);
1093 status = list_empty(&write_nodes);
1094 spin_unlock_bh(&write_nodes_lock);
1095
1096 return status;
1097}
1098
1099static int dlm_recvd(void *data)
1100{
1101 DECLARE_WAITQUEUE(wait, current);
1102
1103 while (!kthread_should_stop()) {
1104 int count = 0;
1105
1106 set_current_state(TASK_INTERRUPTIBLE);
1107 add_wait_queue(&lowcomms_recv_wait, &wait);
1108 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1109 schedule();
1110 remove_wait_queue(&lowcomms_recv_wait, &wait);
1111 set_current_state(TASK_RUNNING);
1112
1113 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1114 int ret;
1115
1116 do {
1117 ret = receive_from_sock();
1118
1119 /* Don't starve out everyone else */
1120 if (++count >= MAX_RX_MSG_COUNT) {
1121 schedule();
1122 count = 0;
1123 }
1124			} while (!kthread_should_stop() && ret >= 0);
1125 }
1126 schedule();
1127 }
1128
1129 return 0;
1130}
1131
1132static int dlm_sendd(void *data)
1133{
1134 DECLARE_WAITQUEUE(wait, current);
1135
1136 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1137
1138 while (!kthread_should_stop()) {
1139 set_current_state(TASK_INTERRUPTIBLE);
1140 if (write_list_empty())
1141 schedule();
1142 set_current_state(TASK_RUNNING);
1143
1144 if (sctp_con.eagain_flag) {
1145 sctp_con.eagain_flag = 0;
1146 refill_write_queue();
1147 }
1148 process_output_queue();
1149 }
1150
1151 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1152
1153 return 0;
1154}
1155
1156static void daemons_stop(void)
1157{
1158 kthread_stop(recv_task);
1159 kthread_stop(send_task);
1160}
1161
1162static int daemons_start(void)
1163{
1164 struct task_struct *p;
1165 int error;
1166
1167 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1168	error = PTR_ERR(p);
1169	if (IS_ERR(p)) {
1170 log_print("can't start dlm_recvd %d", error);
1171 return error;
1172 }
1173 recv_task = p;
1174
1175 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1176	error = PTR_ERR(p);
1177	if (IS_ERR(p)) {
1178 log_print("can't start dlm_sendd %d", error);
1179 kthread_stop(recv_task);
1180 return error;
1181 }
1182 send_task = p;
1183
1184 return 0;
1185}
1186
1187/*
1188 * This is quite likely to sleep...
1189 */
1190int dlm_lowcomms_start(void)
1191{
1192 int error;
1193
1194 error = init_sock();
1195 if (error)
1196 goto fail_sock;
1197 error = daemons_start();
1198 if (error)
1199 goto fail_sock;
1200 atomic_set(&accepting, 1);
1201 return 0;
1202
1203 fail_sock:
1204 close_connection();
1205 return error;
1206}
1207
1208/* Set all the activity flags to prevent any socket activity. */
1209
1210void dlm_lowcomms_stop(void)
1211{
1212 atomic_set(&accepting, 0);
1213 sctp_con.flags = 0x7;
1214 daemons_stop();
1215 clean_writequeues();
1216 close_connection();
1217 dealloc_nodeinfo();
1218 max_nodeid = 0;
1219}
1220
1221int dlm_lowcomms_init(void)
1222{
1223 init_waitqueue_head(&lowcomms_recv_wait);
1224 spin_lock_init(&write_nodes_lock);
1225 INIT_LIST_HEAD(&write_nodes);
1226 init_rwsem(&nodeinfo_lock);
1227 return 0;
1228}
1229
1230void dlm_lowcomms_exit(void)
1231{
1232 int i;
1233
1234 for (i = 0; i < dlm_local_count; i++)
1235 kfree(dlm_local_addr[i]);
1236 dlm_local_count = 0;
1237 dlm_local_nodeid = 0;
1238}
1239
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..2d045e0daae1
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
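Taken together, the interface separates module lifetime from cluster
lifetime; a hedged usage sketch (the real callers live elsewhere in the
dlm tree):

	dlm_lowcomms_init();	/* module load: lists, locks, waitqueue */
	dlm_lowcomms_start();	/* join: bind SCTP socket, start daemons */
	/* ... dlm_lowcomms_get_buffer()/commit_buffer() traffic ... */
	dlm_lowcomms_stop();	/* leave: stop daemons, drop queued writes */
	dlm_lowcomms_exit();	/* module unload: free cached addresses */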
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
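As the FIXME above notes, the open-coded splice is an insertion before tmp,
which the generic helper already provides: list_add_tail(new, pos) links new
immediately before pos. A sketch of the equivalent tail of
add_ordered_member() (after list_for_each(), tmp is either the first member
with a larger nodeid or the list head itself, so one call covers both
branches):

	list_add_tail(newlist, tmp);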
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
 54	w = dlm_node_weight(ls->ls_name, nodeid);
 55	if (w < 0)
 56		return w;
 57
 58	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
 59	if (!memb)
 60		return -ENOMEM;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
162
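A worked example of the flattening, with hypothetical membership: nodes
{1: weight 2, 2: weight 0, 3: weight 1} give ls_total_weight = 3 and
ls_node_array = { 1, 1, 3 }. Zero-weight nodes get no slots, so an index
drawn uniformly over the array picks a node in proportion to its weight;
if every weight were 0, the all_zero fallback would produce { 1, 2, 3 }.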
163/* send a status request to all members just to establish comms connections */
164
165static int ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 int error = 0;
169
170 list_for_each_entry(memb, &ls->ls_nodes, list) {
171 error = dlm_recovery_stopped(ls);
172 if (error)
173 break;
174 error = dlm_rcom_status(ls, memb->nodeid);
175 if (error)
176 break;
177 }
178 if (error)
179 log_debug(ls, "ping_members aborted %d last nodeid %d",
180 error, ls->ls_recover_nodeid);
181 return error;
182}
183
184int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
185{
186 struct dlm_member *memb, *safe;
187 int i, error, found, pos = 0, neg = 0, low = -1;
188
189 /* move departed members from ls_nodes to ls_nodes_gone */
190
191 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
192 found = 0;
193 for (i = 0; i < rv->node_count; i++) {
194 if (memb->nodeid == rv->nodeids[i]) {
195 found = 1;
196 break;
197 }
198 }
199
200 if (!found) {
201 neg++;
202 dlm_remove_member(ls, memb);
203 log_debug(ls, "remove member %d", memb->nodeid);
204 }
205 }
206
207 /* add new members to ls_nodes */
208
209 for (i = 0; i < rv->node_count; i++) {
210 if (dlm_is_member(ls, rv->nodeids[i]))
211 continue;
212 dlm_add_member(ls, rv->nodeids[i]);
213 pos++;
214 log_debug(ls, "add member %d", rv->nodeids[i]);
215 }
216
217 list_for_each_entry(memb, &ls->ls_nodes, list) {
218 if (low == -1 || memb->nodeid < low)
219 low = memb->nodeid;
220 }
221 ls->ls_low_nodeid = low;
222
223 make_member_array(ls);
224 dlm_set_recover_status(ls, DLM_RS_NODES);
225 *neg_out = neg;
226
227 error = ping_members(ls);
228 if (error)
229 goto out;
230
231 error = dlm_recover_members_wait(ls);
232 out:
233 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
234 return error;
235}
236
237/*
238 * Following called from lockspace.c
239 */
240
241int dlm_ls_stop(struct dlm_ls *ls)
242{
243 int new;
244
245 /*
246 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
247 * dlm_recovery_stopped()) and prevents any new locks from being
248 * processed (see RUNNING, dlm_locking_stopped()).
249 */
250
251 spin_lock(&ls->ls_recover_lock);
252 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
253 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
254 ls->ls_recover_seq++;
255 spin_unlock(&ls->ls_recover_lock);
256
257 /*
258 * This in_recovery lock does two things:
259 *
260 * 1) Keeps this function from returning until all threads are out
261	 * of locking routines and locking is truly stopped.
262 * 2) Keeps any new requests from being processed until it's unlocked
263 * when recovery is complete.
264 */
265
266 if (new)
267 down_write(&ls->ls_in_recovery);
268
269 /*
270 * The recoverd suspend/resume makes sure that dlm_recoverd (if
271 * running) has noticed the clearing of RUNNING above and quit
272 * processing the previous recovery. This will be true for all nodes
273 * before any nodes start the new recovery.
274 */
275
276 dlm_recoverd_suspend(ls);
277 ls->ls_recover_status = 0;
278 dlm_recoverd_resume(ls);
279 return 0;
280}
281
282int dlm_ls_start(struct dlm_ls *ls)
283{
284 struct dlm_recover *rv = NULL, *rv_old;
285 int *ids = NULL;
286 int error, count;
287
288 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
289 if (!rv)
290 return -ENOMEM;
291
292 error = count = dlm_nodeid_list(ls->ls_name, &ids);
293 if (error <= 0)
294 goto fail;
295
296 spin_lock(&ls->ls_recover_lock);
297
298 /* the lockspace needs to be stopped before it can be started */
299
300 if (!dlm_locking_stopped(ls)) {
301 spin_unlock(&ls->ls_recover_lock);
302 log_error(ls, "start ignored: lockspace running");
303 error = -EINVAL;
304 goto fail;
305 }
306
307 rv->nodeids = ids;
308 rv->node_count = count;
309 rv->seq = ++ls->ls_recover_seq;
310 rv_old = ls->ls_recover_args;
311 ls->ls_recover_args = rv;
312 spin_unlock(&ls->ls_recover_lock);
313
314 if (rv_old) {
315 kfree(rv_old->nodeids);
316 kfree(rv_old);
317 }
318
319 dlm_recoverd_kick(ls);
320 return 0;
321
322 fail:
323 kfree(rv);
324 kfree(ids);
325 return error;
326}
327
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
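The kmalloc()+memset() pairs in this file predate routine use of kzalloc();
a behaviour-preserving sketch of the same allocation:

	char *allocate_lvb(struct dlm_ls *ls)
	{
		/* kzalloc() returns zeroed memory, folding the memset() away */
		return kzalloc(ls->ls_lvblen, GFP_KERNEL);
	}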
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
104 printk("namelen = %d\n", namelen););
105
106 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
107 if (de)
108 memset(de, 0, sizeof(*de) + namelen);
109 return de;
110}
111
112void free_direntry(struct dlm_direntry *de)
113{
114 kfree(de);
115}
116
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take buffers from the "real" comms layer,
20 * split them up into complete messages and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
47
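copy_from_cb() reads len bytes starting at offset from a circular buffer of
size limit, wrapping past the end. A small worked example with hypothetical
values (ring_base is an assumed 8-byte ring):

	/* limit = 8, offset = 6, len = 4: the first memcpy() copies bytes
	 * [6..7] (copy is clamped to 2), the second copies the wrapped
	 * remainder [0..1], reassembling a message that straddled the end. */
	char msg[4];
	copy_from_cb(msg, ring_base, 6, sizeof(msg), 8);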
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here; any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
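One implicit contract here: the consume step wraps offset with a mask
("offset &= (limit - 1)"), which is only a modulo when limit is a power of
two; PAGE_CACHE_SIZE, as passed in by the lowcomms receive path, satisfies
that. A defensive restatement a paranoid caller could add:

	BUG_ON(limit & (limit - 1));	/* require power-of-two ring size */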
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100 ls->ls_recover_nodeid = nodeid;
101
102 if (nodeid == dlm_our_nodeid()) {
103 rc = (struct dlm_rcom *) ls->ls_recover_buf;
104 rc->rc_result = dlm_recover_status(ls);
105 goto out;
106 }
107
108 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
109 if (error)
110 goto out;
111 rc->rc_id = ++ls->ls_rcom_seq;
112
113 send_rcom(ls, mh, rc);
114
115 error = dlm_wait_function(ls, &rcom_response);
116 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
117 if (error)
118 goto out;
119
120 rc = (struct dlm_rcom *) ls->ls_recover_buf;
121
122 if (rc->rc_result == -ESRCH) {
123 /* we pretend the remote lockspace exists with 0 status */
124 log_debug(ls, "remote node %d not ready", nodeid);
125 rc->rc_result = 0;
126 } else
127 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
128 nodeid);
129 /* the caller looks at rc_result for the remote recovery status */
130 out:
131 return error;
132}
133
134static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
135{
136 struct dlm_rcom *rc;
137 struct dlm_mhandle *mh;
138 int error, nodeid = rc_in->rc_header.h_nodeid;
139
140 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
141 sizeof(struct rcom_config), &rc, &mh);
142 if (error)
143 return;
144 rc->rc_id = rc_in->rc_id;
145 rc->rc_result = dlm_recover_status(ls);
146 make_config(ls, (struct rcom_config *) rc->rc_buf);
147
148 send_rcom(ls, mh, rc);
149}
150
151static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
152{
153 if (rc_in->rc_id != ls->ls_rcom_seq) {
154 log_debug(ls, "reject old reply %d got %llx wanted %llx",
155 rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
156 return;
157 }
158 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
159 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
160 wake_up(&ls->ls_wait_general);
161}
162
163static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
164{
165 receive_sync_reply(ls, rc_in);
166}
167
168int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
169{
170 struct dlm_rcom *rc;
171 struct dlm_mhandle *mh;
172 int error = 0, len = sizeof(struct dlm_rcom);
173
174 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
175 ls->ls_recover_nodeid = nodeid;
176
177 if (nodeid == dlm_our_nodeid()) {
178 dlm_copy_master_names(ls, last_name, last_len,
179 ls->ls_recover_buf + len,
180 dlm_config.buffer_size - len, nodeid);
181 goto out;
182 }
183
184 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
185 if (error)
186 goto out;
187 memcpy(rc->rc_buf, last_name, last_len);
188 rc->rc_id = ++ls->ls_rcom_seq;
189
190 send_rcom(ls, mh, rc);
191
192 error = dlm_wait_function(ls, &rcom_response);
193 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
194 out:
195 return error;
196}
197
198static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
199{
200 struct dlm_rcom *rc;
201 struct dlm_mhandle *mh;
202 int error, inlen, outlen;
203 int nodeid = rc_in->rc_header.h_nodeid;
204 uint32_t status = dlm_recover_status(ls);
205
206 /*
207 * We can't run dlm_copy_master_names (which uses ls_nodes) while
208 * dlm_recoverd is running dlm_recover_members (which changes ls_nodes).
209 * It could only happen in rare cases where we get a late NAMES
210 * message from a previous instance of recovery.
211 */
212
213 if (!(status & DLM_RS_NODES)) {
214 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
215 return;
216 }
217
218 nodeid = rc_in->rc_header.h_nodeid;
219 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
220 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
221
222 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
223 if (error)
224 return;
225 rc->rc_id = rc_in->rc_id;
226
227 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
228 nodeid);
229 send_rcom(ls, mh, rc);
230}
231
232static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
233{
234 receive_sync_reply(ls, rc_in);
235}
236
237int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
238{
239 struct dlm_rcom *rc;
240 struct dlm_mhandle *mh;
241 struct dlm_ls *ls = r->res_ls;
242 int error;
243
244 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
245 &rc, &mh);
246 if (error)
247 goto out;
248 memcpy(rc->rc_buf, r->res_name, r->res_length);
249 rc->rc_id = (unsigned long) r;
250
251 send_rcom(ls, mh, rc);
252 out:
253 return error;
254}
255
256static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
257{
258 struct dlm_rcom *rc;
259 struct dlm_mhandle *mh;
260 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
261 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
262
263 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
264 if (error)
265 return;
266
267 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
268 if (error)
269 ret_nodeid = error;
270 rc->rc_result = ret_nodeid;
271 rc->rc_id = rc_in->rc_id;
272
273 send_rcom(ls, mh, rc);
274}
275
276static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
277{
278 dlm_recover_master_reply(ls, rc_in);
279}
280
281static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
282 struct rcom_lock *rl)
283{
284 memset(rl, 0, sizeof(*rl));
285
286 rl->rl_ownpid = lkb->lkb_ownpid;
287 rl->rl_lkid = lkb->lkb_id;
288 rl->rl_exflags = lkb->lkb_exflags;
289 rl->rl_flags = lkb->lkb_flags;
290 rl->rl_lvbseq = lkb->lkb_lvbseq;
291 rl->rl_rqmode = lkb->lkb_rqmode;
292 rl->rl_grmode = lkb->lkb_grmode;
293 rl->rl_status = lkb->lkb_status;
294 rl->rl_wait_type = lkb->lkb_wait_type;
295
296 if (lkb->lkb_bastaddr)
297 rl->rl_asts |= AST_BAST;
298 if (lkb->lkb_astaddr)
299 rl->rl_asts |= AST_COMP;
300
301 rl->rl_namelen = r->res_length;
302 memcpy(rl->rl_name, r->res_name, r->res_length);
303
304	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set?
305 If so, receive_rcom_lock_args() won't take this copy. */
306
307 if (lkb->lkb_lvbptr)
308 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
309}
310
311int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
312{
313 struct dlm_ls *ls = r->res_ls;
314 struct dlm_rcom *rc;
315 struct dlm_mhandle *mh;
316 struct rcom_lock *rl;
317 int error, len = sizeof(struct rcom_lock);
318
319 if (lkb->lkb_lvbptr)
320 len += ls->ls_lvblen;
321
322 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
323 if (error)
324 goto out;
325
326 rl = (struct rcom_lock *) rc->rc_buf;
327 pack_rcom_lock(r, lkb, rl);
328 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 int error, nodeid = rc_in->rc_header.h_nodeid;
340
341 dlm_recover_master_copy(ls, rc_in);
342
343 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
344 sizeof(struct rcom_lock), &rc, &mh);
345 if (error)
346 return;
347
348 /* We send back the same rcom_lock struct we received, but
349 dlm_recover_master_copy() has filled in rl_remid and rl_result */
350
351 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
352 rc->rc_id = rc_in->rc_id;
353
354 send_rcom(ls, mh, rc);
355}
356
357static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
358{
359 uint32_t status = dlm_recover_status(ls);
360
361 if (!(status & DLM_RS_DIR)) {
362 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
363 rc_in->rc_header.h_nodeid);
364 return;
365 }
366
367 dlm_recover_process_copy(ls, rc_in);
368}
369
370static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
371{
372 struct dlm_rcom *rc;
373 struct dlm_mhandle *mh;
374 char *mb;
375 int mb_len = sizeof(struct dlm_rcom);
376
377 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
378 if (!mh)
379 return -ENOBUFS;
380 memset(mb, 0, mb_len);
381
382 rc = (struct dlm_rcom *) mb;
383
384 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
385 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
386 rc->rc_header.h_nodeid = dlm_our_nodeid();
387 rc->rc_header.h_length = mb_len;
388 rc->rc_header.h_cmd = DLM_RCOM;
389
390 rc->rc_type = DLM_RCOM_STATUS_REPLY;
391 rc->rc_id = rc_in->rc_id;
392 rc->rc_result = -ESRCH;
393
394 dlm_rcom_out(rc);
395 dlm_lowcomms_commit_buffer(mh);
396
397 return 0;
398}
399
400/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
401 recovery-only comms are sent through here. */
402
403void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
404{
405 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
406 struct dlm_ls *ls;
407
408 dlm_rcom_in(rc);
409
410 /* If the lockspace doesn't exist then still send a status message
411 back; it's possible that it just doesn't have its global_id yet. */
412
413 ls = dlm_find_lockspace_global(hd->h_lockspace);
414 if (!ls) {
415 log_print("lockspace %x from %d not found",
416 hd->h_lockspace, nodeid);
417 send_ls_not_ready(nodeid, rc);
418 return;
419 }
420
421 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
422 log_error(ls, "ignoring recovery message %x from %d",
423 rc->rc_type, nodeid);
424 goto out;
425 }
426
427 if (nodeid != rc->rc_header.h_nodeid) {
428 log_error(ls, "bad rcom nodeid %d from %d",
429 rc->rc_header.h_nodeid, nodeid);
430 goto out;
431 }
432
433 switch (rc->rc_type) {
434 case DLM_RCOM_STATUS:
435 receive_rcom_status(ls, rc);
436 break;
437
438 case DLM_RCOM_NAMES:
439 receive_rcom_names(ls, rc);
440 break;
441
442 case DLM_RCOM_LOOKUP:
443 receive_rcom_lookup(ls, rc);
444 break;
445
446 case DLM_RCOM_LOCK:
447 receive_rcom_lock(ls, rc);
448 break;
449
450 case DLM_RCOM_STATUS_REPLY:
451 receive_rcom_status_reply(ls, rc);
452 break;
453
454 case DLM_RCOM_NAMES_REPLY:
455 receive_rcom_names_reply(ls, rc);
456 break;
457
458 case DLM_RCOM_LOOKUP_REPLY:
459 receive_rcom_lookup_reply(ls, rc);
460 break;
461
462 case DLM_RCOM_LOCK_REPLY:
463 receive_rcom_lock_reply(ls, rc);
464 break;
465
466 default:
467 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
468 }
469 out:
470 dlm_put_lockspace(ls);
471}
472
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped, indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LSFL_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
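The hand-rolled timer exists only so the wait condition (including the
stopped check) is re-evaluated periodically. A sketch of the same idea using
wait_event_timeout() instead (a semantics sketch, not a drop-in replacement):

	while (!testfn(ls) && !dlm_recovery_stopped(ls))
		wait_event_timeout(ls->ls_wait_general,
				   testfn(ls) || dlm_recovery_stopped(ls),
				   dlm_config.recover_timer * HZ);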
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
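wait_status() derives the aggregate flag by shifting, which presumes each
*_ALL status bit is defined one position above its base bit (the pairing is
assumed here, not quoted from dlm_internal.h):

	/* DLM_RS_NODES -> DLM_RS_NODES_ALL == DLM_RS_NODES << 1, and
	 * likewise for DIR, LOCKS and DONE. The low nodeid sets X_ALL
	 * once wait_status_all() has seen X on every member; all other
	 * nodes poll the low nodeid (wait_status_low) for X_ALL. */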
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
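
The send side of this pointer-as-cookie scheme is dlm_send_rcom_lookup() in rcom.c, which is not part of this hunk; it presumably stamps rc_id with the rsb's address. A hedged standalone sketch of the round trip:

#include <stdint.h>
#include <stdio.h>

struct rsb { int res_nodeid; };
struct rcom { uint64_t rc_id; };

int main(void)
{
	struct rsb r = { 0 };
	struct rcom rc;

	/* send: the rsb's address becomes an opaque 64-bit cookie */
	rc.rc_id = (unsigned long) &r;

	/* reply: recover the rsb by comparing the cookie against each
	   list entry's address, exactly as recover_list_find() does */
	if (rc.rc_id == (unsigned long) &r)
		printf("reply matched rsb %p\n", (void *) &r);
	return 0;
}
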
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
 283 * Set the lock master for all LKBs in a lock queue.
 284 * If we are the new master of the rsb, we may have already received
 285 * new MSTCPY locks from other nodes, which we need to ignore when
 286 * setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
 306 * Propagate the new master nodeid to locks.
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
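
dlm_dir_nodeid() is defined elsewhere (dir.c); the idea is that the resource name is hashed and the hash mapped onto the member list, so with no directory the name alone determines the static master. Everything below (the hash function, the node table) is an illustrative assumption, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* toy hash standing in for the kernel's name hash; illustrative only */
static uint32_t toy_hash(const char *name, size_t len)
{
	uint32_t h = 5381;
	while (len--)
		h = h * 33 + (unsigned char) *name++;
	return h;
}

int main(void)
{
	int nodeids[] = { 1, 2, 3 };    /* assumed lockspace members */
	const char *res_name = "example-resource";
	uint32_t hash = toy_hash(res_name, strlen(res_name));

	/* with no directory, the name's hash alone picks the master,
	   so a membership change statically remaps most resources */
	printf("static master: node %d\n", nodeids[hash % 3]);
	return 0;
}
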
371
372/*
 373 * Go through local root resources and, for each rsb whose master has
 374 * departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int recover_locks(struct dlm_rsb *r)
477{
478 int error = 0;
479
480 lock_rsb(r);
481
482 DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
483
484 error = recover_locks_queue(r, &r->res_grantqueue);
485 if (error)
486 goto out;
487 error = recover_locks_queue(r, &r->res_convertqueue);
488 if (error)
489 goto out;
490 error = recover_locks_queue(r, &r->res_waitqueue);
491 if (error)
492 goto out;
493
494 if (r->res_recover_locks_count)
495 recover_list_add(r);
496 else
497 rsb_clear_flag(r, RSB_NEW_MASTER);
498 out:
499 unlock_rsb(r);
500 return error;
501}
502
503int dlm_recover_locks(struct dlm_ls *ls)
504{
505 struct dlm_rsb *r;
506 int error, count = 0;
507
508 log_debug(ls, "dlm_recover_locks");
509
510 down_read(&ls->ls_root_sem);
511 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
512 if (is_master(r)) {
513 rsb_clear_flag(r, RSB_NEW_MASTER);
514 continue;
515 }
516
517 if (!rsb_flag(r, RSB_NEW_MASTER))
518 continue;
519
520 if (dlm_recovery_stopped(ls)) {
521 error = -EINTR;
522 up_read(&ls->ls_root_sem);
523 goto out;
524 }
525
526 error = recover_locks(r);
527 if (error) {
528 up_read(&ls->ls_root_sem);
529 goto out;
530 }
531
532 count += r->res_recover_locks_count;
533 }
534 up_read(&ls->ls_root_sem);
535
536 log_debug(ls, "dlm_recover_locks %d locks", count);
537
538 error = dlm_wait_function(ls, &recover_list_empty);
539 out:
540 if (error)
541 recover_list_clear(ls);
542 else
543 dlm_set_recover_status(ls, DLM_RS_LOCKS);
544 return error;
545}
546
547void dlm_recovered_lock(struct dlm_rsb *r)
548{
549 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
550
551 r->res_recover_locks_count--;
552 if (!r->res_recover_locks_count) {
553 rsb_clear_flag(r, RSB_NEW_MASTER);
554 recover_list_del(r);
555 }
556
557 if (recover_list_empty(r->res_ls))
558 wake_up(&r->res_ls->ls_wait_general);
559}
560
561/*
562 * The lvb needs to be recovered on all master rsb's. This includes setting
563 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
564 * based on the lvb's of the locks held on the rsb.
565 *
566 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
567 * was already set prior to recovery, it's not cleared, regardless of locks.
568 *
569 * The LVB contents are only considered for changing when this is a new master
570 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
571 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
572 * from the lkb with the largest lvb sequence number.
573 */
574
575static void recover_lvb(struct dlm_rsb *r)
576{
577 struct dlm_lkb *lkb, *high_lkb = NULL;
578 uint32_t high_seq = 0;
579 int lock_lvb_exists = 0;
580 int big_lock_exists = 0;
581 int lvblen = r->res_ls->ls_lvblen;
582
583 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
584 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
585 continue;
586
587 lock_lvb_exists = 1;
588
589 if (lkb->lkb_grmode > DLM_LOCK_CR) {
590 big_lock_exists = 1;
591 goto setflag;
592 }
593
594 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
595 high_lkb = lkb;
596 high_seq = lkb->lkb_lvbseq;
597 }
598 }
599
600 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
601 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
602 continue;
603
604 lock_lvb_exists = 1;
605
606 if (lkb->lkb_grmode > DLM_LOCK_CR) {
607 big_lock_exists = 1;
608 goto setflag;
609 }
610
611 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
612 high_lkb = lkb;
613 high_seq = lkb->lkb_lvbseq;
614 }
615 }
616
617 setflag:
618 if (!lock_lvb_exists)
619 goto out;
620
621 if (!big_lock_exists)
622 rsb_set_flag(r, RSB_VALNOTVALID);
623
624 /* don't mess with the lvb unless we're the new master */
625 if (!rsb_flag(r, RSB_NEW_MASTER2))
626 goto out;
627
628 if (!r->res_lvbptr) {
629 r->res_lvbptr = allocate_lvb(r->res_ls);
630 if (!r->res_lvbptr)
631 goto out;
632 }
633
634 if (big_lock_exists) {
635 r->res_lvbseq = lkb->lkb_lvbseq;
636 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
637 } else if (high_lkb) {
638 r->res_lvbseq = high_lkb->lkb_lvbseq;
639 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
640 } else {
641 r->res_lvbseq = 0;
642 memset(r->res_lvbptr, 0, lvblen);
643 }
644 out:
645 return;
646}
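
The ((int)lkb->lkb_lvbseq - (int)high_seq) >= 0 test above is a variant of the classic serial-number comparison for wrapping counters: take the difference and interpret it as signed, so the ordering survives a 32-bit wrap. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

static int seq_after_eq(uint32_t a, uint32_t b)
{
	/* unsigned subtraction, reinterpreted as signed: a is "at or
	   after" b even when the counter has wrapped past zero */
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	printf("%d\n", seq_after_eq(5, 3));            /* 1 */
	printf("%d\n", seq_after_eq(3, 5));            /* 0 */
	/* 2 is "after" 0xfffffffe despite being numerically smaller */
	printf("%d\n", seq_after_eq(2, 0xfffffffeU));  /* 1 */
	return 0;
}
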
647
648/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
649 converting PR->CW or CW->PR need to have their lkb_grmode set. */
650
651static void recover_conversion(struct dlm_rsb *r)
652{
653 struct dlm_lkb *lkb;
654 int grmode = -1;
655
656 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
657 if (lkb->lkb_grmode == DLM_LOCK_PR ||
658 lkb->lkb_grmode == DLM_LOCK_CW) {
659 grmode = lkb->lkb_grmode;
660 break;
661 }
662 }
663
664 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
665 if (lkb->lkb_grmode != DLM_LOCK_IV)
666 continue;
667 if (grmode == -1)
668 lkb->lkb_grmode = lkb->lkb_rqmode;
669 else
670 lkb->lkb_grmode = grmode;
671 }
672}
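
Why only PR and CW? They are the one pair of modes where each is incompatible with the other yet neither strictly dominates, so a lock caught mid-conversion between them (grmode rebuilt as IV) cannot have its granted mode inferred from the request alone. The sketch below reproduces the usual VMS-style compatibility matrix from memory; verify the values against __dlm_compat_matrix in lock.c before relying on them:

#include <stdio.h>

/* DLM lock modes as in <linux/dlm.h> (IV = -1 omitted from the table) */
enum { NL, CR, CW, PR, PW, EX };

/* 1 = grantable together; reproduced from memory of the VMS-style
   table -- check against __dlm_compat_matrix in lock.c */
static const int compat[6][6] = {
	/*        NL CR CW PR PW EX */
	/* NL */ { 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 0, 0, 0, 0, 0 },
};

int main(void)
{
	/* PR and CW sit at the same level: each tolerates NL/CR and
	   itself, but not the other -- hence the ambiguity that
	   recover_conversion() resolves */
	printf("PR vs CW grantable together: %d\n", compat[PR][CW]);
	printf("PR vs PR grantable together: %d\n", compat[PR][PR]);
	return 0;
}
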
673
 674/* We've become the new master for this rsb; waiting/converting locks may
 675   now be grantable in dlm_grant_after_purge() because locks that belonged
 676   to a removed node have been purged. */
677
678static void set_locks_purged(struct dlm_rsb *r)
679{
680 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
681 rsb_set_flag(r, RSB_LOCKS_PURGED);
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 if (rsb_flag(r, RSB_NEW_MASTER2))
698 set_locks_purged(r);
699 recover_lvb(r);
700 count++;
701 }
702 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
703 rsb_clear_flag(r, RSB_NEW_MASTER2);
704 unlock_rsb(r);
705 }
706 up_read(&ls->ls_root_sem);
707
708 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
709}
710
711/* Create a single list of all root rsb's to be used during recovery */
712
713int dlm_create_root_list(struct dlm_ls *ls)
714{
715 struct dlm_rsb *r;
716 int i, error = 0;
717
718 down_write(&ls->ls_root_sem);
719 if (!list_empty(&ls->ls_root_list)) {
720 log_error(ls, "root list not empty");
721 error = -EINVAL;
722 goto out;
723 }
724
725 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
726 read_lock(&ls->ls_rsbtbl[i].lock);
727 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
728 list_add(&r->res_root_list, &ls->ls_root_list);
729 dlm_hold_rsb(r);
730 }
731 read_unlock(&ls->ls_rsbtbl[i].lock);
732 }
733 out:
734 up_write(&ls->ls_root_sem);
735 return error;
736}
737
738void dlm_release_root_list(struct dlm_ls *ls)
739{
740 struct dlm_rsb *r, *safe;
741
742 down_write(&ls->ls_root_sem);
743 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
744 list_del_init(&r->res_root_list);
745 dlm_put_rsb(r);
746 }
747 up_write(&ls->ls_root_sem);
748}
749
750void dlm_clear_toss_list(struct dlm_ls *ls)
751{
752 struct dlm_rsb *r, *safe;
753 int i;
754
755 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
756 write_lock(&ls->ls_rsbtbl[i].lock);
757 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
758 res_hashchain) {
759 list_del(&r->res_hashchain);
760 free_rsb(r);
761 }
762 write_unlock(&ls->ls_rsbtbl[i].lock);
763 }
764}
765
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
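
This is the sequence-guard pattern in miniature: a start may re-enable locking only if no newer stop has bumped the sequence in the meantime. A standalone sketch, with plain variables standing in for the spinlock-protected fields:

#include <stdint.h>
#include <stdio.h>

struct ls { uint64_t recover_seq; int running; };

/* only the start matching the latest stop may re-enable locking */
static int enable_locking(struct ls *ls, uint64_t seq)
{
	if (ls->recover_seq != seq)
		return -1;	/* superseded: stay disabled */
	ls->running = 1;
	return 0;
}

int main(void)
{
	struct ls ls = { 7, 0 };	/* a newer stop already ran */
	printf("stale start (6): %d\n", enable_locking(&ls, 6));
	printf("current start (7): %d\n", enable_locking(&ls, 7));
	return 0;
}
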
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237 if (!ls) {
238 log_print("dlm_recoverd: no lockspace %p", arg);
239 return -1;
240 }
241
242 while (!kthread_should_stop()) {
243 set_current_state(TASK_INTERRUPTIBLE);
244 if (!test_bit(LSFL_WORK, &ls->ls_flags))
245 schedule();
246 set_current_state(TASK_RUNNING);
247
248 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
249 do_ls_recovery(ls);
250 }
251
252 dlm_put_lockspace(ls);
253 return 0;
254}
255
256void dlm_recoverd_kick(struct dlm_ls *ls)
257{
258 set_bit(LSFL_WORK, &ls->ls_flags);
259 wake_up_process(ls->ls_recoverd_task);
260}
261
262int dlm_recoverd_start(struct dlm_ls *ls)
263{
264 struct task_struct *p;
265 int error = 0;
266
267 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
268 if (IS_ERR(p))
269 error = PTR_ERR(p);
270 else
271 ls->ls_recoverd_task = p;
272 return error;
273}
274
275void dlm_recoverd_stop(struct dlm_ls *ls)
276{
277 kthread_stop(ls->ls_recoverd_task);
278}
279
280void dlm_recoverd_suspend(struct dlm_ls *ls)
281{
282 wake_up(&ls->ls_wait_general);
283 mutex_lock(&ls->ls_recoverd_active);
284}
285
286void dlm_recoverd_resume(struct dlm_ls *ls)
287{
288 mutex_unlock(&ls->ls_recoverd_active);
289}
290
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
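
The request[1] member is the pre-C99 "trailing array" idiom: the struct is deliberately over-allocated so the one-element array can hold the whole saved message. A minimal userspace sketch of the same allocation pattern, with malloc standing in for kmalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	int nodeid;
	char request[1];	/* really 'length' bytes, see malloc below */
};

int main(void)
{
	const char msg[] = "saved message";
	size_t length = sizeof(msg);

	/* over-allocate: struct header plus the real payload length */
	struct entry *e = malloc(sizeof(struct entry) + length);
	if (!e)
		return 1;
	e->nodeid = 2;
	memcpy(e->request, msg, length);
	printf("from node %d: %s\n", e->nodeid, e->request);
	free(e);
	return 0;
}
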
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
 43		log_print("dlm_add_requestqueue: out of memory");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
 145		/* we're no longer the master of this resource; the sender
 146		   will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
 157		/* this reply is from the former master of the resource;
 158		   we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix="dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113 res32->length = res->length - (sizeof(struct dlm_lock_result) -
114 sizeof(struct dlm_lock_result32));
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
 175	/* noqueue requests that fail may need to be removed from the
 176	   proc's locks list; there should be a better way of detecting
 177	   this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error, force = 0;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
345 force = 2;
346
347 lockspace = ls->ls_local_handle;
348
349 /* dlm_release_lockspace waits for references to go to zero,
350 so all processes will need to close their device for the ls
 351	   before the release will proceed */
352
353 dlm_put_lockspace(ls);
354 error = dlm_release_lockspace(lockspace, force);
355 out:
356 return error;
357}
358
359/* Check the user's version matches ours */
360static int check_version(struct dlm_write_request *req)
361{
362 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
363 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
364 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
365
366 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
367 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
368 current->comm,
369 current->pid,
370 req->version[0],
371 req->version[1],
372 req->version[2],
373 DLM_DEVICE_VERSION_MAJOR,
374 DLM_DEVICE_VERSION_MINOR,
375 DLM_DEVICE_VERSION_PATCH);
376 return -EINVAL;
377 }
378 return 0;
379}
380
381/*
382 * device_write
383 *
384 * device_user_lock
385 * dlm_user_request -> request_lock
386 * dlm_user_convert -> convert_lock
387 *
388 * device_user_unlock
389 * dlm_user_unlock -> unlock_lock
390 * dlm_user_cancel -> cancel_lock
391 *
392 * device_create_lockspace
393 * dlm_new_lockspace
394 *
395 * device_remove_lockspace
396 * dlm_release_lockspace
397 */
398
399/* a write to a lockspace device is a lock or unlock request, a write
400 to the control device is to create/remove a lockspace */
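
For orientation, a hedged userspace sketch of the control-device half follows; the device path and the exact dlm_device.h layout are assumptions that must match the installed headers. On success the write returns the minor number of the newly created lockspace device, mirroring device_create_lockspace() below:

/* Sketch: create a lockspace by writing a dlm_write_request to the
 * control device. Assumes <linux/dlm_device.h> is available and that
 * the control node is /dev/misc/dlm-control (path varies by distro). */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	const char name[] = "testls";
	size_t len = sizeof(struct dlm_write_request) + sizeof(name);
	struct dlm_write_request *req = calloc(1, len);
	int fd, minor;

	if (!req)
		return 1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	strcpy(req->i.lspace.name, name);

	fd = open("/dev/misc/dlm-control", O_WRONLY);
	if (fd < 0) { perror("open"); return 1; }

	/* on success the write returns the new device's minor number */
	minor = write(fd, req, len);
	printf("lockspace minor: %d\n", minor);
	close(fd);
	free(req);
	return 0;
}
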
401
402static ssize_t device_write(struct file *file, const char __user *buf,
403 size_t count, loff_t *ppos)
404{
405 struct dlm_user_proc *proc = file->private_data;
406 struct dlm_write_request *kbuf;
407 sigset_t tmpsig, allsigs;
408 int error;
409
410#ifdef CONFIG_COMPAT
411 if (count < sizeof(struct dlm_write_request32))
412#else
413 if (count < sizeof(struct dlm_write_request))
414#endif
415 return -EINVAL;
416
417 kbuf = kmalloc(count, GFP_KERNEL);
418 if (!kbuf)
419 return -ENOMEM;
420
421 if (copy_from_user(kbuf, buf, count)) {
422 error = -EFAULT;
423 goto out_free;
424 }
425
426 if (check_version(kbuf)) {
427 error = -EBADE;
428 goto out_free;
429 }
430
431#ifdef CONFIG_COMPAT
432 if (!kbuf->is64bit) {
433 struct dlm_write_request32 *k32buf;
434 k32buf = (struct dlm_write_request32 *)kbuf;
435 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
436 sizeof(struct dlm_write_request32)), GFP_KERNEL);
 437		if (!kbuf) {
 438			kfree(k32buf);
 439			return -ENOMEM;
 440		}
439
440 if (proc)
441 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
442 compat_input(kbuf, k32buf);
443 kfree(k32buf);
444 }
445#endif
446
 447	/* do we really need this? can a write happen after a close? */
 448	if (proc && (kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
 449	    test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
 450		error = -EINVAL;
 451		goto out_free;
 452	}
451
452 sigfillset(&allsigs);
453 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
454
455 error = -EINVAL;
456
457 switch (kbuf->cmd)
458 {
459 case DLM_USER_LOCK:
460 if (!proc) {
461 log_print("no locking on control device");
462 goto out_sig;
463 }
464 error = device_user_lock(proc, &kbuf->i.lock);
465 break;
466
467 case DLM_USER_UNLOCK:
468 if (!proc) {
469 log_print("no locking on control device");
470 goto out_sig;
471 }
472 error = device_user_unlock(proc, &kbuf->i.lock);
473 break;
474
475 case DLM_USER_CREATE_LOCKSPACE:
476 if (proc) {
477 log_print("create/remove only on control device");
478 goto out_sig;
479 }
480 error = device_create_lockspace(&kbuf->i.lspace);
481 break;
482
483 case DLM_USER_REMOVE_LOCKSPACE:
484 if (proc) {
485 log_print("create/remove only on control device");
486 goto out_sig;
487 }
488 error = device_remove_lockspace(&kbuf->i.lspace);
489 break;
490
491 default:
492 log_print("Unknown command passed to DLM device : %d\n",
493 kbuf->cmd);
494 }
495
496 out_sig:
497 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
498 recalc_sigpending();
499 out_free:
500 kfree(kbuf);
501 return error;
502}
503
504/* Every process that opens the lockspace device has its own "proc" structure
505 hanging off the open file that's used to keep track of locks owned by the
506 process and asts that need to be delivered to the process. */
507
508static int device_open(struct inode *inode, struct file *file)
509{
510 struct dlm_user_proc *proc;
511 struct dlm_ls *ls;
512
513 ls = dlm_find_lockspace_device(iminor(inode));
514 if (!ls)
515 return -ENOENT;
516
517 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
518 if (!proc) {
519 dlm_put_lockspace(ls);
520 return -ENOMEM;
521 }
522
523 proc->lockspace = ls->ls_local_handle;
524 INIT_LIST_HEAD(&proc->asts);
525 INIT_LIST_HEAD(&proc->locks);
526 spin_lock_init(&proc->asts_spin);
527 spin_lock_init(&proc->locks_spin);
528 init_waitqueue_head(&proc->wait);
529 file->private_data = proc;
530
531 return 0;
532}
533
534static int device_close(struct inode *inode, struct file *file)
535{
536 struct dlm_user_proc *proc = file->private_data;
537 struct dlm_ls *ls;
538 sigset_t tmpsig, allsigs;
539
540 ls = dlm_find_lockspace_local(proc->lockspace);
541 if (!ls)
542 return -ENOENT;
543
544 sigfillset(&allsigs);
545 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
546
547 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
548
549 dlm_clear_proc_locks(ls, proc);
550
551 /* at this point no more lkb's should exist for this lockspace,
552 so there's no chance of dlm_user_add_ast() being called and
553 looking for lkb->ua->proc */
554
555 kfree(proc);
556 file->private_data = NULL;
557
558 dlm_put_lockspace(ls);
559 dlm_put_lockspace(ls); /* for the find in device_open() */
560
561 /* FIXME: AUTOFREE: if this ls is no longer used do
562 device_remove_lockspace() */
563
564 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
565 recalc_sigpending();
566
567 return 0;
568}
569
570static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
571 int bmode, char __user *buf, size_t count)
572{
573#ifdef CONFIG_COMPAT
574 struct dlm_lock_result32 result32;
575#endif
576 struct dlm_lock_result result;
577 void *resultptr;
578 int error=0;
579 int len;
580 int struct_len;
581
582 memset(&result, 0, sizeof(struct dlm_lock_result));
583 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
584 result.user_lksb = ua->user_lksb;
585
 586	/* FIXME: dlm1 updates the user's bastparam/addr in a conversion only
 587	   if the conversion is successful. See code in dlm_user_convert()
 588	   for updating ua from ua_tmp. OpenVMS, though, notes that a new
 589	   blocking AST address and parameter are set even if the conversion
 590	   fails, so maybe we should just do that. */
591
592 if (type == AST_BAST) {
593 result.user_astaddr = ua->bastaddr;
594 result.user_astparam = ua->bastparam;
595 result.bast_mode = bmode;
596 } else {
597 result.user_astaddr = ua->castaddr;
598 result.user_astparam = ua->castparam;
599 }
600
601#ifdef CONFIG_COMPAT
602 if (compat)
603 len = sizeof(struct dlm_lock_result32);
604 else
605#endif
606 len = sizeof(struct dlm_lock_result);
607 struct_len = len;
608
609 /* copy lvb to userspace if there is one, it's been updated, and
610 the user buffer has space for it */
611
612 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
613 count >= len + DLM_USER_LVB_LEN) {
614 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
615 DLM_USER_LVB_LEN)) {
616 error = -EFAULT;
617 goto out;
618 }
619
620 result.lvb_offset = len;
621 len += DLM_USER_LVB_LEN;
622 }
623
624 result.length = len;
625 resultptr = &result;
626#ifdef CONFIG_COMPAT
627 if (compat) {
628 compat_output(&result, &result32);
629 resultptr = &result32;
630 }
631#endif
632
633 if (copy_to_user(buf, resultptr, struct_len))
634 error = -EFAULT;
635 else
636 error = len;
637 out:
638 return error;
639}
640
641/* a read returns a single ast described in a struct dlm_lock_result */
642
643static ssize_t device_read(struct file *file, char __user *buf, size_t count,
644 loff_t *ppos)
645{
646 struct dlm_user_proc *proc = file->private_data;
647 struct dlm_lkb *lkb;
648 struct dlm_user_args *ua;
649 DECLARE_WAITQUEUE(wait, current);
650 int error, type=0, bmode=0, removed = 0;
651
652#ifdef CONFIG_COMPAT
653 if (count < sizeof(struct dlm_lock_result32))
654#else
655 if (count < sizeof(struct dlm_lock_result))
656#endif
657 return -EINVAL;
658
659 /* do we really need this? can a read happen after a close? */
660 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
661 return -EINVAL;
662
663 spin_lock(&proc->asts_spin);
664 if (list_empty(&proc->asts)) {
665 if (file->f_flags & O_NONBLOCK) {
666 spin_unlock(&proc->asts_spin);
667 return -EAGAIN;
668 }
669
670 add_wait_queue(&proc->wait, &wait);
671
672 repeat:
673 set_current_state(TASK_INTERRUPTIBLE);
674 if (list_empty(&proc->asts) && !signal_pending(current)) {
675 spin_unlock(&proc->asts_spin);
676 schedule();
677 spin_lock(&proc->asts_spin);
678 goto repeat;
679 }
680 set_current_state(TASK_RUNNING);
681 remove_wait_queue(&proc->wait, &wait);
682
683 if (signal_pending(current)) {
684 spin_unlock(&proc->asts_spin);
685 return -ERESTARTSYS;
686 }
687 }
688
689 if (list_empty(&proc->asts)) {
690 spin_unlock(&proc->asts_spin);
691 return -EAGAIN;
692 }
693
 694	/* there may be both completion and blocking asts to return for
 695	   the lkb; don't remove lkb from asts list unless no asts remain */
696
697 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
698
699 if (lkb->lkb_ast_type & AST_COMP) {
700 lkb->lkb_ast_type &= ~AST_COMP;
701 type = AST_COMP;
702 } else if (lkb->lkb_ast_type & AST_BAST) {
703 lkb->lkb_ast_type &= ~AST_BAST;
704 type = AST_BAST;
705 bmode = lkb->lkb_bastmode;
706 }
707
708 if (!lkb->lkb_ast_type) {
709 list_del(&lkb->lkb_astqueue);
710 removed = 1;
711 }
712 spin_unlock(&proc->asts_spin);
713
714 ua = (struct dlm_user_args *)lkb->lkb_astparam;
715 error = copy_result_to_user(ua,
716 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
717 type, bmode, buf, count);
718
 719	/* removes the reference taken for the proc->asts list by
 720	   dlm_user_add_ast() and may result in the lkb being freed */
721 if (removed)
722 dlm_put_lkb(lkb);
723
724 return error;
725}
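
And the matching hedged sketch of the consumer side: one read drains one AST, laid out as copy_result_to_user() above describes (the struct first, the optional LVB at lvb_offset). The device name follows the name_prefix scheme in device_create_lockspace(); the path is an assumption:

/* Sketch: drain one AST from an open lockspace device. The buffer must
 * be at least sizeof(struct dlm_lock_result) plus room for an LVB. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	char buf[sizeof(struct dlm_lock_result) + DLM_USER_LVB_LEN];
	struct dlm_lock_result *res = (struct dlm_lock_result *) buf;
	int fd = open("/dev/misc/dlm_testls", O_RDWR); /* assumed name */
	ssize_t n;

	if (fd < 0) { perror("open"); return 1; }

	/* blocks until a completion or blocking AST is queued */
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("lkid %u status %d lvb at offset %u\n",
		       res->lksb.sb_lkid, res->lksb.sb_status,
		       res->lvb_offset);
	close(fd);
	return 0;
}
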
726
727static unsigned int device_poll(struct file *file, poll_table *wait)
728{
729 struct dlm_user_proc *proc = file->private_data;
730
731 poll_wait(file, &proc->wait, wait);
732
733 spin_lock(&proc->asts_spin);
734 if (!list_empty(&proc->asts)) {
735 spin_unlock(&proc->asts_spin);
736 return POLLIN | POLLRDNORM;
737 }
738 spin_unlock(&proc->asts_spin);
739 return 0;
740}
741
742static int ctl_device_open(struct inode *inode, struct file *file)
743{
744 file->private_data = NULL;
745 return 0;
746}
747
748static int ctl_device_close(struct inode *inode, struct file *file)
749{
750 return 0;
751}
752
753static struct file_operations device_fops = {
754 .open = device_open,
755 .release = device_close,
756 .read = device_read,
757 .write = device_write,
758 .poll = device_poll,
759 .owner = THIS_MODULE,
760};
761
762static struct file_operations ctl_device_fops = {
763 .open = ctl_device_open,
764 .release = ctl_device_close,
765 .write = device_write,
766 .owner = THIS_MODULE,
767};
768
769int dlm_user_init(void)
770{
771 int error;
772
773 ctl_device.name = "dlm-control";
774 ctl_device.fops = &ctl_device_fops;
775 ctl_device.minor = MISC_DYNAMIC_MINOR;
776
777 error = misc_register(&ctl_device);
778 if (error)
779 log_print("misc_register failed for control device");
780
781 return error;
782}
783
784void dlm_user_exit(void)
785{
786 misc_deregister(&ctl_device);
787}
788
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
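The out/in pairs above convert every fixed-width field between host byte
order and the little-endian wire format, once before send and once after
receive. A minimal user-space sketch of the same pattern, with glibc's
<endian.h> helpers standing in for cpu_to_le32()/le32_to_cpu(); the demo
struct and its fields are hypothetical:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct demo_msg {
	uint32_t m_type;
	uint32_t m_nodeid;
};

static void demo_msg_out(struct demo_msg *ms)	/* host -> wire */
{
	ms->m_type = htole32(ms->m_type);
	ms->m_nodeid = htole32(ms->m_nodeid);
}

static void demo_msg_in(struct demo_msg *ms)	/* wire -> host */
{
	ms->m_type = le32toh(ms->m_type);
	ms->m_nodeid = le32toh(ms->m_nodeid);
}

int main(void)
{
	struct demo_msg ms = { .m_type = 7, .m_nodeid = 42 };

	demo_msg_out(&ms);	/* byte order is now fixed for the wire */
	demo_msg_in(&ms);	/* and back in host order */
	printf("type=%u nodeid=%u\n", ms.m_type, ms.m_nodeid);
	return 0;
}

Because every field is converted in place, the same struct memory can be
handed straight to the transport after _out() and used directly after _in().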
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
new file mode 100644
index 000000000000..ca6562451eeb
--- /dev/null
+++ b/fs/ecryptfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the Linux 2.6 eCryptfs
3#
4
5obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
6
7ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
new file mode 100644
index 000000000000..ed35a9712fa1
--- /dev/null
+++ b/fs/ecryptfs/crypto.c
@@ -0,0 +1,1659 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/mount.h>
28#include <linux/pagemap.h>
29#include <linux/random.h>
30#include <linux/compiler.h>
31#include <linux/key.h>
32#include <linux/namei.h>
33#include <linux/crypto.h>
34#include <linux/file.h>
35#include <linux/scatterlist.h>
36#include "ecryptfs_kernel.h"
37
38static int
39ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
40 struct page *dst_page, int dst_offset,
41 struct page *src_page, int src_offset, int size,
42 unsigned char *iv);
43static int
44ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
45 struct page *dst_page, int dst_offset,
46 struct page *src_page, int src_offset, int size,
47 unsigned char *iv);
48
49/**
50 * ecryptfs_to_hex
51 * @dst: Buffer to take hex character representation of contents of
52 * src; must be at least of size (src_size * 2)
53 * @src: Buffer to be converted to a hex string representation
54 * @src_size: number of bytes to convert
55 */
56void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
57{
58 int x;
59
60 for (x = 0; x < src_size; x++)
61 sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
62}
63
64/**
65 * ecryptfs_from_hex
66 * @dst: Buffer to take the bytes from src hex; must hold at least
67 * dst_size bytes
68 * @src: Buffer to be converted from a hex string representation to raw value
69 * @dst_size: Size of dst buffer, or number of hex character pairs to convert
70 */
71void ecryptfs_from_hex(char *dst, char *src, int dst_size)
72{
73 int x;
74 char tmp[3] = { 0, };
75
76 for (x = 0; x < dst_size; x++) {
77 tmp[0] = src[x * 2];
78 tmp[1] = src[x * 2 + 1];
79 dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16);
80 }
81}
82
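A user-space round trip of the same hex scheme the two helpers above
implement (standard C only; strtol() replaces the kernel's
simple_strtol()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void to_hex(char *dst, const unsigned char *src, size_t src_size)
{
	size_t x;

	for (x = 0; x < src_size; x++)
		sprintf(&dst[x * 2], "%.2x", src[x]);
}

static void from_hex(unsigned char *dst, const char *src, size_t dst_size)
{
	char tmp[3] = { 0, };
	size_t x;

	for (x = 0; x < dst_size; x++) {
		tmp[0] = src[x * 2];
		tmp[1] = src[x * 2 + 1];
		dst[x] = (unsigned char)strtol(tmp, NULL, 16);
	}
}

int main(void)
{
	unsigned char raw[4] = { 0xde, 0xad, 0xbe, 0xef }, back[4];
	char hex[sizeof(raw) * 2 + 1];

	to_hex(hex, raw, sizeof(raw));
	hex[sizeof(raw) * 2] = '\0';	/* the kernel helper does not terminate */
	from_hex(back, hex, sizeof(back));
	printf("%s round-trips: %s\n", hex,
	       memcmp(raw, back, sizeof(raw)) ? "no" : "yes");
	return 0;
}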
83/**
84 * ecryptfs_calculate_md5 - calculates the md5 of @src
85 * @dst: Pointer to 16 bytes of allocated memory
86 * @crypt_stat: Pointer to crypt_stat struct for the current inode
87 * @src: Data to be md5'd
88 * @len: Length of @src
89 *
90 * Uses the allocated crypto context that crypt_stat references to
91 * generate the MD5 sum of the contents of src.
92 */
93static int ecryptfs_calculate_md5(char *dst,
94 struct ecryptfs_crypt_stat *crypt_stat,
95 char *src, int len)
96{
97 int rc = 0;
98 struct scatterlist sg;
99
100 mutex_lock(&crypt_stat->cs_md5_tfm_mutex);
101 sg_init_one(&sg, (u8 *)src, len);
102 if (!crypt_stat->md5_tfm) {
103 crypt_stat->md5_tfm =
104 crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
105 if (!crypt_stat->md5_tfm) {
106 rc = -ENOMEM;
107 ecryptfs_printk(KERN_ERR, "Error attempting to "
108 "allocate crypto context\n");
109 goto out;
110 }
111 }
112 crypto_digest_init(crypt_stat->md5_tfm);
113 crypto_digest_update(crypt_stat->md5_tfm, &sg, 1);
114 crypto_digest_final(crypt_stat->md5_tfm, dst);
115 mutex_unlock(&crypt_stat->cs_md5_tfm_mutex);
116out:
117 return rc;
118}
119
120/**
121 * ecryptfs_derive_iv
122 * @iv: Destination for the derived IV value
123 * @crypt_stat: Pointer to crypt_stat struct for the current inode
124 * @offset: Offset of the page whose IV we are to derive
125 *
126 * Generate the initialization vector from the given root IV and page
127 * offset.
128 *
129 * Returns zero on success; non-zero on error.
130 */
131static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
132 pgoff_t offset)
133{
134 int rc = 0;
135 char dst[MD5_DIGEST_SIZE];
136 char src[ECRYPTFS_MAX_IV_BYTES + 16];
137
138 if (unlikely(ecryptfs_verbosity > 0)) {
139 ecryptfs_printk(KERN_DEBUG, "root iv:\n");
140 ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes);
141 }
142 /* TODO: It is probably secure to just cast the least
143 * significant bits of the root IV into an unsigned long and
144 * add the offset to that rather than go through all this
145 * hashing business. -Halcrow */
146 memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes);
147 memset((src + crypt_stat->iv_bytes), 0, 16);
148 snprintf((src + crypt_stat->iv_bytes), 16, "%ld", offset);
149 if (unlikely(ecryptfs_verbosity > 0)) {
150 ecryptfs_printk(KERN_DEBUG, "source:\n");
151 ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
152 }
153 rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
154 (crypt_stat->iv_bytes + 16));
155 if (rc) {
156 ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
157 "MD5 while generating IV for a page\n");
158 goto out;
159 }
160 memcpy(iv, dst, crypt_stat->iv_bytes);
161 if (unlikely(ecryptfs_verbosity > 0)) {
162 ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
163 ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
164 }
165out:
166 return rc;
167}
168
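The derivation above is: IV = the first iv_bytes of MD5(root_iv ||
zero-padded ASCII page offset). A user-space sketch, assuming OpenSSL's
legacy MD5() from <openssl/md5.h> (build with -lcrypto); the 16-byte IV
size is an assumption for the demo:

#include <openssl/md5.h>
#include <stdio.h>
#include <string.h>

#define IV_BYTES 16

static void derive_iv(unsigned char *iv, const unsigned char *root_iv,
		      long offset)
{
	unsigned char src[IV_BYTES + 16];
	unsigned char dst[MD5_DIGEST_LENGTH];

	memcpy(src, root_iv, IV_BYTES);
	memset(src + IV_BYTES, 0, 16);
	snprintf((char *)(src + IV_BYTES), 16, "%ld", offset);
	MD5(src, IV_BYTES + 16, dst);		/* hash root IV + offset */
	memcpy(iv, dst, IV_BYTES);		/* truncate to IV size */
}

int main(void)
{
	unsigned char root_iv[IV_BYTES] = { 0 }, iv[IV_BYTES];
	int i;

	derive_iv(iv, root_iv, 3);	/* per-page IV for page 3 */
	for (i = 0; i < IV_BYTES; i++)
		printf("%.2x", iv[i]);
	printf("\n");
	return 0;
}

Deriving each IV from the root IV and the offset keeps IVs unique per
page without storing one IV per page on disk.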
169/**
170 * ecryptfs_init_crypt_stat
171 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
172 *
173 * Initialize the crypt_stat structure.
174 */
175void
176ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
177{
178 memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
179 mutex_init(&crypt_stat->cs_mutex);
180 mutex_init(&crypt_stat->cs_tfm_mutex);
181 mutex_init(&crypt_stat->cs_md5_tfm_mutex);
182 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED);
183}
184
185/**
186 * ecryptfs_destruct_crypt_stat
187 * @crypt_stat: Pointer to the crypt_stat struct to free.
188 *
189 * Releases all memory associated with a crypt_stat struct.
190 */
191void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
192{
193 if (crypt_stat->tfm)
194 crypto_free_tfm(crypt_stat->tfm);
195 if (crypt_stat->md5_tfm)
196 crypto_free_tfm(crypt_stat->md5_tfm);
197 memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
198}
199
200void ecryptfs_destruct_mount_crypt_stat(
201 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
202{
203 if (mount_crypt_stat->global_auth_tok_key)
204 key_put(mount_crypt_stat->global_auth_tok_key);
205 if (mount_crypt_stat->global_key_tfm)
206 crypto_free_tfm(mount_crypt_stat->global_key_tfm);
207 memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat));
208}
209
210/**
211 * virt_to_scatterlist
212 * @addr: Virtual address
213 * @size: Size of data; should be an even multiple of the block size
214 * @sg: Pointer to scatterlist array; set to NULL to obtain only
215 * the number of scatterlist structs required in array
216 * @sg_size: Max array size
217 *
218 * Fills in a scatterlist array with page references for a passed
219 * virtual address.
220 *
221 * Returns the number of scatterlist structs used in the array
222 */
223int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
224 int sg_size)
225{
226 int i = 0;
227 struct page *pg;
228 int offset;
229 int remainder_of_page;
230
231 while (size > 0 && i < sg_size) {
232 pg = virt_to_page(addr);
233 offset = offset_in_page(addr);
234 if (sg) {
235 sg[i].page = pg;
236 sg[i].offset = offset;
237 }
238 remainder_of_page = PAGE_CACHE_SIZE - offset;
239 if (size >= remainder_of_page) {
240 if (sg)
241 sg[i].length = remainder_of_page;
242 addr += remainder_of_page;
243 size -= remainder_of_page;
244 } else {
245 if (sg)
246 sg[i].length = size;
247 addr += size;
248 size = 0;
249 }
250 i++;
251 }
252 if (size > 0)
253 return -ENOMEM;
254 return i;
255}
256
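The loop above is just page-walking arithmetic: each scatterlist entry
covers from the current offset to the end of its page, or to the end of
the buffer, whichever comes first. A user-space sketch of the entry count
it produces (PAGE_SIZE stands in for PAGE_CACHE_SIZE):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static int count_sg_entries(uintptr_t addr, long size)
{
	int i = 0;

	while (size > 0) {
		long offset = addr & (PAGE_SIZE - 1);
		long chunk = PAGE_SIZE - offset;	/* rest of this page */

		if (chunk > size)
			chunk = size;
		addr += chunk;
		size -= chunk;
		i++;
	}
	return i;
}

int main(void)
{
	/* 9000 bytes starting 100 bytes into a page touch three pages. */
	printf("%d entries\n", count_sg_entries(0x1000 + 100, 9000));
	return 0;
}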
257/**
258 * encrypt_scatterlist
259 * @crypt_stat: Pointer to the crypt_stat struct holding the key and cipher context
260 * @dest_sg: Destination of encrypted data
261 * @src_sg: Data to be encrypted
262 * @size: Length of data to be encrypted
263 * @iv: IV to use during encryption
264 *
265 * Returns zero on success; negative value on error
266 */
267static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
268 struct scatterlist *dest_sg,
269 struct scatterlist *src_sg, int size,
270 unsigned char *iv)
271{
272 int rc = 0;
273
274 BUG_ON(!crypt_stat || !crypt_stat->tfm
275 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
276 ECRYPTFS_STRUCT_INITIALIZED));
277 if (unlikely(ecryptfs_verbosity > 0)) {
278 ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
279 crypt_stat->key_size);
280 ecryptfs_dump_hex(crypt_stat->key,
281 crypt_stat->key_size);
282 }
283 /* Consider doing this once, when the file is opened */
284 mutex_lock(&crypt_stat->cs_tfm_mutex);
285 rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
286 crypt_stat->key_size);
287 if (rc) {
288 ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
289 rc);
290 mutex_unlock(&crypt_stat->cs_tfm_mutex);
291 rc = -EINVAL;
292 goto out;
293 }
294 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
295 crypto_cipher_encrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, iv);
296 mutex_unlock(&crypt_stat->cs_tfm_mutex);
297out:
298 return rc;
299}
300
301static void
302ecryptfs_extent_to_lwr_pg_idx_and_offset(unsigned long *lower_page_idx,
303 int *byte_offset,
304 struct ecryptfs_crypt_stat *crypt_stat,
305 unsigned long extent_num)
306{
307 unsigned long lower_extent_num;
308 int extents_occupied_by_headers_at_front;
309 int bytes_occupied_by_headers_at_front;
310 int extent_offset;
311 int extents_per_page;
312
313 bytes_occupied_by_headers_at_front =
314 ( crypt_stat->header_extent_size
315 * crypt_stat->num_header_extents_at_front );
316 extents_occupied_by_headers_at_front =
317 ( bytes_occupied_by_headers_at_front
318 / crypt_stat->extent_size );
319 lower_extent_num = extents_occupied_by_headers_at_front + extent_num;
320 extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
321 (*lower_page_idx) = lower_extent_num / extents_per_page;
322 extent_offset = lower_extent_num % extents_per_page;
323 (*byte_offset) = extent_offset * crypt_stat->extent_size;
324 ecryptfs_printk(KERN_DEBUG, " * crypt_stat->header_extent_size = "
325 "[%d]\n", crypt_stat->header_extent_size);
326 ecryptfs_printk(KERN_DEBUG, " * crypt_stat->"
327 "num_header_extents_at_front = [%d]\n",
328 crypt_stat->num_header_extents_at_front);
329 ecryptfs_printk(KERN_DEBUG, " * extents_occupied_by_headers_at_"
330 "front = [%d]\n", extents_occupied_by_headers_at_front);
331 ecryptfs_printk(KERN_DEBUG, " * lower_extent_num = [0x%.16x]\n",
332 lower_extent_num);
333 ecryptfs_printk(KERN_DEBUG, " * extents_per_page = [%d]\n",
334 extents_per_page);
335 ecryptfs_printk(KERN_DEBUG, " * (*lower_page_idx) = [0x%.16x]\n",
336 (*lower_page_idx));
337 ecryptfs_printk(KERN_DEBUG, " * extent_offset = [%d]\n",
338 extent_offset);
339 ecryptfs_printk(KERN_DEBUG, " * (*byte_offset) = [%d]\n",
340 (*byte_offset));
341}
342
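Worked example of the mapping above: with 4K extents, 4K host pages, and
one 8K header extent at the front (a file created on an 8K-page machine),
data extent 0 lives at lower page 2, byte offset 0. The numbers are
illustrative; a user-space check:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long header_extent_size = 8192;
	unsigned long num_header_extents_at_front = 1;
	unsigned long extent_size = 4096;
	unsigned long extent_num = 0;		/* first data extent */

	unsigned long hdr_bytes = header_extent_size
				  * num_header_extents_at_front;
	unsigned long hdr_extents = hdr_bytes / extent_size;
	unsigned long lower_extent = hdr_extents + extent_num;
	unsigned long extents_per_page = PAGE_SIZE / extent_size;

	printf("lower page %lu, byte offset %lu\n",
	       lower_extent / extents_per_page,
	       (lower_extent % extents_per_page) * extent_size);
	return 0;
}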
343static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx,
344 struct page *lower_page,
345 struct inode *lower_inode,
346 int byte_offset_in_page, int bytes_to_write)
347{
348 int rc = 0;
349
350 if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
351 rc = ecryptfs_commit_lower_page(lower_page, lower_inode,
352 ctx->param.lower_file,
353 byte_offset_in_page,
354 bytes_to_write);
355 if (rc) {
356 ecryptfs_printk(KERN_ERR, "Error calling lower "
357 "commit; rc = [%d]\n", rc);
358 goto out;
359 }
360 } else {
361 rc = ecryptfs_writepage_and_release_lower_page(lower_page,
362 lower_inode,
363 ctx->param.wbc);
364 if (rc) {
365 ecryptfs_printk(KERN_ERR, "Error calling lower "
366 "writepage(); rc = [%d]\n", rc);
367 goto out;
368 }
369 }
370out:
371 return rc;
372}
373
374static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx,
375 struct page **lower_page,
376 struct inode *lower_inode,
377 unsigned long lower_page_idx,
378 int byte_offset_in_page)
379{
380 int rc = 0;
381
382 if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
383 /* TODO: Limit this to only the data extents that are
384 * needed */
385 rc = ecryptfs_get_lower_page(lower_page, lower_inode,
386 ctx->param.lower_file,
387 lower_page_idx,
388 byte_offset_in_page,
389 (PAGE_CACHE_SIZE
390 - byte_offset_in_page));
391 if (rc) {
392 ecryptfs_printk(
393 KERN_ERR, "Error attempting to grab, map, "
394 "and prepare_write lower page with index "
395 "[0x%.16x]; rc = [%d]\n", lower_page_idx, rc);
396 goto out;
397 }
398 } else {
399 rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL,
400 lower_inode,
401 lower_page_idx);
402 if (rc) {
403 ecryptfs_printk(
404 KERN_ERR, "Error attempting to grab and map "
405 "lower page with index [0x%.16x]; rc = [%d]\n",
406 lower_page_idx, rc);
407 goto out;
408 }
409 }
410out:
411 return rc;
412}
413
414/**
415 * ecryptfs_encrypt_page
416 * @ctx: The context of the page
417 *
418 * Encrypt an eCryptfs page. This is done on a per-extent basis. Note
419 * that eCryptfs pages may straddle the lower pages -- for instance,
420 * if the file was created on a machine with an 8K page size
421 * (resulting in an 8K header), and then the file is copied onto a
422 * host with a 32K page size, then when reading page 0 of the eCryptfs
423 * file, 24K of page 0 of the lower file will be read and decrypted,
424 * and then 8K of page 1 of the lower file will be read and decrypted.
425 *
426 * The actual operations performed on each page depends on the
427 * contents of the ecryptfs_page_crypt_context struct.
428 *
429 * Returns zero on success; negative on error
430 */
431int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx)
432{
433 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
434 unsigned long base_extent;
435 unsigned long extent_offset = 0;
436 unsigned long lower_page_idx = 0;
437 unsigned long prior_lower_page_idx = 0;
438 struct page *lower_page;
439 struct inode *lower_inode;
440 struct ecryptfs_inode_info *inode_info;
441 struct ecryptfs_crypt_stat *crypt_stat;
442 int rc = 0;
443 int lower_byte_offset = 0;
444 int orig_byte_offset = 0;
445 int num_extents_per_page;
446#define ECRYPTFS_PAGE_STATE_UNREAD 0
447#define ECRYPTFS_PAGE_STATE_READ 1
448#define ECRYPTFS_PAGE_STATE_MODIFIED 2
449#define ECRYPTFS_PAGE_STATE_WRITTEN 3
450 int page_state;
451
452 lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host);
453 inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host);
454 crypt_stat = &inode_info->crypt_stat;
455 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
456 rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode,
457 ctx->param.lower_file);
458 if (rc)
459 ecryptfs_printk(KERN_ERR, "Error attempting to copy "
460 "page at index [0x%.16x]\n",
461 ctx->page->index);
462 goto out;
463 }
464 num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
465 base_extent = (ctx->page->index * num_extents_per_page);
466 page_state = ECRYPTFS_PAGE_STATE_UNREAD;
467 while (extent_offset < num_extents_per_page) {
468 ecryptfs_extent_to_lwr_pg_idx_and_offset(
469 &lower_page_idx, &lower_byte_offset, crypt_stat,
470 (base_extent + extent_offset));
471 if (prior_lower_page_idx != lower_page_idx
472 && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) {
473 rc = ecryptfs_write_out_page(ctx, lower_page,
474 lower_inode,
475 orig_byte_offset,
476 (PAGE_CACHE_SIZE
477 - orig_byte_offset));
478 if (rc) {
479 ecryptfs_printk(KERN_ERR, "Error attempting "
480 "to write out page; rc = [%d]"
481 "\n", rc);
482 goto out;
483 }
484 page_state = ECRYPTFS_PAGE_STATE_WRITTEN;
485 }
486 if (page_state == ECRYPTFS_PAGE_STATE_UNREAD
487 || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) {
488 rc = ecryptfs_read_in_page(ctx, &lower_page,
489 lower_inode, lower_page_idx,
490 lower_byte_offset);
491 if (rc) {
492 ecryptfs_printk(KERN_ERR, "Error attempting "
493 "to read in lower page with "
494 "index [0x%.16x]; rc = [%d]\n",
495 lower_page_idx, rc);
496 goto out;
497 }
498 orig_byte_offset = lower_byte_offset;
499 prior_lower_page_idx = lower_page_idx;
500 page_state = ECRYPTFS_PAGE_STATE_READ;
501 }
502 BUG_ON(!(page_state == ECRYPTFS_PAGE_STATE_MODIFIED
503 || page_state == ECRYPTFS_PAGE_STATE_READ));
504 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
505 (base_extent + extent_offset));
506 if (rc) {
507 ecryptfs_printk(KERN_ERR, "Error attempting to "
508 "derive IV for extent [0x%.16x]; "
509 "rc = [%d]\n",
510 (base_extent + extent_offset), rc);
511 goto out;
512 }
513 if (unlikely(ecryptfs_verbosity > 0)) {
514 ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
515 "with iv:\n");
516 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
517 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
518 "encryption:\n");
519 ecryptfs_dump_hex((char *)
520 (page_address(ctx->page)
521 + (extent_offset
522 * crypt_stat->extent_size)), 8);
523 }
524 rc = ecryptfs_encrypt_page_offset(
525 crypt_stat, lower_page, lower_byte_offset, ctx->page,
526 (extent_offset * crypt_stat->extent_size),
527 crypt_stat->extent_size, extent_iv);
528 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
529 "rc = [%d]\n",
530 (base_extent + extent_offset), rc);
531 if (unlikely(ecryptfs_verbosity > 0)) {
532 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
533 "encryption:\n");
534 ecryptfs_dump_hex((char *)(page_address(lower_page)
535 + lower_byte_offset), 8);
536 }
537 page_state = ECRYPTFS_PAGE_STATE_MODIFIED;
538 extent_offset++;
539 }
540 BUG_ON(orig_byte_offset != 0);
541 rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0,
542 (lower_byte_offset
543 + crypt_stat->extent_size));
544 if (rc) {
545 ecryptfs_printk(KERN_ERR, "Error attempting to write out "
546 "page; rc = [%d]\n", rc);
547 goto out;
548 }
549out:
550 return rc;
551}
552
553/**
554 * ecryptfs_decrypt_page
555 * @file: The ecryptfs file
556 * @page: The page in ecryptfs to decrypt
557 *
558 * Decrypt an eCryptfs page. This is done on a per-extent basis. Note
559 * that eCryptfs pages may straddle the lower pages -- for instance,
560 * if the file was created on a machine with an 8K page size
561 * (resulting in an 8K header), and then the file is copied onto a
562 * host with a 32K page size, then when reading page 0 of the eCryptfs
563 * file, 24K of page 0 of the lower file will be read and decrypted,
564 * and then 8K of page 1 of the lower file will be read and decrypted.
565 *
566 * Returns zero on success; negative on error
567 */
568int ecryptfs_decrypt_page(struct file *file, struct page *page)
569{
570 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
571 unsigned long base_extent;
572 unsigned long extent_offset = 0;
573 unsigned long lower_page_idx = 0;
574 unsigned long prior_lower_page_idx = 0;
575 struct page *lower_page;
576 char *lower_page_virt = NULL;
577 struct inode *lower_inode;
578 struct ecryptfs_crypt_stat *crypt_stat;
579 int rc = 0;
580 int byte_offset;
581 int num_extents_per_page;
582 int page_state;
583
584 crypt_stat = &(ecryptfs_inode_to_private(
585 page->mapping->host)->crypt_stat);
586 lower_inode = ecryptfs_inode_to_lower(page->mapping->host);
587 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
588 rc = ecryptfs_do_readpage(file, page, page->index);
589 if (rc)
590 ecryptfs_printk(KERN_ERR, "Error attempting to copy "
591 "page at index [0x%.16x]\n",
592 page->index);
593 goto out;
594 }
595 num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
596 base_extent = (page->index * num_extents_per_page);
597 lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache,
598 SLAB_KERNEL);
599 if (!lower_page_virt) {
600 rc = -ENOMEM;
601 ecryptfs_printk(KERN_ERR, "Error getting page for encrypted "
602 "lower page(s)\n");
603 goto out;
604 }
605 lower_page = virt_to_page(lower_page_virt);
606 page_state = ECRYPTFS_PAGE_STATE_UNREAD;
607 while (extent_offset < num_extents_per_page) {
608 ecryptfs_extent_to_lwr_pg_idx_and_offset(
609 &lower_page_idx, &byte_offset, crypt_stat,
610 (base_extent + extent_offset));
611 if (prior_lower_page_idx != lower_page_idx
612 || page_state == ECRYPTFS_PAGE_STATE_UNREAD) {
613 rc = ecryptfs_do_readpage(file, lower_page,
614 lower_page_idx);
615 if (rc) {
616 ecryptfs_printk(KERN_ERR, "Error reading "
617 "lower encrypted page; rc = "
618 "[%d]\n", rc);
619 goto out;
620 }
621 prior_lower_page_idx = lower_page_idx;
622 page_state = ECRYPTFS_PAGE_STATE_READ;
623 }
624 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
625 (base_extent + extent_offset));
626 if (rc) {
627 ecryptfs_printk(KERN_ERR, "Error attempting to "
628 "derive IV for extent [0x%.16x]; rc = "
629 "[%d]\n",
630 (base_extent + extent_offset), rc);
631 goto out;
632 }
633 if (unlikely(ecryptfs_verbosity > 0)) {
634 ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
635 "with iv:\n");
636 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
637 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
638 "decryption:\n");
639 ecryptfs_dump_hex((lower_page_virt + byte_offset), 8);
640 }
641 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
642 (extent_offset
643 * crypt_stat->extent_size),
644 lower_page, byte_offset,
645 crypt_stat->extent_size,
646 extent_iv);
647 if (rc != crypt_stat->extent_size) {
648 ecryptfs_printk(KERN_ERR, "Error attempting to "
649 "decrypt extent [0x%.16x]\n",
650 (base_extent + extent_offset));
651 goto out;
652 }
653 rc = 0;
654 if (unlikely(ecryptfs_verbosity > 0)) {
655 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
656 "decryption:\n");
657 ecryptfs_dump_hex((char *)(page_address(page)
658 + byte_offset), 8);
659 }
660 extent_offset++;
661 }
662out:
663 if (lower_page_virt)
664 kmem_cache_free(ecryptfs_lower_page_cache, lower_page_virt);
665 return rc;
666}
667
668/**
669 * decrypt_scatterlist
670 *
671 * Returns the number of bytes decrypted; negative value on error
672 */
673static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
674 struct scatterlist *dest_sg,
675 struct scatterlist *src_sg, int size,
676 unsigned char *iv)
677{
678 int rc = 0;
679
680 /* Consider doing this once, when the file is opened */
681 mutex_lock(&crypt_stat->cs_tfm_mutex);
682 rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
683 crypt_stat->key_size);
684 if (rc) {
685 ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
686 rc);
687 mutex_unlock(&crypt_stat->cs_tfm_mutex);
688 rc = -EINVAL;
689 goto out;
690 }
691 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
692 rc = crypto_cipher_decrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size,
693 iv);
694 mutex_unlock(&crypt_stat->cs_tfm_mutex);
695 if (rc) {
696 ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
697 rc);
698 goto out;
699 }
700 rc = size;
701out:
702 return rc;
703}
704
705/**
706 * ecryptfs_encrypt_page_offset
707 *
708 * Returns zero on success; negative value on error
709 */
710static int
711ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
712 struct page *dst_page, int dst_offset,
713 struct page *src_page, int src_offset, int size,
714 unsigned char *iv)
715{
716 struct scatterlist src_sg, dst_sg;
717
718 src_sg.page = src_page;
719 src_sg.offset = src_offset;
720 src_sg.length = size;
721 dst_sg.page = dst_page;
722 dst_sg.offset = dst_offset;
723 dst_sg.length = size;
724 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
725}
726
727/**
728 * ecryptfs_decrypt_page_offset
729 *
730 * Returns the number of bytes decrypted
731 */
732static int
733ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 src_sg.page = src_page;
741 src_sg.offset = src_offset;
742 src_sg.length = size;
743 dst_sg.page = dst_page;
744 dst_sg.offset = dst_offset;
745 dst_sg.length = size;
746 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
747}
748
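Both scatterlist paths boil down to: set the key, then run a CBC encrypt
or decrypt with an explicit IV over one extent. A user-space round trip
of that shape, assuming OpenSSL's EVP API (build with -lcrypto);
eCryptfs itself uses the in-kernel crypto API:

#include <openssl/evp.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char key[16] = "0123456789abcde";	/* 16 bytes incl. NUL */
	unsigned char iv[16] = { 0 };			/* per-extent IV */
	unsigned char pt[16] = "extent payload!";
	unsigned char ct[16], back[16];
	int len;
	EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

	/* Encrypt: key + IV; padding off since extents are block-aligned. */
	EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv);
	EVP_CIPHER_CTX_set_padding(ctx, 0);
	EVP_EncryptUpdate(ctx, ct, &len, pt, sizeof(pt));

	/* Decrypt with the same key and IV. */
	EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv);
	EVP_CIPHER_CTX_set_padding(ctx, 0);
	EVP_DecryptUpdate(ctx, back, &len, ct, sizeof(ct));

	printf("round trip %s\n",
	       memcmp(pt, back, sizeof(pt)) ? "failed" : "ok");
	EVP_CIPHER_CTX_free(ctx);
	return 0;
}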
749#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
750
751/**
752 * ecryptfs_init_crypt_ctx
753 * @crypt_stat: Uninitialized crypt_stat structure
754 *
755 * Initialize the crypto context.
756 *
757 * TODO: Performance: Keep a cache of initialized cipher contexts;
758 * only init if needed
759 */
760int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
761{
762 int rc = -EINVAL;
763
764 if (!crypt_stat->cipher) {
765 ecryptfs_printk(KERN_ERR, "No cipher specified\n");
766 goto out;
767 }
768 ecryptfs_printk(KERN_DEBUG,
769 "Initializing cipher [%s]; strlen = [%d]; "
770 "key_size_bits = [%d]\n",
771 crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
772 crypt_stat->key_size << 3);
773 if (crypt_stat->tfm) {
774 rc = 0;
775 goto out;
776 }
777 mutex_lock(&crypt_stat->cs_tfm_mutex);
778 crypt_stat->tfm = crypto_alloc_tfm(crypt_stat->cipher,
779 ECRYPTFS_DEFAULT_CHAINING_MODE
780 | CRYPTO_TFM_REQ_WEAK_KEY);
781 mutex_unlock(&crypt_stat->cs_tfm_mutex);
782 if (!crypt_stat->tfm) {
783 ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
784 "Error initializing cipher [%s]\n",
785 crypt_stat->cipher);
786 goto out;
787 }
788 rc = 0;
789out:
790 return rc;
791}
792
793static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat)
794{
795 int extent_size_tmp;
796
797 crypt_stat->extent_mask = 0xFFFFFFFF;
798 crypt_stat->extent_shift = 0;
799 if (crypt_stat->extent_size == 0)
800 return;
801 extent_size_tmp = crypt_stat->extent_size;
802 while ((extent_size_tmp & 0x01) == 0) {
803 extent_size_tmp >>= 1;
804 crypt_stat->extent_mask <<= 1;
805 crypt_stat->extent_shift++;
806 }
807}
808
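For any power-of-two extent size this yields the usual shift/mask pair;
a 4096-byte extent gives shift 12 and (as a 32-bit value) mask
0xfffff000. A quick user-space check of the same loop:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mask = 0xFFFFFFFF;
	int shift = 0;
	int tmp = 4096;			/* extent_size */

	while ((tmp & 0x01) == 0) {	/* strip trailing zero bits */
		tmp >>= 1;
		mask <<= 1;
		shift++;
	}
	printf("shift=%d mask=0x%.8x\n", shift, mask);
	return 0;
}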
809void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
810{
811 /* Default values; may be overwritten as we are parsing the
812 * packets. */
813 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
814 set_extent_mask_and_shift(crypt_stat);
815 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
816 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
817 crypt_stat->header_extent_size =
818 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
819 } else
820 crypt_stat->header_extent_size = PAGE_CACHE_SIZE;
821 crypt_stat->num_header_extents_at_front = 1;
822}
823
824/**
825 * ecryptfs_compute_root_iv
826 * @crypt_stat
827 *
828 * On error, sets the root IV to all 0's.
829 */
830int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
831{
832 int rc = 0;
833 char dst[MD5_DIGEST_SIZE];
834
835 BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
836 BUG_ON(crypt_stat->iv_bytes <= 0);
837 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) {
838 rc = -EINVAL;
839 ecryptfs_printk(KERN_WARNING, "Session key not valid; "
840 "cannot generate root IV\n");
841 goto out;
842 }
843 rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
844 crypt_stat->key_size);
845 if (rc) {
846 ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
847 "MD5 while generating root IV\n");
848 goto out;
849 }
850 memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
851out:
852 if (rc) {
853 memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
854 ECRYPTFS_SET_FLAG(crypt_stat->flags,
855 ECRYPTFS_SECURITY_WARNING);
856 }
857 return rc;
858}
859
860static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
861{
862 get_random_bytes(crypt_stat->key, crypt_stat->key_size);
863 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
864 ecryptfs_compute_root_iv(crypt_stat);
865 if (unlikely(ecryptfs_verbosity > 0)) {
866 ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n");
867 ecryptfs_dump_hex(crypt_stat->key,
868 crypt_stat->key_size);
869 }
870}
871
872/**
873 * ecryptfs_set_default_crypt_stat_vals
874 * @crypt_stat
875 *
876 * Default values in the event that policy does not override them.
877 */
878static void ecryptfs_set_default_crypt_stat_vals(
879 struct ecryptfs_crypt_stat *crypt_stat,
880 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
881{
882 ecryptfs_set_default_sizes(crypt_stat);
883 strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER);
884 crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES;
885 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
886 crypt_stat->file_version = ECRYPTFS_FILE_VERSION;
887 crypt_stat->mount_crypt_stat = mount_crypt_stat;
888}
889
890/**
891 * ecryptfs_new_file_context
892 * @ecryptfs_dentry
893 *
894 * If the crypto context for the file has not yet been established,
895 * this is where we do that. Establishing a new crypto context
896 * involves the following decisions:
897 * - What cipher to use?
898 * - What set of authentication tokens to use?
899 * Here we just worry about getting enough information into the
900 * authentication tokens so that we know that they are available.
901 * We associate the available authentication tokens with the new file
902 * via the set of signatures in the crypt_stat struct. Later, when
903 * the headers are actually written out, we may again defer to
904 * userspace to perform the encryption of the session key; for the
905 * foreseeable future, this will be the case with public key packets.
906 *
907 * Returns zero on success; non-zero otherwise
908 */
909/* Associate authentication token(s) with the file */
910int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
911{
912 int rc = 0;
913 struct ecryptfs_crypt_stat *crypt_stat =
914 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
915 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
916 &ecryptfs_superblock_to_private(
917 ecryptfs_dentry->d_sb)->mount_crypt_stat;
918 int cipher_name_len;
919
920 ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat);
921 /* See if there are mount crypt options */
922 if (mount_crypt_stat->global_auth_tok) {
923 ecryptfs_printk(KERN_DEBUG, "Initializing context for new "
924 "file using mount_crypt_stat\n");
925 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
926 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
927 memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++],
928 mount_crypt_stat->global_auth_tok_sig,
929 ECRYPTFS_SIG_SIZE_HEX);
930 cipher_name_len =
931 strlen(mount_crypt_stat->global_default_cipher_name);
932 memcpy(crypt_stat->cipher,
933 mount_crypt_stat->global_default_cipher_name,
934 cipher_name_len);
935 crypt_stat->cipher[cipher_name_len] = '\0';
936 crypt_stat->key_size =
937 mount_crypt_stat->global_default_cipher_key_size;
938 ecryptfs_generate_new_key(crypt_stat);
939 } else
940 /* We should not encounter this scenario since we
941 * should detect lack of global_auth_tok at mount time
942 * TODO: Applies to 0.1 release only; remove in future
943 * release */
944 BUG();
945 rc = ecryptfs_init_crypt_ctx(crypt_stat);
946 if (rc)
947 ecryptfs_printk(KERN_ERR, "Error initializing cryptographic "
948 "context for cipher [%s]: rc = [%d]\n",
949 crypt_stat->cipher, rc);
950 return rc;
951}
952
953/**
954 * contains_ecryptfs_marker - check for the ecryptfs marker
955 * @data: The data block in which to check
956 *
957 * Returns one if marker found; zero if not found
958 */
959int contains_ecryptfs_marker(char *data)
960{
961 u32 m_1, m_2;
962
963 memcpy(&m_1, data, 4);
964 m_1 = be32_to_cpu(m_1);
965 memcpy(&m_2, (data + 4), 4);
966 m_2 = be32_to_cpu(m_2);
967 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
968 return 1;
969 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
970 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
971 MAGIC_ECRYPTFS_MARKER);
972 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
973 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
974 return 0;
975}
976
977struct ecryptfs_flag_map_elem {
978 u32 file_flag;
979 u32 local_flag;
980};
981
982/* Add support for additional flags by adding elements here. */
983static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
984 {0x00000001, ECRYPTFS_ENABLE_HMAC},
985 {0x00000002, ECRYPTFS_ENCRYPTED}
986};
987
988/**
989 * ecryptfs_process_flags
990 * @crypt_stat
991 * @page_virt: Source data to be parsed
992 * @bytes_read: Updated with the number of bytes read
993 *
994 * Returns zero on success; non-zero if the flag set is invalid
995 */
996static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
997 char *page_virt, int *bytes_read)
998{
999 int rc = 0;
1000 int i;
1001 u32 flags;
1002
1003 memcpy(&flags, page_virt, 4);
1004 flags = be32_to_cpu(flags);
1005 for (i = 0; i < ((sizeof(ecryptfs_flag_map)
1006 / sizeof(struct ecryptfs_flag_map_elem))); i++)
1007 if (flags & ecryptfs_flag_map[i].file_flag) {
1008 ECRYPTFS_SET_FLAG(crypt_stat->flags,
1009 ecryptfs_flag_map[i].local_flag);
1010 } else
1011 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
1012 ecryptfs_flag_map[i].local_flag);
1013 /* Version is in top 8 bits of the 32-bit flag vector */
1014 crypt_stat->file_version = ((flags >> 24) & 0xFF);
1015 (*bytes_read) = 4;
1016 return rc;
1017}
1018
1019/**
1020 * write_ecryptfs_marker
1021 * @page_virt: The pointer to in a page to begin writing the marker
1022 * @written: Number of bytes written
1023 *
1024 * Marker = 0x3c81b7f5
1025 */
1026static void write_ecryptfs_marker(char *page_virt, size_t *written)
1027{
1028 u32 m_1, m_2;
1029
1030 get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1031 m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
1032 m_1 = cpu_to_be32(m_1);
1033 memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1034 m_2 = cpu_to_be32(m_2);
1035 memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
1036 (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
1037 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1038}
1039
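The marker is randomized: m_1 is fresh random data and m_2 = m_1 ^
MAGIC_ECRYPTFS_MARKER, so contains_ecryptfs_marker() can verify a file
without a fixed byte pattern ever appearing on disk. A user-space sketch
of the write/verify pair:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5

int main(void)
{
	uint32_t m_1, m_2;

	srand((unsigned)time(NULL));
	m_1 = (uint32_t)rand();			/* write side */
	m_2 = m_1 ^ MAGIC_ECRYPTFS_MARKER;

	printf("marker %s\n",			/* read side */
	       ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) ? "found" : "missing");
	return 0;
}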
1040static void
1041write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
1042 size_t *written)
1043{
1044 u32 flags = 0;
1045 int i;
1046
1047 for (i = 0; i < ((sizeof(ecryptfs_flag_map)
1048 / sizeof(struct ecryptfs_flag_map_elem))); i++)
1049 if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1050 ecryptfs_flag_map[i].local_flag))
1051 flags |= ecryptfs_flag_map[i].file_flag;
1052 /* Version is in top 8 bits of the 32-bit flag vector */
1053 flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
1054 flags = cpu_to_be32(flags);
1055 memcpy(page_virt, &flags, 4);
1056 (*written) = 4;
1057}
1058
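A user-space sketch of the flag word that ecryptfs_process_flags() parses
and write_ecryptfs_flags() emits: feature bits in the low bits, file
version in the top 8 bits (the 0x00000002 "encrypted" bit comes from the
flag map above):

#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAG_ENCRYPTED 0x00000002

int main(void)
{
	uint8_t version = 1;
	uint32_t flags = DEMO_FLAG_ENCRYPTED;

	/* pack, as write_ecryptfs_flags() does */
	flags |= (((uint32_t)version) << 24) & 0xFF000000;

	/* unpack, as ecryptfs_process_flags() does */
	printf("version=%u encrypted=%s\n",
	       (flags >> 24) & 0xFF,
	       (flags & DEMO_FLAG_ENCRYPTED) ? "yes" : "no");
	return 0;
}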
1059struct ecryptfs_cipher_code_str_map_elem {
1060 char cipher_str[16];
1061 u16 cipher_code;
1062};
1063
1064/* Add support for additional ciphers by adding elements here. The
1065 * cipher_code is whatever OpenPGP applications use to identify the
1066 * ciphers. List in order of probability. */
1067static struct ecryptfs_cipher_code_str_map_elem
1068ecryptfs_cipher_code_str_map[] = {
1069	{"aes", RFC2440_CIPHER_AES_128},
1070 {"blowfish", RFC2440_CIPHER_BLOWFISH},
1071 {"des3_ede", RFC2440_CIPHER_DES3_EDE},
1072 {"cast5", RFC2440_CIPHER_CAST_5},
1073 {"twofish", RFC2440_CIPHER_TWOFISH},
1074 {"cast6", RFC2440_CIPHER_CAST_6},
1075 {"aes", RFC2440_CIPHER_AES_192},
1076 {"aes", RFC2440_CIPHER_AES_256}
1077};
1078
1079/**
1080 * ecryptfs_code_for_cipher_string
1081 * @crypt_stat: The crypt_stat struct providing the cipher name and key size
1082 *
1083 * Returns zero on no match, or the cipher code on match
1084 */
1085u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1086{
1087 int i;
1088 u16 code = 0;
1089 struct ecryptfs_cipher_code_str_map_elem *map =
1090 ecryptfs_cipher_code_str_map;
1091
1092 if (strcmp(crypt_stat->cipher, "aes") == 0) {
1093 switch (crypt_stat->key_size) {
1094 case 16:
1095 code = RFC2440_CIPHER_AES_128;
1096 break;
1097 case 24:
1098 code = RFC2440_CIPHER_AES_192;
1099 break;
1100 case 32:
1101 code = RFC2440_CIPHER_AES_256;
1102 }
1103 } else {
1104 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1105 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
1106 code = map[i].cipher_code;
1107 break;
1108 }
1109 }
1110 return code;
1111}
1112
1113/**
1114 * ecryptfs_cipher_code_to_string
1115 * @str: Destination to write out the cipher name
1116 * @cipher_code: The code to convert to cipher name string
1117 *
1118 * Returns zero on success
1119 */
1120int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code)
1121{
1122 int rc = 0;
1123 int i;
1124
1125 str[0] = '\0';
1126 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1127 if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code)
1128 strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str);
1129 if (str[0] == '\0') {
1130 ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: "
1131 "[%d]\n", cipher_code);
1132 rc = -EINVAL;
1133 }
1134 return rc;
1135}
1136
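A user-space sketch of the string-to-code direction of this map. The
numeric values follow RFC 2440's symmetric-algorithm registry (CAST5 = 3,
Blowfish = 4, AES-128 = 7) and stand in for the RFC2440_* constants
above; treat the exact values as illustrative:

#include <stdio.h>
#include <string.h>

struct code_str_map_elem {
	char cipher_str[16];
	unsigned short cipher_code;
};

static struct code_str_map_elem map[] = {
	{"aes", 7},
	{"blowfish", 4},
	{"cast5", 3},
};

static unsigned short code_for_string(const char *str)
{
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (strcmp(str, map[i].cipher_str) == 0)
			return map[i].cipher_code;
	return 0;	/* no match, as in the kernel helper */
}

int main(void)
{
	printf("blowfish -> %u\n", code_for_string("blowfish"));
	return 0;
}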
1137/**
1138 * ecryptfs_read_header_region
1139 * @data
1140 * @dentry
1141 * @mnt
1142 *
1143 * Returns zero on success; non-zero otherwise
1144 */
1145int ecryptfs_read_header_region(char *data, struct dentry *dentry,
1146 struct vfsmount *mnt)
1147{
1148 struct file *file;
1149 mm_segment_t oldfs;
1150 int rc;
1151
1152 mnt = mntget(mnt);
1153 file = dentry_open(dentry, mnt, O_RDONLY);
1154 if (IS_ERR(file)) {
1155 ecryptfs_printk(KERN_DEBUG, "Error opening file to "
1156 "read header region\n");
1157 mntput(mnt);
1158 rc = PTR_ERR(file);
1159 goto out;
1160 }
1161 file->f_pos = 0;
1162 oldfs = get_fs();
1163 set_fs(get_ds());
1164 /* For releases 0.1 and 0.2, all of the header information
1165 * fits in the first data extent-sized region. */
1166 rc = file->f_op->read(file, (char __user *)data,
1167 ECRYPTFS_DEFAULT_EXTENT_SIZE, &file->f_pos);
1168 set_fs(oldfs);
1169 fput(file);
1170 rc = 0;
1171out:
1172 return rc;
1173}
1174
1175static void
1176write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat,
1177 size_t *written)
1178{
1179 u32 header_extent_size;
1180 u16 num_header_extents_at_front;
1181
1182 header_extent_size = (u32)crypt_stat->header_extent_size;
1183 num_header_extents_at_front =
1184 (u16)crypt_stat->num_header_extents_at_front;
1185 header_extent_size = cpu_to_be32(header_extent_size);
1186 memcpy(virt, &header_extent_size, 4);
1187 virt += 4;
1188 num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
1189 memcpy(virt, &num_header_extents_at_front, 2);
1190 (*written) = 6;
1191}
1192
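The 6 bytes written above are a big-endian u32 header extent size followed
by a big-endian u16 extent count, exactly what parse_header_metadata()
reads back below. A user-space sketch using glibc's <endian.h> helpers:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char virt[6];
	uint32_t header_extent_size = htobe32(4096);
	uint16_t num_header_extents_at_front = htobe16(1);
	int i;

	memcpy(virt, &header_extent_size, 4);
	memcpy(virt + 4, &num_header_extents_at_front, 2);
	for (i = 0; i < 6; i++)
		printf("%.2x ", virt[i]);	/* 00 00 10 00 00 01 */
	printf("\n");
	return 0;
}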
1193struct kmem_cache *ecryptfs_header_cache_0;
1194struct kmem_cache *ecryptfs_header_cache_1;
1195struct kmem_cache *ecryptfs_header_cache_2;
1196
1197/**
1198 * ecryptfs_write_headers_virt
1199 * @page_virt
1200 * @crypt_stat
1201 * @ecryptfs_dentry
1202 *
1203 * Format version: 1
1204 *
1205 * Header Extent:
1206 * Octets 0-7: Unencrypted file size (big-endian)
1207 * Octets 8-15: eCryptfs special marker
1208 * Octets 16-19: Flags
1209 * Octet 16: File format version number (between 0 and 255)
1210 * Octets 17-18: Reserved
1211 * Octet 19: Bit 1 (lsb): Reserved
1212 * Bit 2: Encrypted?
1213 * Bits 3-8: Reserved
1214 * Octets 20-23: Header extent size (big-endian)
1215 * Octets 24-25: Number of header extents at front of file
1216 * (big-endian)
1217 * Octet 26: Begin RFC 2440 authentication token packet set
1218 * Data Extent 0:
1219 * Lower data (CBC encrypted)
1220 * Data Extent 1:
1221 * Lower data (CBC encrypted)
1222 * ...
1223 *
1224 * Returns zero on success
1225 */
1226int ecryptfs_write_headers_virt(char *page_virt,
1227 struct ecryptfs_crypt_stat *crypt_stat,
1228 struct dentry *ecryptfs_dentry)
1229{
1230 int rc;
1231 size_t written;
1232 size_t offset;
1233
1234 offset = ECRYPTFS_FILE_SIZE_BYTES;
1235 write_ecryptfs_marker((page_virt + offset), &written);
1236 offset += written;
1237 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
1238 offset += written;
1239 write_header_metadata((page_virt + offset), crypt_stat, &written);
1240 offset += written;
1241 rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
1242 ecryptfs_dentry, &written,
1243 PAGE_CACHE_SIZE - offset);
1244 if (rc)
1245 ecryptfs_printk(KERN_WARNING, "Error generating key packet "
1246 "set; rc = [%d]\n", rc);
1247 return rc;
1248}
1249
1250/**
1251 * ecryptfs_write_headers
1252 * @lower_file: The lower file struct, which was returned from dentry_open
1253 *
1254 * Write the file headers out. This will likely involve a userspace
1255 * callout, in which the session key is encrypted with one or more
1256 * public keys and/or the passphrase necessary to do the encryption is
1257 * retrieved via a prompt. Exactly what happens at this point should
1258 * be policy-dependent.
1259 *
1260 * Returns zero on success; non-zero on error
1261 */
1262int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
1263 struct file *lower_file)
1264{
1265 mm_segment_t oldfs;
1266 struct ecryptfs_crypt_stat *crypt_stat;
1267 char *page_virt;
1268 int current_header_page;
1269 int header_pages;
1270 int rc = 0;
1271
1272 crypt_stat = &ecryptfs_inode_to_private(
1273 ecryptfs_dentry->d_inode)->crypt_stat;
1274 if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1275 ECRYPTFS_ENCRYPTED))) {
1276 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
1277 ECRYPTFS_KEY_VALID)) {
1278 ecryptfs_printk(KERN_DEBUG, "Key is "
1279 "invalid; bailing out\n");
1280 rc = -EINVAL;
1281 goto out;
1282 }
1283 } else {
1284 rc = -EINVAL;
1285 ecryptfs_printk(KERN_WARNING,
1286 "Called with crypt_stat->encrypted == 0\n");
1287 goto out;
1288 }
1289 /* Released in this function */
1290 page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER);
1291 if (!page_virt) {
1292 ecryptfs_printk(KERN_ERR, "Out of memory\n");
1293 rc = -ENOMEM;
1294 goto out;
1295 }
1296 memset(page_virt, 0, PAGE_CACHE_SIZE);
1297 rc = ecryptfs_write_headers_virt(page_virt, crypt_stat,
1298 ecryptfs_dentry);
1299 if (unlikely(rc)) {
1300 ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n");
1301 memset(page_virt, 0, PAGE_CACHE_SIZE);
1302 goto out_free;
1303 }
1304 ecryptfs_printk(KERN_DEBUG,
1305 "Writing key packet set to underlying file\n");
1306 lower_file->f_pos = 0;
1307 oldfs = get_fs();
1308 set_fs(get_ds());
1309 ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
1310 "write() w/ header page; lower_file->f_pos = "
1311 "[0x%.16x]\n", lower_file->f_pos);
1312 lower_file->f_op->write(lower_file, (char __user *)page_virt,
1313 PAGE_CACHE_SIZE, &lower_file->f_pos);
1314 header_pages = ((crypt_stat->header_extent_size
1315 * crypt_stat->num_header_extents_at_front)
1316 / PAGE_CACHE_SIZE);
1317 memset(page_virt, 0, PAGE_CACHE_SIZE);
1318 current_header_page = 1;
1319 while (current_header_page < header_pages) {
1320 ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
1321 "write() w/ zero'd page; lower_file->f_pos = "
1322 "[0x%.16x]\n", lower_file->f_pos);
1323 lower_file->f_op->write(lower_file, (char __user *)page_virt,
1324 PAGE_CACHE_SIZE, &lower_file->f_pos);
1325 current_header_page++;
1326 }
1327 set_fs(oldfs);
1328 ecryptfs_printk(KERN_DEBUG,
1329 "Done writing key packet set to underlying file.\n");
1330out_free:
1331 kmem_cache_free(ecryptfs_header_cache_0, page_virt);
1332out:
1333 return rc;
1334}
1335
1336static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1337 char *virt, int *bytes_read)
1338{
1339 int rc = 0;
1340 u32 header_extent_size;
1341 u16 num_header_extents_at_front;
1342
1343 memcpy(&header_extent_size, virt, 4);
1344 header_extent_size = be32_to_cpu(header_extent_size);
1345 virt += 4;
1346 memcpy(&num_header_extents_at_front, virt, 2);
1347 num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
1348 crypt_stat->header_extent_size = (int)header_extent_size;
1349 crypt_stat->num_header_extents_at_front =
1350 (int)num_header_extents_at_front;
1351 (*bytes_read) = 6;
1352 if ((crypt_stat->header_extent_size
1353 * crypt_stat->num_header_extents_at_front)
1354 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
1355 rc = -EINVAL;
1356 ecryptfs_printk(KERN_WARNING, "Invalid header extent size: "
1357 "[%d]\n", crypt_stat->header_extent_size);
1358 }
1359 return rc;
1360}
1361
1362/**
1363 * set_default_header_data
1364 *
1365 * For version 0 file format; this function is only for backwards
1366 * compatibility for files created with the prior versions of
1367 * eCryptfs.
1368 */
1369static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1370{
1371 crypt_stat->header_extent_size = 4096;
1372 crypt_stat->num_header_extents_at_front = 1;
1373}
1374
1375/**
1376 * ecryptfs_read_headers_virt
1377 *
1378 * Read/parse the header data. The header format is detailed in the
1379 * comment block for the ecryptfs_write_headers_virt() function.
1380 *
1381 * Returns zero on success
1382 */
1383static int ecryptfs_read_headers_virt(char *page_virt,
1384 struct ecryptfs_crypt_stat *crypt_stat,
1385 struct dentry *ecryptfs_dentry)
1386{
1387 int rc = 0;
1388 int offset;
1389 int bytes_read;
1390
1391 ecryptfs_set_default_sizes(crypt_stat);
1392 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
1393 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1394 offset = ECRYPTFS_FILE_SIZE_BYTES;
1395 rc = contains_ecryptfs_marker(page_virt + offset);
1396 if (rc == 0) {
1397 rc = -EINVAL;
1398 goto out;
1399 }
1400 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1401 rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
1402 &bytes_read);
1403 if (rc) {
1404 ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
1405 goto out;
1406 }
1407 if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
1408 ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
1409 "file version [%d] is supported by this "
1410 "version of eCryptfs\n",
1411 crypt_stat->file_version,
1412 ECRYPTFS_SUPPORTED_FILE_VERSION);
1413 rc = -EINVAL;
1414 goto out;
1415 }
1416 offset += bytes_read;
1417 if (crypt_stat->file_version >= 1) {
1418 rc = parse_header_metadata(crypt_stat, (page_virt + offset),
1419 &bytes_read);
1420 if (rc) {
1421 ecryptfs_printk(KERN_WARNING, "Error reading header "
1422 "metadata; rc = [%d]\n", rc);
1423 }
1424 offset += bytes_read;
1425 } else
1426 set_default_header_data(crypt_stat);
1427 rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset),
1428 ecryptfs_dentry);
1429out:
1430 return rc;
1431}
1432
1433/**
1434 * ecryptfs_read_headers
1435 *
1436 * Returns zero if valid headers found and parsed; non-zero otherwise
1437 */
1438int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
1439 struct file *lower_file)
1440{
1441 int rc = 0;
1442 char *page_virt = NULL;
1443 mm_segment_t oldfs;
1444 ssize_t bytes_read;
1445 struct ecryptfs_crypt_stat *crypt_stat =
1446 &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
1447
1448 /* Read the first page from the underlying file */
1449 page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER);
1450 if (!page_virt) {
1451 rc = -ENOMEM;
1452 ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n");
1453 goto out;
1454 }
1455 lower_file->f_pos = 0;
1456 oldfs = get_fs();
1457 set_fs(get_ds());
1458 bytes_read = lower_file->f_op->read(lower_file,
1459 (char __user *)page_virt,
1460 ECRYPTFS_DEFAULT_EXTENT_SIZE,
1461 &lower_file->f_pos);
1462 set_fs(oldfs);
1463 if (bytes_read != ECRYPTFS_DEFAULT_EXTENT_SIZE) {
1464 rc = -EINVAL;
1465 goto out;
1466 }
1467 rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
1468 ecryptfs_dentry);
1469 if (rc) {
1470 ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not "
1471 "found\n");
1472 rc = -EINVAL;
1473 }
1474out:
1475 if (page_virt) {
1476 memset(page_virt, 0, PAGE_CACHE_SIZE);
1477 kmem_cache_free(ecryptfs_header_cache_1, page_virt);
1478 }
1479 return rc;
1480}
1481
1482/**
1483 * ecryptfs_encode_filename - converts a plaintext file name to cipher text
1484 * @crypt_stat: The crypt_stat struct associated with the file name to encode
1485 * @name: The plaintext name
1486 * @length: The length of the plaintext
1487 * @encoded_name: The encrypted name
1488 *
1489 * Encrypts and encodes a filename into something that constitutes a
1490 * valid filename for a filesystem, with printable characters.
1491 *
1492 * We assume that we have a properly initialized crypto context,
1493 * pointed to by crypt_stat->tfm.
1494 *
1495 * TODO: Implement filename encoding and encryption here, in place of
1496 * memcpy. We are keeping the framework around for now to (1)
1497 * facilitate testing of the components needed to implement filename
1498 * encryption and (2) to provide a code base from which other
1499 * developers in the community can easily implement this feature.
1500 *
1501 * Returns the length of encoded filename; negative if error
1502 */
1503int
1504ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1505 const char *name, int length, char **encoded_name)
1506{
1507 int error = 0;
1508
1509 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
1510 if (!(*encoded_name)) {
1511 error = -ENOMEM;
1512 goto out;
1513 }
1514 /* TODO: Filename encryption is a scheduled feature for a
1515 * future version of eCryptfs. This function is here only for
1516 * the purpose of providing a framework for other developers
1517 * to easily implement filename encryption. Hint: Replace this
1518 * memcpy() with a call to encrypt and encode the
1519 * filename, then set the length accordingly. */
1520 memcpy((void *)(*encoded_name), (void *)name, length);
1521 (*encoded_name)[length] = '\0';
1522 error = length + 1;
1523out:
1524 return error;
1525}
1526
1527/**
1528 * ecryptfs_decode_filename - converts the cipher text name to plaintext
1529 * @crypt_stat: The crypt_stat struct associated with the file
1530 * @name: The filename in cipher text
1531 * @length: The length of the cipher text name
1532 * @decrypted_name: The plaintext name
1533 *
1534 * Decodes and decrypts the filename.
1535 *
1536 * We assume that we have a properly initialized crypto context,
1537 * pointed to by crypt_stat->tfm.
1538 *
1539 * TODO: Implement filename decoding and decryption here, in place of
1540 * memcpy. We are keeping the framework around for now to (1)
1541 * facilitate testing of the components needed to implement filename
1542 * encryption and (2) to provide a code base from which other
1543 * developers in the community can easily implement this feature.
1544 *
1545 * Returns the length of decoded filename; negative if error
1546 */
1547int
1548ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1549 const char *name, int length, char **decrypted_name)
1550{
1551 int error = 0;
1552
1553 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
1554 if (!(*decrypted_name)) {
1555 error = -ENOMEM;
1556 goto out;
1557 }
1558 /* TODO: Filename encryption is a scheduled feature for a
1559 * future version of eCryptfs. This function is here only for
1560 * the purpose of providing a framework for other developers
1561 * to easily implement filename encryption. Hint: Replace this
1562 * memcpy() with a call to decode and decrypt the
1563 * filename, then set the length accordingly. */
1564 memcpy((void *)(*decrypted_name), (void *)name, length);
1565	(*decrypted_name)[length] = '\0'; /* Only for convenience
1566 * in printing out the
1567 * string in debug
1568 * messages */
1569 error = length;
1570out:
1571 return error;
1572}
1573
1574/**
1575 * ecryptfs_process_cipher - Perform cipher initialization.
1576 * @tfm: Crypto context set by this function
1577 * @key_tfm: Crypto context for key material, set by this function
1578 * @cipher_name: Name of the cipher.
1579 * @key_size: Size of the key in bytes.
1580 *
1581 * Returns zero on success. Any crypto_tfm structs allocated here
1582 * should be released by other functions, such as on a superblock put
1583 * event, regardless of whether this function succeeds or fails.
1584 */
1585int
1586ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
1587 char *cipher_name, size_t key_size)
1588{
1589 char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
1590 int rc;
1591
1592 *tfm = *key_tfm = NULL;
1593 if (key_size > ECRYPTFS_MAX_KEY_BYTES) {
1594 rc = -EINVAL;
1595 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
1596 "allowable is [%d]\n", key_size, ECRYPTFS_MAX_KEY_BYTES);
1597 goto out;
1598 }
1599 *tfm = crypto_alloc_tfm(cipher_name, (ECRYPTFS_DEFAULT_CHAINING_MODE
1600 | CRYPTO_TFM_REQ_WEAK_KEY));
1601 if (!(*tfm)) {
1602 rc = -EINVAL;
1603 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1604 "[%s]\n", cipher_name);
1605 goto out;
1606 }
1607 *key_tfm = crypto_alloc_tfm(cipher_name, CRYPTO_TFM_REQ_WEAK_KEY);
1608 if (!(*key_tfm)) {
1609 rc = -EINVAL;
1610 printk(KERN_ERR "Unable to allocate crypto cipher with name "
1611 "[%s]\n", cipher_name);
1612 goto out;
1613 }
1614 if (key_size < crypto_tfm_alg_min_keysize(*tfm)) {
1615 rc = -EINVAL;
1616		printk(KERN_ERR "Requested key size is [%Zd]; minimum key size "
1617 "supported by cipher [%s] is [%d]\n", key_size,
1618 cipher_name, crypto_tfm_alg_min_keysize(*tfm));
1619 goto out;
1620 }
1621 if (key_size < crypto_tfm_alg_min_keysize(*key_tfm)) {
1622 rc = -EINVAL;
1623		printk(KERN_ERR "Requested key size is [%Zd]; minimum key size "
1624 "supported by cipher [%s] is [%d]\n", key_size,
1625 cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
1626 goto out;
1627 }
1628 if (key_size > crypto_tfm_alg_max_keysize(*tfm)) {
1629 rc = -EINVAL;
1630		printk(KERN_ERR "Requested key size is [%Zd]; maximum key size "
1631		       "supported by cipher [%s] is [%d]\n", key_size,
1632		       cipher_name, crypto_tfm_alg_max_keysize(*tfm));
1633 goto out;
1634 }
1635 if (key_size > crypto_tfm_alg_max_keysize(*key_tfm)) {
1636 rc = -EINVAL;
1637		printk(KERN_ERR "Requested key size is [%Zd]; maximum key size "
1638		       "supported by cipher [%s] is [%d]\n", key_size,
1639		       cipher_name, crypto_tfm_alg_max_keysize(*key_tfm));
1640 goto out;
1641 }
1642 get_random_bytes(dummy_key, key_size);
1643 rc = crypto_cipher_setkey(*tfm, dummy_key, key_size);
1644 if (rc) {
1645 printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
1646 "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
1647 rc = -EINVAL;
1648 goto out;
1649 }
1650 rc = crypto_cipher_setkey(*key_tfm, dummy_key, key_size);
1651 if (rc) {
1652 printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
1653 "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
1654 rc = -EINVAL;
1655 goto out;
1656 }
1657out:
1658 return rc;
1659}
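
The kernel-doc above puts the burden of freeing the two tfms on the
caller even when ecryptfs_process_cipher() fails (note that the
function NULLs both pointers up front, so the caller can test them).
A minimal caller sketch, assuming cleanup via crypto_free_tfm(); the
patch itself defers this to a superblock put event rather than doing
it inline like this:

	struct crypto_tfm *tfm;
	struct crypto_tfm *key_tfm;
	int rc;

	rc = ecryptfs_process_cipher(&tfm, &key_tfm, "aes", 16);
	if (rc) {
		if (tfm)
			crypto_free_tfm(tfm);
		if (key_tfm)
			crypto_free_tfm(key_tfm);
	}
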
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
new file mode 100644
index 000000000000..61f8e894284f
--- /dev/null
+++ b/fs/ecryptfs/debug.c
@@ -0,0 +1,123 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * Functions only useful for debugging.
4 *
5 * Copyright (C) 2006 International Business Machines Corp.
6 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * 02111-1307, USA.
22 */
23
24#include "ecryptfs_kernel.h"
25
26/**
27 * ecryptfs_dump_auth_tok - debug function to print auth toks
28 * @auth_tok: The authentication token to dump
29 * This function prints the contents of an eCryptfs authentication
30 * token.
31 */
32void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok)
33{
34 char salt[ECRYPTFS_SALT_SIZE * 2 + 1];
35 char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
36
37 ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n",
38 auth_tok);
39 if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) {
40 ecryptfs_printk(KERN_DEBUG, " * private key type\n");
41 ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT "
42 "IN ECRYPTFS VERSION 0.1)\n");
43 } else {
44 ecryptfs_printk(KERN_DEBUG, " * passphrase type\n");
45 ecryptfs_to_hex(salt, auth_tok->token.password.salt,
46 ECRYPTFS_SALT_SIZE);
47 salt[ECRYPTFS_SALT_SIZE * 2] = '\0';
48 ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt);
49 if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
50 ECRYPTFS_PERSISTENT_PASSWORD)) {
51 ecryptfs_printk(KERN_DEBUG, " * persistent\n");
52 }
53 memcpy(sig, auth_tok->token.password.signature,
54 ECRYPTFS_SIG_SIZE_HEX);
55 sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
56 ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig);
57 }
58 ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n",
59 auth_tok->session_key.flags);
60 if (auth_tok->session_key.flags
61 & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT)
62 ecryptfs_printk(KERN_DEBUG,
63 " * Userspace decrypt request set\n");
64 if (auth_tok->session_key.flags
65 & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT)
66 ecryptfs_printk(KERN_DEBUG,
67 " * Userspace encrypt request set\n");
68 if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) {
69 ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n");
70 ecryptfs_printk(KERN_DEBUG,
71 " * session_key.decrypted_key_size = [0x%x]\n",
72 auth_tok->session_key.decrypted_key_size);
73 ecryptfs_printk(KERN_DEBUG, " * Decrypted session key "
74 "dump:\n");
75 if (ecryptfs_verbosity > 0)
76 ecryptfs_dump_hex(auth_tok->session_key.decrypted_key,
77 ECRYPTFS_DEFAULT_KEY_BYTES);
78 }
79 if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) {
80 ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n");
81 ecryptfs_printk(KERN_DEBUG,
82 " * session_key.encrypted_key_size = [0x%x]\n",
83 auth_tok->session_key.encrypted_key_size);
84 ecryptfs_printk(KERN_DEBUG, " * Encrypted session key "
85 "dump:\n");
86 if (ecryptfs_verbosity > 0)
87 ecryptfs_dump_hex(auth_tok->session_key.encrypted_key,
88 auth_tok->session_key.
89 encrypted_key_size);
90 }
91}
92
93/**
94 * ecryptfs_dump_hex - debug hex printer
95 * @data: string of bytes to be printed
96 * @bytes: number of bytes to print
97 *
98 * Dump hexadecimal representation of char array
99 */
100void ecryptfs_dump_hex(char *data, int bytes)
101{
102 int i = 0;
103 int add_newline = 1;
104
105 if (ecryptfs_verbosity < 1)
106 return;
107 if (bytes != 0) {
108 printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
109 i++;
110 }
111 while (i < bytes) {
112 printk("0x%.2x.", (unsigned char)data[i]);
113 i++;
114 if (i % 16 == 0) {
115 printk("\n");
116 add_newline = 0;
117 } else
118 add_newline = 1;
119 }
120 if (add_newline)
121 printk("\n");
122}
123
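
ecryptfs_dump_auth_tok() above leans on ecryptfs_to_hex(), which is
only declared in ecryptfs_kernel.h below; its definition lives
elsewhere in this patch. A minimal sketch of such a helper, assuming
it simply expands each input byte into two hex digits and leaves
NUL-termination to the caller (as debug.c does with
salt[ECRYPTFS_SALT_SIZE * 2] = '\0'):

	#include <linux/kernel.h>	/* sprintf */

	void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
	{
		size_t x;

		/* Each source byte becomes two hex characters in dst */
		for (x = 0; x < src_size; x++)
			sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
	}
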
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
new file mode 100644
index 000000000000..f0d2a433242b
--- /dev/null
+++ b/fs/ecryptfs/dentry.c
@@ -0,0 +1,87 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 * 02111-1307, USA.
23 */
24
25#include <linux/dcache.h>
26#include <linux/namei.h>
27#include "ecryptfs_kernel.h"
28
29/**
30 * ecryptfs_d_revalidate - revalidate an ecryptfs dentry
31 * @dentry: The ecryptfs dentry
32 * @nd: The associated nameidata
33 *
34 * Called when the VFS needs to revalidate a dentry. This
35 * is called whenever a name lookup finds a dentry in the
36 * dcache. Most filesystems leave this as NULL, because all their
37 * dentries in the dcache are valid.
38 *
39 * Returns 1 if valid, 0 otherwise.
40 *
41 */
42static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
43{
44 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
45 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
46 struct dentry *dentry_save;
47 struct vfsmount *vfsmount_save;
48 int rc = 1;
49
50 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
51 goto out;
52 dentry_save = nd->dentry;
53 vfsmount_save = nd->mnt;
54 nd->dentry = lower_dentry;
55 nd->mnt = lower_mnt;
56 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
57 nd->dentry = dentry_save;
58 nd->mnt = vfsmount_save;
59out:
60 return rc;
61}
62
63struct kmem_cache *ecryptfs_dentry_info_cache;
64
65/**
66 * ecryptfs_d_release
67 * @dentry: The ecryptfs dentry
68 *
69 * Called when a dentry is really deallocated.
70 */
71static void ecryptfs_d_release(struct dentry *dentry)
72{
73 struct dentry *lower_dentry;
74
75 lower_dentry = ecryptfs_dentry_to_lower(dentry);
76 if (ecryptfs_dentry_to_private(dentry))
77 kmem_cache_free(ecryptfs_dentry_info_cache,
78 ecryptfs_dentry_to_private(dentry));
79 if (lower_dentry)
80 dput(lower_dentry);
81 return;
82}
83
84struct dentry_operations ecryptfs_dops = {
85 .d_revalidate = ecryptfs_d_revalidate,
86 .d_release = ecryptfs_d_release,
87};
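
These operations only take effect once a dentry is pointed at this
table; ecryptfs_lookup() in inode.c further down wires that up with:

	dentry->d_op = &ecryptfs_dops;
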
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
new file mode 100644
index 000000000000..872c9958531a
--- /dev/null
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -0,0 +1,482 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * Kernel declarations.
4 *
5 * Copyright (C) 1997-2003 Erez Zadok
6 * Copyright (C) 2001-2003 Stony Brook University
7 * Copyright (C) 2004-2006 International Business Machines Corp.
8 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#ifndef ECRYPTFS_KERNEL_H
27#define ECRYPTFS_KERNEL_H
28
29#include <keys/user-type.h>
30#include <linux/fs.h>
31#include <linux/scatterlist.h>
32
33/* Version verification for shared data structures w/ userspace */
34#define ECRYPTFS_VERSION_MAJOR 0x00
35#define ECRYPTFS_VERSION_MINOR 0x04
36#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01
37/* These flags indicate which features are supported by the kernel
38 * module; userspace tools such as the mount helper read
39 * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
40 * how to behave. */
41#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
42#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
43#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
44#define ECRYPTFS_VERSIONING_POLICY 0x00000008
45#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
46 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH)
47
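/* As the comment above notes, userspace reads this mask through sysfs
 * to discover what the module supports. A hypothetical userspace
 * check -- the sysfs path below is illustrative only and not defined
 * by this header:
 *
 *	unsigned int mask = 0;
 *	FILE *f = fopen("/sys/fs/ecryptfs/version", "r"); // path is an assumption
 *	if (f && fscanf(f, "%x", &mask) == 1
 *	    && (mask & ECRYPTFS_VERSIONING_PASSPHRASE))
 *		printf("passphrase auth toks supported\n");
 */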
48#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
49#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
50#define ECRYPTFS_SALT_SIZE 8
51#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
52/* The original signature size is only for what is stored on disk; all
53 * in-memory representations are expanded hex, making them better suited
54 * to being passed around or referenced on the command line */
55#define ECRYPTFS_SIG_SIZE 8
56#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
57#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
58#define ECRYPTFS_MAX_KEY_BYTES 64
59#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
60#define ECRYPTFS_DEFAULT_IV_BYTES 16
61#define ECRYPTFS_FILE_VERSION 0x01
62#define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192
63#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
64#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
65
66#define RFC2440_CIPHER_DES3_EDE 0x02
67#define RFC2440_CIPHER_CAST_5 0x03
68#define RFC2440_CIPHER_BLOWFISH 0x04
69#define RFC2440_CIPHER_AES_128 0x07
70#define RFC2440_CIPHER_AES_192 0x08
71#define RFC2440_CIPHER_AES_256 0x09
72#define RFC2440_CIPHER_TWOFISH 0x0a
73#define RFC2440_CIPHER_CAST_6 0x0b
74
75#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag))
76#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag))
77#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag))
78
79/**
80 * For convenience, we may need to pass around the encrypted session
81 * key between kernel and userspace because the authentication token
82 * may not be extractable. For example, the TPM may not release the
83 * private key, instead requiring the encrypted data and returning the
84 * decrypted data.
85 */
86struct ecryptfs_session_key {
87#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
88#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
89#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
90#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
91 u32 flags;
92 u32 encrypted_key_size;
93 u32 decrypted_key_size;
94 u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
95 u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
96};
97
98struct ecryptfs_password {
99 u32 password_bytes;
100 s32 hash_algo;
101 u32 hash_iterations;
102 u32 session_key_encryption_key_bytes;
103#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
104#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
105 u32 flags;
106 /* Iterated-hash concatenation of salt and passphrase */
107 u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
108 u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
109 /* Always in expanded hex */
110 u8 salt[ECRYPTFS_SALT_SIZE];
111};
112
113enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
114
115/* May be a password or a private key */
116struct ecryptfs_auth_tok {
117 u16 version; /* 8-bit major and 8-bit minor */
118 u16 token_type;
119 u32 flags;
120 struct ecryptfs_session_key session_key;
121 u8 reserved[32];
122 union {
123 struct ecryptfs_password password;
124 /* Private key is in future eCryptfs releases */
125 } token;
126} __attribute__ ((packed));
127
128void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
129extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
130extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
131
132struct ecryptfs_key_record {
133 unsigned char type;
134 size_t enc_key_size;
135 unsigned char sig[ECRYPTFS_SIG_SIZE];
136 unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
137};
138
139struct ecryptfs_auth_tok_list {
140 struct ecryptfs_auth_tok *auth_tok;
141 struct list_head list;
142};
143
144struct ecryptfs_crypt_stat;
145struct ecryptfs_mount_crypt_stat;
146
147struct ecryptfs_page_crypt_context {
148 struct page *page;
149#define ECRYPTFS_PREPARE_COMMIT_MODE 0
150#define ECRYPTFS_WRITEPAGE_MODE 1
151 unsigned int mode;
152 union {
153 struct file *lower_file;
154 struct writeback_control *wbc;
155 } param;
156};
157
158static inline struct ecryptfs_auth_tok *
159ecryptfs_get_key_payload_data(struct key *key)
160{
161 return (struct ecryptfs_auth_tok *)
162 (((struct user_key_payload*)key->payload.data)->data);
163}
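
/* A sketch of how a caller might reach this helper, assuming the auth
 * tok was loaded into the keyring as a "user"-type key whose
 * description is the auth tok signature (key_type_user comes from
 * <keys/user-type.h>, included above; sig is the NUL-terminated
 * signature string):
 *
 *	struct key *auth_tok_key;
 *	struct ecryptfs_auth_tok *auth_tok;
 *
 *	auth_tok_key = request_key(&key_type_user, sig, NULL);
 *	if (!IS_ERR(auth_tok_key))
 *		auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
 */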
164
165#define ECRYPTFS_SUPER_MAGIC 0xf15f
166#define ECRYPTFS_MAX_KEYSET_SIZE 1024
167#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
168#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
169#define ECRYPTFS_MAX_NUM_KEYSIGS 2 /* TODO: Make this a linked list */
170#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */
171#define ECRYPTFS_SALT_BYTES 2
172#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
173#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
174#define ECRYPTFS_FILE_SIZE_BYTES 8
175#define ECRYPTFS_DEFAULT_CIPHER "aes"
176#define ECRYPTFS_DEFAULT_KEY_BYTES 16
177#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC
178#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
179#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
180#define MD5_DIGEST_SIZE 16
181
182/**
183 * This is the primary struct associated with each encrypted file.
184 *
185 * TODO: cache align/pack?
186 */
187struct ecryptfs_crypt_stat {
188#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
189#define ECRYPTFS_POLICY_APPLIED 0x00000002
190#define ECRYPTFS_NEW_FILE 0x00000004
191#define ECRYPTFS_ENCRYPTED 0x00000008
192#define ECRYPTFS_SECURITY_WARNING 0x00000010
193#define ECRYPTFS_ENABLE_HMAC 0x00000020
194#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
195#define ECRYPTFS_KEY_VALID 0x00000080
196 u32 flags;
197 unsigned int file_version;
198 size_t iv_bytes;
199 size_t num_keysigs;
200 size_t header_extent_size;
201 size_t num_header_extents_at_front;
202 size_t extent_size; /* Data extent size; default is 4096 */
203 size_t key_size;
204 size_t extent_shift;
205 unsigned int extent_mask;
206 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
207 struct crypto_tfm *tfm;
208 struct crypto_tfm *md5_tfm; /* Crypto context for generating
209 * the initialization vectors */
210 unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
211 unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
212 unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
213 unsigned char keysigs[ECRYPTFS_MAX_NUM_KEYSIGS][ECRYPTFS_SIG_SIZE_HEX];
214 struct mutex cs_tfm_mutex;
215 struct mutex cs_md5_tfm_mutex;
216 struct mutex cs_mutex;
217};
218
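/* Worked example, assuming extent_shift is log2(extent_size) as the
 * name suggests: with the default 4096-byte extents, extent_shift is
 * 12, so file offset 10000 falls in extent 10000 >> 12 == 2.
 * extent_mask plays the complementary role of isolating offset bits;
 * both are presumably initialized from extent_size in crypto.c (the
 * earlier part of this diff).
 */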
219/* inode private data. */
220struct ecryptfs_inode_info {
221 struct inode vfs_inode;
222 struct inode *wii_inode;
223 struct ecryptfs_crypt_stat crypt_stat;
224};
225
226/* dentry private data. Each dentry must keep track of a lower
227 * vfsmount too. */
228struct ecryptfs_dentry_info {
229 struct dentry *wdi_dentry;
230 struct vfsmount *lower_mnt;
231 struct ecryptfs_crypt_stat *crypt_stat;
232};
233
234/**
235 * This struct is to enable a mount-wide passphrase/salt combo. This
236 * is more or less a stopgap to provide similar functionality to other
237 * crypto filesystems like EncFS or CFS until full policy support is
238 * implemented in eCryptfs.
239 */
240struct ecryptfs_mount_crypt_stat {
241 /* Pointers to memory we do not own, do not free these */
242#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001
243 u32 flags;
244 struct ecryptfs_auth_tok *global_auth_tok;
245 struct key *global_auth_tok_key;
246 size_t global_default_cipher_key_size;
247 struct crypto_tfm *global_key_tfm;
248 struct mutex global_key_tfm_mutex;
249 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
250 + 1];
251 unsigned char global_auth_tok_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
252};
253
254/* superblock private data. */
255struct ecryptfs_sb_info {
256 struct super_block *wsi_sb;
257 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
258};
259
260/* file private data. */
261struct ecryptfs_file_info {
262 struct file *wfi_file;
263 struct ecryptfs_crypt_stat *crypt_stat;
264};
265
266/* auth_tok <=> encrypted_session_key mappings */
267struct ecryptfs_auth_tok_list_item {
268 unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES];
269 struct list_head list;
270 struct ecryptfs_auth_tok auth_tok;
271};
272
273static inline struct ecryptfs_file_info *
274ecryptfs_file_to_private(struct file *file)
275{
276 return (struct ecryptfs_file_info *)file->private_data;
277}
278
279static inline void
280ecryptfs_set_file_private(struct file *file,
281 struct ecryptfs_file_info *file_info)
282{
283 file->private_data = file_info;
284}
285
286static inline struct file *ecryptfs_file_to_lower(struct file *file)
287{
288 return ((struct ecryptfs_file_info *)file->private_data)->wfi_file;
289}
290
291static inline void
292ecryptfs_set_file_lower(struct file *file, struct file *lower_file)
293{
294 ((struct ecryptfs_file_info *)file->private_data)->wfi_file =
295 lower_file;
296}
297
298static inline struct ecryptfs_inode_info *
299ecryptfs_inode_to_private(struct inode *inode)
300{
301 return container_of(inode, struct ecryptfs_inode_info, vfs_inode);
302}
303
304static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode)
305{
306 return ecryptfs_inode_to_private(inode)->wii_inode;
307}
308
309static inline void
310ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode)
311{
312 ecryptfs_inode_to_private(inode)->wii_inode = lower_inode;
313}
314
315static inline struct ecryptfs_sb_info *
316ecryptfs_superblock_to_private(struct super_block *sb)
317{
318 return (struct ecryptfs_sb_info *)sb->s_fs_info;
319}
320
321static inline void
322ecryptfs_set_superblock_private(struct super_block *sb,
323 struct ecryptfs_sb_info *sb_info)
324{
325 sb->s_fs_info = sb_info;
326}
327
328static inline struct super_block *
329ecryptfs_superblock_to_lower(struct super_block *sb)
330{
331 return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb;
332}
333
334static inline void
335ecryptfs_set_superblock_lower(struct super_block *sb,
336 struct super_block *lower_sb)
337{
338 ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
339}
340
341static inline struct ecryptfs_dentry_info *
342ecryptfs_dentry_to_private(struct dentry *dentry)
343{
344 return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
345}
346
347static inline void
348ecryptfs_set_dentry_private(struct dentry *dentry,
349 struct ecryptfs_dentry_info *dentry_info)
350{
351 dentry->d_fsdata = dentry_info;
352}
353
354static inline struct dentry *
355ecryptfs_dentry_to_lower(struct dentry *dentry)
356{
357 return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry;
358}
359
360static inline void
361ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry)
362{
363 ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry =
364 lower_dentry;
365}
366
367static inline struct vfsmount *
368ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
369{
370 return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt;
371}
372
373static inline void
374ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
375{
376 ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt =
377 lower_mnt;
378}
379
380#define ecryptfs_printk(type, fmt, arg...) \
381        __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg)
382void __ecryptfs_printk(const char *fmt, ...);
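/* String pasting makes this macro prefix every message with the
 * calling function's name; for example
 *
 *	ecryptfs_printk(KERN_DEBUG, "rc = [%d]\n", rc);
 *
 * expands to
 *
 *	__ecryptfs_printk(KERN_DEBUG "%s: rc = [%d]\n", __FUNCTION__, rc);
 */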
383
384extern const struct file_operations ecryptfs_main_fops;
385extern const struct file_operations ecryptfs_dir_fops;
386extern struct inode_operations ecryptfs_main_iops;
387extern struct inode_operations ecryptfs_dir_iops;
388extern struct inode_operations ecryptfs_symlink_iops;
389extern struct super_operations ecryptfs_sops;
390extern struct dentry_operations ecryptfs_dops;
391extern struct address_space_operations ecryptfs_aops;
392extern int ecryptfs_verbosity;
393
394extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
395extern struct kmem_cache *ecryptfs_file_info_cache;
396extern struct kmem_cache *ecryptfs_dentry_info_cache;
397extern struct kmem_cache *ecryptfs_inode_info_cache;
398extern struct kmem_cache *ecryptfs_sb_info_cache;
399extern struct kmem_cache *ecryptfs_header_cache_0;
400extern struct kmem_cache *ecryptfs_header_cache_1;
401extern struct kmem_cache *ecryptfs_header_cache_2;
402extern struct kmem_cache *ecryptfs_lower_page_cache;
403
404int ecryptfs_interpose(struct dentry *hidden_dentry,
405 struct dentry *this_dentry, struct super_block *sb,
406 int flag);
407int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
408int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
409 const char *name, int length,
410 char **decrypted_name);
411int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
412 const char *name, int length,
413 char **encoded_name);
414struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
415void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src);
416void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src);
417void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src);
418void ecryptfs_dump_hex(char *data, int bytes);
419int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
420 int sg_size);
421int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
422void ecryptfs_rotate_iv(unsigned char *iv);
423void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
424void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
425void ecryptfs_destruct_mount_crypt_stat(
426 struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
427int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
428int ecryptfs_write_inode_size_to_header(struct file *lower_file,
429 struct inode *lower_inode,
430 struct inode *inode);
431int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
432 struct file *lower_file,
433 unsigned long lower_page_index, int byte_offset,
434 int region_bytes);
435int
436ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
437 struct file *lower_file, int byte_offset,
438 int region_size);
439int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
440 struct file *lower_file);
441int ecryptfs_do_readpage(struct file *file, struct page *page,
442 pgoff_t lower_page_index);
443int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
444 char **lower_virt,
445 struct inode *lower_inode,
446 unsigned long lower_page_index);
447int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
448 struct inode *lower_inode,
449 struct writeback_control *wbc);
450int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx);
451int ecryptfs_decrypt_page(struct file *file, struct page *page);
452int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
453 struct file *lower_file);
454int ecryptfs_write_headers_virt(char *page_virt,
455 struct ecryptfs_crypt_stat *crypt_stat,
456 struct dentry *ecryptfs_dentry);
457int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
458 struct file *lower_file);
459int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
460int contains_ecryptfs_marker(char *data);
461int ecryptfs_read_header_region(char *data, struct dentry *dentry,
462 struct vfsmount *mnt);
463u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
464int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code);
465void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
466int ecryptfs_generate_key_packet_set(char *dest_base,
467 struct ecryptfs_crypt_stat *crypt_stat,
468 struct dentry *ecryptfs_dentry,
469 size_t *len, size_t max);
470int process_request_key_err(long err_code);
471int
472ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
473 unsigned char *src, struct dentry *ecryptfs_dentry);
474int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
475int
476ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
477 char *cipher_name, size_t key_size);
478int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
479int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
480void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
481
482#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
new file mode 100644
index 000000000000..c8550c9f9cd2
--- /dev/null
+++ b/fs/ecryptfs/file.c
@@ -0,0 +1,440 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/file.h>
27#include <linux/poll.h>
28#include <linux/mount.h>
29#include <linux/pagemap.h>
30#include <linux/security.h>
31#include <linux/smp_lock.h>
32#include <linux/compat.h>
33#include "ecryptfs_kernel.h"
34
35/**
36 * ecryptfs_llseek
37 * @file: File we are seeking in
38 * @offset: The offset to seek to
39 * @origin: 2 - offset from i_size; 1 - offset from f_pos
40 *
41 * Returns the resulting file position, or a negative value on error
42 */
43static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin)
44{
45 loff_t rv;
46 loff_t new_end_pos;
47 int rc;
48 int expanding_file = 0;
49 struct inode *inode = file->f_mapping->host;
50
51 /* If our offset is past the end of our file, we're going to
52 * need to grow it so we have a valid length of 0's */
53 new_end_pos = offset;
54 switch (origin) {
55 case 2:
56 new_end_pos += i_size_read(inode);
57 expanding_file = 1;
58 break;
59 case 1:
60 new_end_pos += file->f_pos;
61 if (new_end_pos > i_size_read(inode)) {
62 ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
63 "> i_size_read(inode)(=[0x%.16x])\n",
64 new_end_pos, i_size_read(inode));
65 expanding_file = 1;
66 }
67 break;
68 default:
69 if (new_end_pos > i_size_read(inode)) {
70 ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
71 "> i_size_read(inode)(=[0x%.16x])\n",
72 new_end_pos, i_size_read(inode));
73 expanding_file = 1;
74 }
75 }
76 ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos);
77 if (expanding_file) {
78 rc = ecryptfs_truncate(file->f_dentry, new_end_pos);
79 if (rc) {
80 rv = rc;
81 ecryptfs_printk(KERN_ERR, "Error on attempt to "
82 "truncate to (higher) offset [0x%.16x];"
83 " rc = [%d]\n", new_end_pos, rc);
84 goto out;
85 }
86 }
87 rv = generic_file_llseek(file, offset, origin);
88out:
89 return rv;
90}
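/* The zero-fill above is what makes a sparse seek-and-write behave as
 * expected from userspace. A hypothetical sequence against a 100-byte
 * eCryptfs file (names are illustrative):
 *
 *	fd = open("secret.txt", O_RDWR);
 *	lseek(fd, 4096, SEEK_SET);	// past EOF: ecryptfs_truncate()
 *					// grows the file with zeros
 *	write(fd, "x", 1);		// file is now 4097 bytes long
 */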
91
92/**
93 * ecryptfs_read_update_atime
94 *
95 * generic_file_aio_read() updates the atime of the upper layer inode,
96 * but it does not give us a chance to update the atime of the lower
97 * layer inode. This function wraps generic_file_aio_read(); it
98 * updates the atime of the lower level inode if the read returns
99 * without any errors. This is to be used only for file reads.
100 * The function to be used for directory reads is ecryptfs_read.
101 */
102static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
103 const struct iovec *iov,
104 unsigned long nr_segs, loff_t pos)
105{
106 int rc;
107 struct dentry *lower_dentry;
108 struct vfsmount *lower_vfsmount;
109 struct file *file = iocb->ki_filp;
110
111 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
112 /*
113 * Even though this is a async interface, we need to wait
114 * for IO to finish to update atime
115 */
116 if (-EIOCBQUEUED == rc)
117 rc = wait_on_sync_kiocb(iocb);
118 if (rc >= 0) {
119 lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry);
120 lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry);
121 touch_atime(lower_vfsmount, lower_dentry);
122 }
123 return rc;
124}
125
126struct ecryptfs_getdents_callback {
127 void *dirent;
128 struct dentry *dentry;
129 filldir_t filldir;
130 int err;
131 int filldir_called;
132 int entries_written;
133};
134
135/* Inspired by generic filldir in fs/readdir.c */
136static int
137ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
138 u64 ino, unsigned int d_type)
139{
140 struct ecryptfs_crypt_stat *crypt_stat;
141 struct ecryptfs_getdents_callback *buf =
142 (struct ecryptfs_getdents_callback *)dirent;
143 int rc;
144 int decoded_length;
145 char *decoded_name;
146
147 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
148 buf->filldir_called++;
149 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
150 &decoded_name);
151 if (decoded_length < 0) {
152 rc = decoded_length;
153 goto out;
154 }
155 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
156 ino, d_type);
157 kfree(decoded_name);
158 if (rc >= 0)
159 buf->entries_written++;
160out:
161 return rc;
162}
163
164/**
165 * ecryptfs_readdir
166 * @file: The ecryptfs file struct
167 * @dirent: Directory entry
168 * @filldir: The filldir callback function
169 */
170static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
171{
172 int rc;
173 struct file *lower_file;
174 struct inode *inode;
175 struct ecryptfs_getdents_callback buf;
176
177 lower_file = ecryptfs_file_to_lower(file);
178 lower_file->f_pos = file->f_pos;
179 inode = file->f_dentry->d_inode;
180 memset(&buf, 0, sizeof(buf));
181 buf.dirent = dirent;
182 buf.dentry = file->f_dentry;
183 buf.filldir = filldir;
184retry:
185 buf.filldir_called = 0;
186 buf.entries_written = 0;
187 buf.err = 0;
188 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
189 if (buf.err)
190 rc = buf.err;
191 if (buf.filldir_called && !buf.entries_written)
192 goto retry;
193 file->f_pos = lower_file->f_pos;
194 if (rc >= 0)
195 ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode);
196 return rc;
197}
198
199struct kmem_cache *ecryptfs_file_info_cache;
200
201/**
202 * ecryptfs_open
203 * @inode: inode specifying the file to open
204 * @file: Structure to return filled in
205 *
206 * Opens the file specified by inode.
207 *
208 * Returns zero on success; non-zero otherwise
209 */
210static int ecryptfs_open(struct inode *inode, struct file *file)
211{
212 int rc = 0;
213 struct ecryptfs_crypt_stat *crypt_stat = NULL;
214 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
215 struct dentry *ecryptfs_dentry = file->f_dentry;
216 /* Private value of ecryptfs_dentry allocated in
217 * ecryptfs_lookup() */
218 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
219 struct inode *lower_inode = NULL;
220 struct file *lower_file = NULL;
221 struct vfsmount *lower_mnt;
222 struct ecryptfs_file_info *file_info;
223 int lower_flags;
224
225 /* Released in ecryptfs_release or end of function if failure */
226 file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL);
227 ecryptfs_set_file_private(file, file_info);
228 if (!file_info) {
229 ecryptfs_printk(KERN_ERR,
230 "Error attempting to allocate memory\n");
231 rc = -ENOMEM;
232 goto out;
233 }
234 memset(file_info, 0, sizeof(*file_info));
235 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
236 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
237 mount_crypt_stat = &ecryptfs_superblock_to_private(
238 ecryptfs_dentry->d_sb)->mount_crypt_stat;
239 mutex_lock(&crypt_stat->cs_mutex);
240 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) {
241 ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n");
242 /* Policy code enabled in future release */
243 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED);
244 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
245 }
246 mutex_unlock(&crypt_stat->cs_mutex);
247 /* This mntget & dget is undone via fput when the file is released */
248 dget(lower_dentry);
249 lower_flags = file->f_flags;
250 if ((lower_flags & O_ACCMODE) == O_WRONLY)
251 lower_flags = (lower_flags & O_ACCMODE) | O_RDWR;
252 if (file->f_flags & O_APPEND)
253 lower_flags &= ~O_APPEND;
254 lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
255 mntget(lower_mnt);
256 /* Corresponding fput() in ecryptfs_release() */
257 lower_file = dentry_open(lower_dentry, lower_mnt, lower_flags);
258 if (IS_ERR(lower_file)) {
259 rc = PTR_ERR(lower_file);
260 ecryptfs_printk(KERN_ERR, "Error opening lower file\n");
261 goto out_puts;
262 }
263 ecryptfs_set_file_lower(file, lower_file);
264 /* Isn't this check the same as the one in lookup? */
265 lower_inode = lower_dentry->d_inode;
266 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
267 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
268 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
269 rc = 0;
270 goto out;
271 }
272 mutex_lock(&crypt_stat->cs_mutex);
273 if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
274 if (!(mount_crypt_stat->flags
275 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
276 rc = -EIO;
277 printk(KERN_WARNING "Attempt to read file that is "
278 "not in a valid eCryptfs format, and plaintext "
279 "passthrough mode is not enabled; returning "
280 "-EIO\n");
281 mutex_unlock(&crypt_stat->cs_mutex);
282 goto out_puts;
283 }
284		ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
285 rc = 0;
286 mutex_unlock(&crypt_stat->cs_mutex);
287 goto out;
288 } else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
289 ECRYPTFS_POLICY_APPLIED)
290 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
291 ECRYPTFS_KEY_VALID)) {
292 rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file);
293 if (rc) {
294 ecryptfs_printk(KERN_DEBUG,
295 "Valid headers not found\n");
296 if (!(mount_crypt_stat->flags
297 & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
298 rc = -EIO;
299 printk(KERN_WARNING "Attempt to read file that "
300 "is not in a valid eCryptfs format, "
301 "and plaintext passthrough mode is not "
302 "enabled; returning -EIO\n");
303 mutex_unlock(&crypt_stat->cs_mutex);
304 goto out_puts;
305 }
306 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
307 ECRYPTFS_ENCRYPTED);
308 rc = 0;
309 mutex_unlock(&crypt_stat->cs_mutex);
310 goto out;
311 }
312 }
313 mutex_unlock(&crypt_stat->cs_mutex);
314 ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
315 "size: [0x%.16x]\n", inode, inode->i_ino,
316 i_size_read(inode));
317 ecryptfs_set_file_lower(file, lower_file);
318 goto out;
319out_puts:
320 mntput(lower_mnt);
321 dput(lower_dentry);
322 kmem_cache_free(ecryptfs_file_info_cache,
323 ecryptfs_file_to_private(file));
324out:
325 return rc;
326}
327
328static int ecryptfs_flush(struct file *file, fl_owner_t td)
329{
330 int rc = 0;
331 struct file *lower_file = NULL;
332
333 lower_file = ecryptfs_file_to_lower(file);
334 if (lower_file->f_op && lower_file->f_op->flush)
335 rc = lower_file->f_op->flush(lower_file, td);
336 return rc;
337}
338
339static int ecryptfs_release(struct inode *inode, struct file *file)
340{
341 struct file *lower_file = ecryptfs_file_to_lower(file);
342 struct ecryptfs_file_info *file_info = ecryptfs_file_to_private(file);
343 struct inode *lower_inode = ecryptfs_inode_to_lower(inode);
344
345 fput(lower_file);
346 inode->i_blocks = lower_inode->i_blocks;
347 kmem_cache_free(ecryptfs_file_info_cache, file_info);
348 return 0;
349}
350
351static int
352ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
353{
354 struct file *lower_file = ecryptfs_file_to_lower(file);
355 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
356 struct inode *lower_inode = lower_dentry->d_inode;
357 int rc = -EINVAL;
358
359 if (lower_inode->i_fop->fsync) {
360 mutex_lock(&lower_inode->i_mutex);
361 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
362 datasync);
363 mutex_unlock(&lower_inode->i_mutex);
364 }
365 return rc;
366}
367
368static int ecryptfs_fasync(int fd, struct file *file, int flag)
369{
370 int rc = 0;
371 struct file *lower_file = NULL;
372
373 lower_file = ecryptfs_file_to_lower(file);
374 if (lower_file->f_op && lower_file->f_op->fasync)
375 rc = lower_file->f_op->fasync(fd, lower_file, flag);
376 return rc;
377}
378
379static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos,
380 size_t count, read_actor_t actor, void *target)
381{
382 struct file *lower_file = NULL;
383 int rc = -EINVAL;
384
385 lower_file = ecryptfs_file_to_lower(file);
386 if (lower_file->f_op && lower_file->f_op->sendfile)
387 rc = lower_file->f_op->sendfile(lower_file, ppos, count,
388 actor, target);
389
390 return rc;
391}
392
393static int ecryptfs_ioctl(struct inode *inode, struct file *file,
394 unsigned int cmd, unsigned long arg);
395
396const struct file_operations ecryptfs_dir_fops = {
397 .readdir = ecryptfs_readdir,
398 .ioctl = ecryptfs_ioctl,
399 .mmap = generic_file_mmap,
400 .open = ecryptfs_open,
401 .flush = ecryptfs_flush,
402 .release = ecryptfs_release,
403 .fsync = ecryptfs_fsync,
404 .fasync = ecryptfs_fasync,
405 .sendfile = ecryptfs_sendfile,
406};
407
408const struct file_operations ecryptfs_main_fops = {
409 .llseek = ecryptfs_llseek,
410 .read = do_sync_read,
411 .aio_read = ecryptfs_read_update_atime,
412 .write = do_sync_write,
413 .aio_write = generic_file_aio_write,
414 .readdir = ecryptfs_readdir,
415 .ioctl = ecryptfs_ioctl,
416 .mmap = generic_file_mmap,
417 .open = ecryptfs_open,
418 .flush = ecryptfs_flush,
419 .release = ecryptfs_release,
420 .fsync = ecryptfs_fsync,
421 .fasync = ecryptfs_fasync,
422 .sendfile = ecryptfs_sendfile,
423};
424
425static int
426ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
427 unsigned long arg)
428{
429 int rc = 0;
430 struct file *lower_file = NULL;
431
432 if (ecryptfs_file_to_private(file))
433 lower_file = ecryptfs_file_to_lower(file);
434 if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
435 rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
436 lower_file, cmd, arg);
437 else
438 rc = -ENOTTY;
439 return rc;
440}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
new file mode 100644
index 000000000000..efdd2b7b62d7
--- /dev/null
+++ b/fs/ecryptfs/inode.c
@@ -0,0 +1,1079 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2004 Erez Zadok
5 * Copyright (C) 2001-2004 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/file.h>
27#include <linux/vmalloc.h>
28#include <linux/pagemap.h>
29#include <linux/dcache.h>
30#include <linux/namei.h>
31#include <linux/mount.h>
32#include <linux/crypto.h>
33#include "ecryptfs_kernel.h"
34
35static struct dentry *lock_parent(struct dentry *dentry)
36{
37 struct dentry *dir;
38
39 dir = dget(dentry->d_parent);
40 mutex_lock(&(dir->d_inode->i_mutex));
41 return dir;
42}
43
44static void unlock_parent(struct dentry *dentry)
45{
46 mutex_unlock(&(dentry->d_parent->d_inode->i_mutex));
47 dput(dentry->d_parent);
48}
49
50static void unlock_dir(struct dentry *dir)
51{
52 mutex_unlock(&dir->d_inode->i_mutex);
53 dput(dir);
54}
55
56void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src)
57{
58 i_size_write(dst, i_size_read((struct inode *)src));
59 dst->i_blocks = src->i_blocks;
60}
61
62void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src)
63{
64 dest->i_atime = src->i_atime;
65}
66
67static void ecryptfs_copy_attr_times(struct inode *dest,
68 const struct inode *src)
69{
70 dest->i_atime = src->i_atime;
71 dest->i_mtime = src->i_mtime;
72 dest->i_ctime = src->i_ctime;
73}
74
75static void ecryptfs_copy_attr_timesizes(struct inode *dest,
76 const struct inode *src)
77{
78 dest->i_atime = src->i_atime;
79 dest->i_mtime = src->i_mtime;
80 dest->i_ctime = src->i_ctime;
81 ecryptfs_copy_inode_size(dest, src);
82}
83
84void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src)
85{
86 dest->i_mode = src->i_mode;
87 dest->i_nlink = src->i_nlink;
88 dest->i_uid = src->i_uid;
89 dest->i_gid = src->i_gid;
90 dest->i_rdev = src->i_rdev;
91 dest->i_atime = src->i_atime;
92 dest->i_mtime = src->i_mtime;
93 dest->i_ctime = src->i_ctime;
94 dest->i_blkbits = src->i_blkbits;
95 dest->i_flags = src->i_flags;
96}
97
98/**
99 * ecryptfs_create_underlying_file
100 * @lower_dir_inode: inode of the parent in the lower fs of the new file
101 * @dentry: New file's dentry in eCryptfs; the lower dentry and mount
102 * are derived from it via the ecryptfs_dentry_to_lower* helpers
103 * @mode: The mode of the new file
104 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
105 *
106 * Creates the file in the lower file system.
107 *
108 * Returns zero on success; non-zero on error condition
109 */
110static int
111ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
112 struct dentry *dentry, int mode,
113 struct nameidata *nd)
114{
115 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
116 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
117 struct dentry *dentry_save;
118 struct vfsmount *vfsmount_save;
119 int rc;
120
121 dentry_save = nd->dentry;
122 vfsmount_save = nd->mnt;
123 nd->dentry = lower_dentry;
124 nd->mnt = lower_mnt;
125 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
126 nd->dentry = dentry_save;
127 nd->mnt = vfsmount_save;
128 return rc;
129}
130
131/**
132 * ecryptfs_do_create
133 * @directory_inode: inode of the new file's dentry's parent in ecryptfs
134 * @ecryptfs_dentry: New file's dentry in ecryptfs
135 * @mode: The mode of the new file
136 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
137 *
138 * Creates the underlying file and the eCryptfs inode which will link to
139 * it. It will also update the eCryptfs directory inode to mimic the
140 * stat of the lower directory inode.
141 *
142 * Returns zero on success; non-zero on error condition
143 */
144static int
145ecryptfs_do_create(struct inode *directory_inode,
146 struct dentry *ecryptfs_dentry, int mode,
147 struct nameidata *nd)
148{
149 int rc;
150 struct dentry *lower_dentry;
151 struct dentry *lower_dir_dentry;
152
153 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
154 lower_dir_dentry = lock_parent(lower_dentry);
155 if (unlikely(IS_ERR(lower_dir_dentry))) {
156 ecryptfs_printk(KERN_ERR, "Error locking directory of "
157 "dentry\n");
158 rc = PTR_ERR(lower_dir_dentry);
159 goto out;
160 }
161 rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
162 ecryptfs_dentry, mode, nd);
163 if (unlikely(rc)) {
164 ecryptfs_printk(KERN_ERR,
165 "Failure to create underlying file\n");
166 goto out_lock;
167 }
168 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
169 directory_inode->i_sb, 0);
170 if (rc) {
171 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
172 goto out_lock;
173 }
174 ecryptfs_copy_attr_timesizes(directory_inode,
175 lower_dir_dentry->d_inode);
176out_lock:
177 unlock_dir(lower_dir_dentry);
178out:
179 return rc;
180}
181
182/**
183 * grow_file
184 * @ecryptfs_dentry: the ecryptfs dentry
185 * @lower_file: The lower file
186 * @inode: The ecryptfs inode
187 * @lower_inode: The lower inode
188 *
189 * This is the code which will grow the file to its correct size.
190 */
191static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file,
192 struct inode *inode, struct inode *lower_inode)
193{
194 int rc = 0;
195 struct file fake_file;
196 struct ecryptfs_file_info tmp_file_info;
197
198 memset(&fake_file, 0, sizeof(fake_file));
199 fake_file.f_dentry = ecryptfs_dentry;
200 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
201 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
202 ecryptfs_set_file_lower(&fake_file, lower_file);
203 rc = ecryptfs_fill_zeros(&fake_file, 1);
204 if (rc) {
205 ECRYPTFS_SET_FLAG(
206 ecryptfs_inode_to_private(inode)->crypt_stat.flags,
207 ECRYPTFS_SECURITY_WARNING);
208 ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros "
209 "in file; rc = [%d]\n", rc);
210 goto out;
211 }
212 i_size_write(inode, 0);
213 ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
214 ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags,
215 ECRYPTFS_NEW_FILE);
216out:
217 return rc;
218}
219
220/**
221 * ecryptfs_initialize_file
222 * @ecryptfs_dentry: the eCryptfs dentry of the new file
223 * Cause the file to be changed from a basic empty file to an ecryptfs
224 * file with a header and first data page.
225 *
226 * Returns zero on success
227 */
228static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
229{
230 int rc = 0;
231 int lower_flags;
232 struct ecryptfs_crypt_stat *crypt_stat;
233 struct dentry *lower_dentry;
234 struct dentry *tlower_dentry = NULL;
235 struct file *lower_file;
236 struct inode *inode, *lower_inode;
237 struct vfsmount *lower_mnt;
238
239 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
240 ecryptfs_printk(KERN_DEBUG, "lower_dentry->d_name.name = [%s]\n",
241 lower_dentry->d_name.name);
242 inode = ecryptfs_dentry->d_inode;
243 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
244 tlower_dentry = dget(lower_dentry);
245 if (!tlower_dentry) {
246 rc = -ENOMEM;
247 ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry\n");
248 goto out;
249 }
250 lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & O_ACCMODE) | O_RDWR;
251#if BITS_PER_LONG != 32
252 lower_flags |= O_LARGEFILE;
253#endif
254 lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
255 mntget(lower_mnt);
256 /* Corresponding fput() at end of this function */
257 lower_file = dentry_open(tlower_dentry, lower_mnt, lower_flags);
258 if (IS_ERR(lower_file)) {
259 rc = PTR_ERR(lower_file);
260 ecryptfs_printk(KERN_ERR,
261 "Error opening dentry; rc = [%i]\n", rc);
262 goto out;
263 }
264 /* fput(lower_file) should handle the puts if we do this */
265 lower_file->f_dentry = tlower_dentry;
266 lower_file->f_vfsmnt = lower_mnt;
267 lower_inode = tlower_dentry->d_inode;
268 if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
269 ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
270 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
271 goto out_fput;
272 }
273 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
274 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
275 rc = ecryptfs_new_file_context(ecryptfs_dentry);
276 if (rc) {
277 ecryptfs_printk(KERN_DEBUG, "Error creating new file "
278 "context\n");
279 goto out_fput;
280 }
281 rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file);
282 if (rc) {
283 ecryptfs_printk(KERN_DEBUG, "Error writing headers\n");
284 goto out_fput;
285 }
286 rc = grow_file(ecryptfs_dentry, lower_file, inode, lower_inode);
287out_fput:
288 fput(lower_file);
289out:
290 return rc;
291}
292
293/**
294 * ecryptfs_create
295 * @directory_inode: The inode of the directory in which to create the file.
296 * @ecryptfs_dentry: The eCryptfs dentry
297 * @mode: The mode of the new file.
298 * @nd: nameidata
299 *
300 * Creates a new file.
301 *
302 * Returns zero on success; non-zero on error condition
303 */
304static int
305ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
306 int mode, struct nameidata *nd)
307{
308 int rc;
309
310 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
311 if (unlikely(rc)) {
312		ecryptfs_printk(KERN_WARNING, "Failed to create file in "
313				"lower filesystem\n");
314 goto out;
315 }
316 /* At this point, a file exists on "disk"; we need to make sure
317 * that this on disk file is prepared to be an ecryptfs file */
318 rc = ecryptfs_initialize_file(ecryptfs_dentry);
319out:
320 return rc;
321}
322
323/**
324 * ecryptfs_lookup
325 * @dir: inode
326 * @dentry: The dentry
327 * @nd: nameidata, may be NULL
328 *
329 * Find a file on disk. If the file does not exist, add a negative
330 * dentry to the dentry cache; otherwise, read its headers from disk.
331 */
332static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
333 struct nameidata *nd)
334{
335 int rc = 0;
336 struct dentry *lower_dir_dentry;
337 struct dentry *lower_dentry;
338 struct vfsmount *lower_mnt;
339 struct dentry *tlower_dentry = NULL;
340 char *encoded_name;
341	int encoded_namelen;
342 struct ecryptfs_crypt_stat *crypt_stat = NULL;
343 char *page_virt = NULL;
344 struct inode *lower_inode;
345 u64 file_size;
346
347 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
348 dentry->d_op = &ecryptfs_dops;
349 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
350 || (dentry->d_name.len == 2 && !strcmp(dentry->d_name.name, "..")))
351 goto out_drop;
352 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
353 dentry->d_name.name,
354 dentry->d_name.len,
355 &encoded_name);
356 if (encoded_namelen < 0) {
357 rc = encoded_namelen;
358 goto out_drop;
359 }
360 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
361 "= [%d]\n", encoded_name, encoded_namelen);
362 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
363 encoded_namelen - 1);
364 kfree(encoded_name);
365 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
366 if (IS_ERR(lower_dentry)) {
367 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
368 rc = PTR_ERR(lower_dentry);
369 goto out_drop;
370 }
371 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
372 "d_name.name = [%s]\n", lower_dentry,
373 lower_dentry->d_name.name);
374 lower_inode = lower_dentry->d_inode;
375 ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode);
376 BUG_ON(!atomic_read(&lower_dentry->d_count));
377 ecryptfs_set_dentry_private(dentry,
378 kmem_cache_alloc(ecryptfs_dentry_info_cache,
379 SLAB_KERNEL));
380 if (!ecryptfs_dentry_to_private(dentry)) {
381 rc = -ENOMEM;
382 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
383 "to allocate ecryptfs_dentry_info struct\n");
384 goto out_dput;
385 }
386 ecryptfs_set_dentry_lower(dentry, lower_dentry);
387 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
388 if (!lower_dentry->d_inode) {
389 /* We want to add because we couldn't find in lower */
390 d_add(dentry, NULL);
391 goto out;
392 }
393 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
394 if (rc) {
395 ecryptfs_printk(KERN_ERR, "Error interposing\n");
396 goto out_dput;
397 }
398 if (S_ISDIR(lower_inode->i_mode)) {
399 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
400 goto out;
401 }
402 if (S_ISLNK(lower_inode->i_mode)) {
403 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
404 goto out;
405 }
406 if (!nd) {
407		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave "
408				"as we *think* we are about to unlink\n");
409 goto out;
410 }
411 tlower_dentry = dget(lower_dentry);
412 if (!tlower_dentry || IS_ERR(tlower_dentry)) {
413 rc = -ENOMEM;
414 ecryptfs_printk(KERN_ERR, "Cannot dget lower_dentry\n");
415 goto out_dput;
416 }
417 /* Released in this function */
418 page_virt =
419 (char *)kmem_cache_alloc(ecryptfs_header_cache_2,
420 SLAB_USER);
421 if (!page_virt) {
422 rc = -ENOMEM;
423 ecryptfs_printk(KERN_ERR,
424 "Cannot ecryptfs_kmalloc a page\n");
425 goto out_dput;
426 }
427 memset(page_virt, 0, PAGE_CACHE_SIZE);
428 rc = ecryptfs_read_header_region(page_virt, tlower_dentry, nd->mnt);
429 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
430 if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED))
431 ecryptfs_set_default_sizes(crypt_stat);
432 if (rc) {
433 rc = 0;
434 ecryptfs_printk(KERN_WARNING, "Error reading header region;"
435 " assuming unencrypted\n");
436 } else {
437 if (!contains_ecryptfs_marker(page_virt
438 + ECRYPTFS_FILE_SIZE_BYTES)) {
439 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
440 goto out;
441 }
442 memcpy(&file_size, page_virt, sizeof(file_size));
443 file_size = be64_to_cpu(file_size);
444 i_size_write(dentry->d_inode, (loff_t)file_size);
445 }
446 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
447 goto out;
448
449out_dput:
450 dput(lower_dentry);
451 if (tlower_dentry)
452 dput(tlower_dentry);
453out_drop:
454 d_drop(dentry);
455out:
456 return ERR_PTR(rc);
457}
458
459static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
460 struct dentry *new_dentry)
461{
462 struct dentry *lower_old_dentry;
463 struct dentry *lower_new_dentry;
464 struct dentry *lower_dir_dentry;
465 u64 file_size_save;
466 int rc;
467
468 file_size_save = i_size_read(old_dentry->d_inode);
469 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
470 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
471 dget(lower_old_dentry);
472 dget(lower_new_dentry);
473 lower_dir_dentry = lock_parent(lower_new_dentry);
474 rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
475 lower_new_dentry);
476 if (rc || !lower_new_dentry->d_inode)
477 goto out_lock;
478 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
479 if (rc)
480 goto out_lock;
481 ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode);
482 old_dentry->d_inode->i_nlink =
483 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
484 i_size_write(new_dentry->d_inode, file_size_save);
485out_lock:
486 unlock_dir(lower_dir_dentry);
487 dput(lower_new_dentry);
488 dput(lower_old_dentry);
489 if (!new_dentry->d_inode)
490 d_drop(new_dentry);
491 return rc;
492}
493
494static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
495{
496 int rc = 0;
497 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
498 struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
499
500 lock_parent(lower_dentry);
501 rc = vfs_unlink(lower_dir_inode, lower_dentry);
502 if (rc) {
503 ecryptfs_printk(KERN_ERR, "Error in vfs_unlink\n");
504 goto out_unlock;
505 }
506 ecryptfs_copy_attr_times(dir, lower_dir_inode);
507 dentry->d_inode->i_nlink =
508 ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
509 dentry->d_inode->i_ctime = dir->i_ctime;
510out_unlock:
511 unlock_parent(lower_dentry);
512 return rc;
513}
514
515static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
516 const char *symname)
517{
518 int rc;
519 struct dentry *lower_dentry;
520 struct dentry *lower_dir_dentry;
521 umode_t mode;
522 char *encoded_symname;
523 int encoded_symlen;
524 struct ecryptfs_crypt_stat *crypt_stat = NULL;
525
526 lower_dentry = ecryptfs_dentry_to_lower(dentry);
527 dget(lower_dentry);
528 lower_dir_dentry = lock_parent(lower_dentry);
529 mode = S_IALLUGO;
530 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
531 strlen(symname),
532 &encoded_symname);
533 if (encoded_symlen < 0) {
534 rc = encoded_symlen;
535 goto out_lock;
536 }
537 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
538 encoded_symname, mode);
539 kfree(encoded_symname);
540 if (rc || !lower_dentry->d_inode)
541 goto out_lock;
542 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
543 if (rc)
544 goto out_lock;
545 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
546out_lock:
547 unlock_dir(lower_dir_dentry);
548 dput(lower_dentry);
549 if (!dentry->d_inode)
550 d_drop(dentry);
551 return rc;
552}
553
554static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
555{
556 int rc;
557 struct dentry *lower_dentry;
558 struct dentry *lower_dir_dentry;
559
560 lower_dentry = ecryptfs_dentry_to_lower(dentry);
561 lower_dir_dentry = lock_parent(lower_dentry);
562 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
563 if (rc || !lower_dentry->d_inode)
564 goto out;
565 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
566 if (rc)
567 goto out;
568 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
569 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
570out:
571 unlock_dir(lower_dir_dentry);
572 if (!dentry->d_inode)
573 d_drop(dentry);
574 return rc;
575}
576
577static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
578{
579 int rc = 0;
580 struct dentry *tdentry = NULL;
581 struct dentry *lower_dentry;
582 struct dentry *tlower_dentry = NULL;
583 struct dentry *lower_dir_dentry;
584
585 lower_dentry = ecryptfs_dentry_to_lower(dentry);
586 if (!(tdentry = dget(dentry))) {
587 rc = -EINVAL;
588 ecryptfs_printk(KERN_ERR, "Error dget'ing dentry [%p]\n",
589 dentry);
590 goto out;
591 }
592 lower_dir_dentry = lock_parent(lower_dentry);
593 if (!(tlower_dentry = dget(lower_dentry))) {
594 rc = -EINVAL;
595 ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry "
596 "[%p]\n", lower_dentry);
597 goto out;
598 }
599 rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
600 if (!rc) {
601 d_delete(tlower_dentry);
602 tlower_dentry = NULL;
603 }
604 ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode);
605 dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
606 unlock_dir(lower_dir_dentry);
607 if (!rc)
608 d_drop(dentry);
609out:
610 if (tdentry)
611 dput(tdentry);
612 if (tlower_dentry)
613 dput(tlower_dentry);
614 return rc;
615}
616
617static int
618ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
619{
620 int rc;
621 struct dentry *lower_dentry;
622 struct dentry *lower_dir_dentry;
623
624 lower_dentry = ecryptfs_dentry_to_lower(dentry);
625 lower_dir_dentry = lock_parent(lower_dentry);
626 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
627 if (rc || !lower_dentry->d_inode)
628 goto out;
629 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
630 if (rc)
631 goto out;
632 ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
633out:
634 unlock_dir(lower_dir_dentry);
635 if (!dentry->d_inode)
636 d_drop(dentry);
637 return rc;
638}
639
640static int
641ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
642 struct inode *new_dir, struct dentry *new_dentry)
643{
644 int rc;
645 struct dentry *lower_old_dentry;
646 struct dentry *lower_new_dentry;
647 struct dentry *lower_old_dir_dentry;
648 struct dentry *lower_new_dir_dentry;
649
650 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
651 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
652 dget(lower_old_dentry);
653 dget(lower_new_dentry);
654 lower_old_dir_dentry = dget_parent(lower_old_dentry);
655 lower_new_dir_dentry = dget_parent(lower_new_dentry);
656 lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
657 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
658 lower_new_dir_dentry->d_inode, lower_new_dentry);
659 if (rc)
660 goto out_lock;
661 ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
662 if (new_dir != old_dir)
663 ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
664out_lock:
665 unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
666 dput(lower_new_dentry);
667 dput(lower_old_dentry);
668 return rc;
669}
670
671static int
672ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
673{
674 int rc;
675 struct dentry *lower_dentry;
676 char *decoded_name;
677 char *lower_buf;
678 mm_segment_t old_fs;
679 struct ecryptfs_crypt_stat *crypt_stat;
680
681 lower_dentry = ecryptfs_dentry_to_lower(dentry);
682 if (!lower_dentry->d_inode->i_op ||
683 !lower_dentry->d_inode->i_op->readlink) {
684 rc = -EINVAL;
685 goto out;
686 }
687 /* Released in this function */
688 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
689 if (lower_buf == NULL) {
690 ecryptfs_printk(KERN_ERR, "Out of memory\n");
691 rc = -ENOMEM;
692 goto out;
693 }
694 old_fs = get_fs();
695 set_fs(get_ds());
696 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
697 "lower_dentry->d_name.name = [%s]\n",
698 lower_dentry->d_name.name);
699 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
700 (char __user *)lower_buf,
701 bufsiz);
702 set_fs(old_fs);
703 if (rc >= 0) {
704 crypt_stat = NULL;
705 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
706 &decoded_name);
707 if (rc == -ENOMEM)
708 goto out_free_lower_buf;
709 if (rc > 0) {
710 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
711 "to userspace: [%*s]\n", rc,
712 decoded_name);
713 if (copy_to_user(buf, decoded_name, rc))
714 rc = -EFAULT;
715 }
716 kfree(decoded_name);
717 ecryptfs_copy_attr_atime(dentry->d_inode,
718 lower_dentry->d_inode);
719 }
720out_free_lower_buf:
721 kfree(lower_buf);
722out:
723 return rc;
724}
725
726static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
727{
728 char *buf;
729 int len = PAGE_SIZE, rc;
730 mm_segment_t old_fs;
731
732 /* Released in ecryptfs_put_link(); only release here on error */
733 buf = kmalloc(len, GFP_KERNEL);
734 if (!buf) {
735 rc = -ENOMEM;
736 goto out;
737 }
738 old_fs = get_fs();
739 set_fs(get_ds());
740 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
741 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
742 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
743 set_fs(old_fs);
744 if (rc < 0)
745 goto out_free;
746 buf[rc] = '\0';
747 rc = 0;
748 nd_set_link(nd, buf);
749 goto out;
750out_free:
751 kfree(buf);
752out:
753 return ERR_PTR(rc);
754}
755
756static void
757ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
758{
759 /* Free the char* */
760 kfree(nd_get_link(nd));
761}
762
763/**
764 * upper_size_to_lower_size
765 * @crypt_stat: Crypt_stat associated with file
766 * @upper_size: Size of the upper file
767 *
768 * Calculate the required size of the lower file based on the
769 * specified size of the upper file. This calculation is based on the
770 * number of headers in the underlying file and the extent size.
771 *
772 * Returns Calculated size of the lower file.
773 */
774static loff_t
775upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
776 loff_t upper_size)
777{
778 loff_t lower_size;
779
780 lower_size = (crypt_stat->header_extent_size
781 * crypt_stat->num_header_extents_at_front);
782 if (upper_size != 0) {
783 loff_t num_extents;
784
785 num_extents = upper_size >> crypt_stat->extent_shift;
786 if (upper_size & ~crypt_stat->extent_mask)
787 num_extents++;
788 lower_size += (num_extents * crypt_stat->extent_size);
789 }
790 return lower_size;
791}
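
As a concrete check of the interpolation above: with an assumed geometry of two 4096-byte header extents at the front and 4096-byte data extents (illustrative values, not necessarily this build's defaults), a 10000-byte upper file needs three data extents, so the lower file is 8192 + 3 * 4096 = 20480 bytes. A standalone sketch of the same arithmetic:

#include <stdio.h>

/* Userspace mirror of upper_size_to_lower_size(); the geometry values
 * below are assumptions for illustration only. */
int main(void)
{
	long long header_bytes = 2 * 4096;	/* header_extent_size * num_header_extents_at_front */
	long long extent_size = 4096;
	long long upper_size = 10000;
	long long num_extents = upper_size / extent_size;
	long long lower_size;

	if (upper_size % extent_size)		/* partial final extent */
		num_extents++;
	lower_size = header_bytes + num_extents * extent_size;
	printf("upper %lld -> lower %lld\n", upper_size, lower_size);	/* 20480 */
	return 0;
}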
792
793/**
794 * ecryptfs_truncate
795 * @dentry: The ecryptfs layer dentry
796 * @new_length: The new length for the file
797 *
798 * Function to handle truncations modifying the size of the file. Note
799 * that the file sizes are interpolated. When expanding, we are simply
800 * writing strings of 0's out. When truncating, we need to modify the
801 * underlying file size according to the page index interpolations.
802 *
803 * Returns zero on success; non-zero otherwise
804 */
805int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
806{
807 int rc = 0;
808 struct inode *inode = dentry->d_inode;
809 struct dentry *lower_dentry;
810 struct vfsmount *lower_mnt;
811 struct file fake_ecryptfs_file, *lower_file = NULL;
812 struct ecryptfs_crypt_stat *crypt_stat;
813 loff_t i_size = i_size_read(inode);
814 loff_t lower_size_before_truncate;
815 loff_t lower_size_after_truncate;
816
817 if (unlikely(new_length == i_size))
818 goto out;
819 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
820 /* Set up a fake ecryptfs file, this is used to interface with
821 * the file in the underlying filesystem so that the
822 * truncation has an effect there as well. */
823 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
824 fake_ecryptfs_file.f_dentry = dentry;
825 /* Released at out_free: label */
826 ecryptfs_set_file_private(&fake_ecryptfs_file,
827 kmem_cache_alloc(ecryptfs_file_info_cache,
828 SLAB_KERNEL));
829 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
830 rc = -ENOMEM;
831 goto out;
832 }
833 lower_dentry = ecryptfs_dentry_to_lower(dentry);
834 /* This dget & mntget is released through fput at out_fput: */
835 dget(lower_dentry);
836 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
837 mntget(lower_mnt);
838 lower_file = dentry_open(lower_dentry, lower_mnt, O_RDWR);
839 if (unlikely(IS_ERR(lower_file))) {
840 rc = PTR_ERR(lower_file);
841 goto out_free;
842 }
843 ecryptfs_set_file_lower(&fake_ecryptfs_file, lower_file);
844 /* Switch on growing or shrinking file */
845 if (new_length > i_size) {
846 rc = ecryptfs_fill_zeros(&fake_ecryptfs_file, new_length);
847 if (rc) {
848 ecryptfs_printk(KERN_ERR,
849 "Problem with fill_zeros\n");
850 goto out_fput;
851 }
852 i_size_write(inode, new_length);
853 rc = ecryptfs_write_inode_size_to_header(lower_file,
854 lower_dentry->d_inode,
855 inode);
856 if (rc) {
857 ecryptfs_printk(KERN_ERR,
858 "Problem with ecryptfs_write"
859 "_inode_size\n");
860 goto out_fput;
861 }
862 } else { /* new_length < i_size_read(inode) */
863 vmtruncate(inode, new_length);
864 ecryptfs_write_inode_size_to_header(lower_file,
865 lower_dentry->d_inode,
866 inode);
867 /* We are reducing the size of the ecryptfs file, and need to
868 * know if we need to reduce the size of the lower file. */
869 lower_size_before_truncate =
870 upper_size_to_lower_size(crypt_stat, i_size);
871 lower_size_after_truncate =
872 upper_size_to_lower_size(crypt_stat, new_length);
873 if (lower_size_after_truncate < lower_size_before_truncate)
874 vmtruncate(lower_dentry->d_inode,
875 lower_size_after_truncate);
876 }
877 /* Update the access times */
878 lower_dentry->d_inode->i_mtime = lower_dentry->d_inode->i_ctime
879 = CURRENT_TIME;
880 mark_inode_dirty_sync(inode);
881out_fput:
882 fput(lower_file);
883out_free:
884 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
885 kmem_cache_free(ecryptfs_file_info_cache,
886 ecryptfs_file_to_private(&fake_ecryptfs_file));
887out:
888 return rc;
889}
890
891static int
892ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
893{
894 int rc;
895
896 if (nd) {
897 struct vfsmount *vfsmnt_save = nd->mnt;
898 struct dentry *dentry_save = nd->dentry;
899
900 nd->mnt = ecryptfs_dentry_to_lower_mnt(nd->dentry);
901 nd->dentry = ecryptfs_dentry_to_lower(nd->dentry);
902 rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
903 nd->mnt = vfsmnt_save;
904 nd->dentry = dentry_save;
905 } else
906 rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
907 return rc;
908}
909
910/**
911 * ecryptfs_setattr
912 * @dentry: dentry handle to the inode to modify
913 * @ia: Structure with flags of what to change and values
914 *
915 * Updates the metadata of an inode. If the update is to the size
916 * i.e. truncation, then ecryptfs_truncate will handle the size modification
917 * of both the ecryptfs inode and the lower inode.
918 *
919 * All other metadata changes will be passed right to the lower filesystem,
920 * and we will just update our inode to look like the lower.
921 */
922static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
923{
924 int rc = 0;
925 struct dentry *lower_dentry;
926 struct inode *inode;
927 struct inode *lower_inode;
928 struct ecryptfs_crypt_stat *crypt_stat;
929
930 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
931 lower_dentry = ecryptfs_dentry_to_lower(dentry);
932 inode = dentry->d_inode;
933 lower_inode = ecryptfs_inode_to_lower(inode);
934 if (ia->ia_valid & ATTR_SIZE) {
935 ecryptfs_printk(KERN_DEBUG,
936 "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
937 ia->ia_valid, ATTR_SIZE);
938 rc = ecryptfs_truncate(dentry, ia->ia_size);
939 /* ecryptfs_truncate handles resizing of the lower file */
940 ia->ia_valid &= ~ATTR_SIZE;
941 ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
942 ia->ia_valid);
943 if (rc < 0)
944 goto out;
945 }
946 rc = notify_change(lower_dentry, ia);
947out:
948 ecryptfs_copy_attr_all(inode, lower_inode);
949 return rc;
950}
951
952static int
953ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
954 size_t size, int flags)
955{
956 int rc = 0;
957 struct dentry *lower_dentry;
958
959 lower_dentry = ecryptfs_dentry_to_lower(dentry);
960 if (!lower_dentry->d_inode->i_op->setxattr) {
961 rc = -ENOSYS;
962 goto out;
963 }
964 mutex_lock(&lower_dentry->d_inode->i_mutex);
965 rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
966 size, flags);
967 mutex_unlock(&lower_dentry->d_inode->i_mutex);
968out:
969 return rc;
970}
971
972static ssize_t
973ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
974 size_t size)
975{
976 int rc = 0;
977 struct dentry *lower_dentry;
978
979 lower_dentry = ecryptfs_dentry_to_lower(dentry);
980 if (!lower_dentry->d_inode->i_op->getxattr) {
981 rc = -ENOSYS;
982 goto out;
983 }
984 mutex_lock(&lower_dentry->d_inode->i_mutex);
985 rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
986 size);
987 mutex_unlock(&lower_dentry->d_inode->i_mutex);
988out:
989 return rc;
990}
991
992static ssize_t
993ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
994{
995 int rc = 0;
996 struct dentry *lower_dentry;
997
998 lower_dentry = ecryptfs_dentry_to_lower(dentry);
999 if (!lower_dentry->d_inode->i_op->listxattr) {
1000 rc = -ENOSYS;
1001 goto out;
1002 }
1003 mutex_lock(&lower_dentry->d_inode->i_mutex);
1004 rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size);
1005 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1006out:
1007 return rc;
1008}
1009
1010static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1011{
1012 int rc = 0;
1013 struct dentry *lower_dentry;
1014
1015 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1016 if (!lower_dentry->d_inode->i_op->removexattr) {
1017 rc = -ENOSYS;
1018 goto out;
1019 }
1020 mutex_lock(&lower_dentry->d_inode->i_mutex);
1021 rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name);
1022 mutex_unlock(&lower_dentry->d_inode->i_mutex);
1023out:
1024 return rc;
1025}
1026
1027int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
1028{
1029 if ((ecryptfs_inode_to_lower(inode)
1030 == (struct inode *)candidate_lower_inode))
1031 return 1;
1032 else
1033 return 0;
1034}
1035
1036int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
1037{
1038 ecryptfs_init_inode(inode, (struct inode *)lower_inode);
1039 return 0;
1040}
1041
1042struct inode_operations ecryptfs_symlink_iops = {
1043 .readlink = ecryptfs_readlink,
1044 .follow_link = ecryptfs_follow_link,
1045 .put_link = ecryptfs_put_link,
1046 .permission = ecryptfs_permission,
1047 .setattr = ecryptfs_setattr,
1048 .setxattr = ecryptfs_setxattr,
1049 .getxattr = ecryptfs_getxattr,
1050 .listxattr = ecryptfs_listxattr,
1051 .removexattr = ecryptfs_removexattr
1052};
1053
1054struct inode_operations ecryptfs_dir_iops = {
1055 .create = ecryptfs_create,
1056 .lookup = ecryptfs_lookup,
1057 .link = ecryptfs_link,
1058 .unlink = ecryptfs_unlink,
1059 .symlink = ecryptfs_symlink,
1060 .mkdir = ecryptfs_mkdir,
1061 .rmdir = ecryptfs_rmdir,
1062 .mknod = ecryptfs_mknod,
1063 .rename = ecryptfs_rename,
1064 .permission = ecryptfs_permission,
1065 .setattr = ecryptfs_setattr,
1066 .setxattr = ecryptfs_setxattr,
1067 .getxattr = ecryptfs_getxattr,
1068 .listxattr = ecryptfs_listxattr,
1069 .removexattr = ecryptfs_removexattr
1070};
1071
1072struct inode_operations ecryptfs_main_iops = {
1073 .permission = ecryptfs_permission,
1074 .setattr = ecryptfs_setattr,
1075 .setxattr = ecryptfs_setxattr,
1076 .getxattr = ecryptfs_getxattr,
1077 .listxattr = ecryptfs_listxattr,
1078 .removexattr = ecryptfs_removexattr
1079};
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
new file mode 100644
index 000000000000..ba454785a0c5
--- /dev/null
+++ b/fs/ecryptfs/keystore.c
@@ -0,0 +1,1061 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * In-kernel key management code. Includes functions to parse and
4 * write authentication token-related packets with the underlying
5 * file.
6 *
7 * Copyright (C) 2004-2006 International Business Machines Corp.
8 * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
9 * Michael C. Thompson <mcthomps@us.ibm.com>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24 * 02111-1307, USA.
25 */
26
27#include <linux/string.h>
28#include <linux/sched.h>
29#include <linux/syscalls.h>
30#include <linux/pagemap.h>
31#include <linux/key.h>
32#include <linux/random.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h>
35#include "ecryptfs_kernel.h"
36
37/**
38 * request_key returned an error instead of a valid key address;
39 * determine the type of error, make appropriate log entries, and
40 * return an error code.
41 */
42int process_request_key_err(long err_code)
43{
44 int rc = 0;
45
46 switch (err_code) {
47 case ENOKEY:
48 ecryptfs_printk(KERN_WARNING, "No key\n");
49 rc = -ENOENT;
50 break;
51 case EKEYEXPIRED:
52 ecryptfs_printk(KERN_WARNING, "Key expired\n");
53 rc = -ETIME;
54 break;
55 case EKEYREVOKED:
56 ecryptfs_printk(KERN_WARNING, "Key revoked\n");
57 rc = -EINVAL;
58 break;
59 default:
60 ecryptfs_printk(KERN_WARNING, "Unknown error code: "
61 "[0x%.16x]\n", err_code);
62 rc = -EINVAL;
63 }
64 return rc;
65}
66
67static void wipe_auth_tok_list(struct list_head *auth_tok_list_head)
68{
69 struct list_head *walker;
70 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
71
72 walker = auth_tok_list_head->next;
73 while (walker != auth_tok_list_head) {
74 auth_tok_list_item =
75 list_entry(walker, struct ecryptfs_auth_tok_list_item,
76 list);
77 walker = auth_tok_list_item->list.next;
78 memset(auth_tok_list_item, 0,
79 sizeof(struct ecryptfs_auth_tok_list_item));
80 kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
81 auth_tok_list_item);
82 }
83}
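
The advance-before-free in the walk above is exactly what the list_for_each_safe() helper encapsulates; an equivalent form of the same loop, sketched with the helper (same types and slab cache as above), would be:

static void wipe_auth_tok_list_alt(struct list_head *auth_tok_list_head)
{
	struct ecryptfs_auth_tok_list_item *item;
	struct list_head *walker, *next;

	/* The _safe variant caches the next pointer before the entry
	 * is wiped and freed. */
	list_for_each_safe(walker, next, auth_tok_list_head) {
		item = list_entry(walker,
				  struct ecryptfs_auth_tok_list_item, list);
		memset(item, 0, sizeof(*item));
		kmem_cache_free(ecryptfs_auth_tok_list_item_cache, item);
	}
}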
84
85struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
86
87/**
88 * parse_packet_length
89 * @data: Pointer to memory containing length at offset
90 * @size: This function writes the decoded size to this memory
91 * address; zero on error
92 * @length_size: The number of bytes occupied by the encoded length
93 *
94 * Returns Zero on success
95 */
96static int parse_packet_length(unsigned char *data, size_t *size,
97 size_t *length_size)
98{
99 int rc = 0;
100
101 (*length_size) = 0;
102 (*size) = 0;
103 if (data[0] < 192) {
104 /* One-byte length */
105 (*size) = data[0];
106 (*length_size) = 1;
107 } else if (data[0] < 224) {
108 /* Two-byte length */
109 (*size) = ((data[0] - 192) * 256);
110 (*size) += (data[1] + 192);
111 (*length_size) = 2;
112 } else if (data[0] == 255) {
113 /* Five-byte length; we're not supposed to see this */
114 ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
115 "supported\n");
116 rc = -EINVAL;
117 goto out;
118 } else {
119 ecryptfs_printk(KERN_ERR, "Error parsing packet length\n");
120 rc = -EINVAL;
121 goto out;
122 }
123out:
124 return rc;
125}
126
127/**
128 * write_packet_length
129 * @dest: The byte array target into which to write the
130 * length. Must have at least 5 bytes allocated.
131 * @size: The length to write.
132 * @packet_size_length: The number of bytes used to encode the
133 * packet length is written to this address.
134 *
135 * Returns zero on success; non-zero on error.
136 */
137static int write_packet_length(char *dest, size_t size,
138 size_t *packet_size_length)
139{
140 int rc = 0;
141
142 if (size < 192) {
143 dest[0] = size;
144 (*packet_size_length) = 1;
145 } else if (size < 65536) {
146 dest[0] = (((size - 192) / 256) + 192);
147 dest[1] = ((size - 192) % 256);
148 (*packet_size_length) = 2;
149 } else {
150 rc = -EINVAL;
151 ecryptfs_printk(KERN_WARNING,
152 "Unsupported packet size: [%d]\n", size);
153 }
154 return rc;
155}
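
Both helpers above implement the RFC 2440 new-format length encoding: lengths below 192 take one byte, and the two-byte form covers 192 through 8383 (so the size < 65536 guard above is more permissive than what two bytes can actually represent). A minimal userspace round trip:

#include <stdio.h>

/* Encode with the one-/two-byte forms used above, then decode. */
static int encode_length(unsigned char *dest, unsigned int size)
{
	if (size < 192) {
		dest[0] = size;
		return 1;
	}
	dest[0] = ((size - 192) / 256) + 192;	/* first octet: 192..223 */
	dest[1] = (size - 192) % 256;
	return 2;
}

int main(void)
{
	unsigned char buf[2];
	unsigned int decoded;

	encode_length(buf, 1000);
	decoded = ((buf[0] - 192) * 256) + buf[1] + 192;
	printf("0x%02x 0x%02x decodes to %u\n", buf[0], buf[1], decoded);
	return 0;	/* prints: 0xc3 0x28 decodes to 1000 */
}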
156
157/**
158 * parse_tag_3_packet
159 * @crypt_stat: The cryptographic context to modify based on packet
160 * contents.
161 * @data: The raw bytes of the packet.
162 * @auth_tok_list: eCryptfs parses packets into authentication tokens;
163 * a new authentication token will be placed at the end
164 * of this list for this packet.
165 * @new_auth_tok: Pointer to a pointer to memory that this function
166 * allocates; sets the memory address of the pointer to
167 * NULL on error. This object is added to the
168 * auth_tok_list.
169 * @packet_size: This function writes the size of the parsed packet
170 * into this memory location; zero on error.
171 * @max_packet_size: maximum number of bytes to parse
172 *
173 * Returns zero on success; non-zero on error.
174 */
175static int
176parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
177 unsigned char *data, struct list_head *auth_tok_list,
178 struct ecryptfs_auth_tok **new_auth_tok,
179 size_t *packet_size, size_t max_packet_size)
180{
181 int rc = 0;
182 size_t body_size;
183 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
184 size_t length_size;
185
186 (*packet_size) = 0;
187 (*new_auth_tok) = NULL;
188
189 /* we check that:
190 * one byte for the Tag 3 ID flag
191 * two bytes for the body size
192 * do not exceed the maximum_packet_size
193 */
194 if (unlikely((*packet_size) + 3 > max_packet_size)) {
195 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
196 rc = -EINVAL;
197 goto out;
198 }
199
200 /* check for Tag 3 identifier - one byte */
201 if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) {
202 ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n",
203 ECRYPTFS_TAG_3_PACKET_TYPE);
204 rc = -EINVAL;
205 goto out;
206 }
207 /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
208 * at end of function upon failure */
209 auth_tok_list_item =
210 kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL);
211 if (!auth_tok_list_item) {
212 ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
213 rc = -ENOMEM;
214 goto out;
215 }
216 memset(auth_tok_list_item, 0,
217 sizeof(struct ecryptfs_auth_tok_list_item));
218 (*new_auth_tok) = &auth_tok_list_item->auth_tok;
219
220 /* check for body size - one to two bytes */
221 rc = parse_packet_length(&data[(*packet_size)], &body_size,
222 &length_size);
223 if (rc) {
224 ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
225 "rc = [%d]\n", rc);
226 goto out_free;
227 }
228 if (unlikely(body_size < (0x05 + ECRYPTFS_SALT_SIZE))) {
229 ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
230 body_size);
231 rc = -EINVAL;
232 goto out_free;
233 }
234 (*packet_size) += length_size;
235
236 /* now we know the length of the remaining Tag 3 packet size:
237 * 5 fixed bytes for: version string, cipher, S2K ID, hash algo,
238 * number of hash iterations
239 * ECRYPTFS_SALT_SIZE bytes for salt
240 * body_size bytes minus the stuff above is the encrypted key size
241 */
242 if (unlikely((*packet_size) + body_size > max_packet_size)) {
243 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
244 rc = -EINVAL;
245 goto out_free;
246 }
247
248 /* There are 5 characters of additional information in the
249 * packet */
250 (*new_auth_tok)->session_key.encrypted_key_size =
251 body_size - (0x05 + ECRYPTFS_SALT_SIZE);
252 ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n",
253 (*new_auth_tok)->session_key.encrypted_key_size);
254
255 /* Version 4 (from RFC2440) - one byte */
256 if (unlikely(data[(*packet_size)++] != 0x04)) {
257 ecryptfs_printk(KERN_DEBUG, "Unknown version number "
258 "[%d]\n", data[(*packet_size) - 1]);
259 rc = -EINVAL;
260 goto out_free;
261 }
262
263 /* cipher - one byte */
264 ecryptfs_cipher_code_to_string(crypt_stat->cipher,
265 (u16)data[(*packet_size)]);
266 /* A little extra work to differentiate among the AES key
267 * sizes; see RFC2440 */
268 switch(data[(*packet_size)++]) {
269 case RFC2440_CIPHER_AES_192:
270 crypt_stat->key_size = 24;
271 break;
272 default:
273 crypt_stat->key_size =
274 (*new_auth_tok)->session_key.encrypted_key_size;
275 }
276 ecryptfs_init_crypt_ctx(crypt_stat);
277 /* S2K identifier 3 (from RFC2440) */
278 if (unlikely(data[(*packet_size)++] != 0x03)) {
279 ecryptfs_printk(KERN_ERR, "Only S2K ID 3 is currently "
280 "supported\n");
281 rc = -ENOSYS;
282 goto out_free;
283 }
284
285 /* TODO: finish the hash mapping */
286 /* hash algorithm - one byte */
287 switch (data[(*packet_size)++]) {
288 case 0x01: /* See RFC2440 for these numbers and their mappings */
289 /* Choose MD5 */
290 /* salt - ECRYPTFS_SALT_SIZE bytes */
291 memcpy((*new_auth_tok)->token.password.salt,
292 &data[(*packet_size)], ECRYPTFS_SALT_SIZE);
293 (*packet_size) += ECRYPTFS_SALT_SIZE;
294
295 /* This conversion was taken straight from RFC2440 */
296 /* number of hash iterations - one byte */
297 (*new_auth_tok)->token.password.hash_iterations =
298 ((u32) 16 + (data[(*packet_size)] & 15))
299 << ((data[(*packet_size)] >> 4) + 6);
300 (*packet_size)++;
301
302 /* encrypted session key -
303 * (body_size-5-ECRYPTFS_SALT_SIZE) bytes */
304 memcpy((*new_auth_tok)->session_key.encrypted_key,
305 &data[(*packet_size)],
306 (*new_auth_tok)->session_key.encrypted_key_size);
307 (*packet_size) +=
308 (*new_auth_tok)->session_key.encrypted_key_size;
309 (*new_auth_tok)->session_key.flags &=
310 ~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
311 (*new_auth_tok)->session_key.flags |=
312 ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
313 (*new_auth_tok)->token.password.hash_algo = 0x01;
314 break;
315 default:
316 ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: "
317 "[%d]\n", data[(*packet_size) - 1]);
318 rc = -ENOSYS;
319 goto out_free;
320 }
321 (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD;
322 /* TODO: Parameterize; we might actually want userspace to
323 * decrypt the session key. */
324 ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
325 ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
326 ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
327 ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
328 list_add(&auth_tok_list_item->list, auth_tok_list);
329 goto out;
330out_free:
331 (*new_auth_tok) = NULL;
332 memset(auth_tok_list_item, 0,
333 sizeof(struct ecryptfs_auth_tok_list_item));
334 kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
335 auth_tok_list_item);
336out:
337 if (rc)
338 (*packet_size) = 0;
339 return rc;
340}
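
The coded-count conversion in the hash-iterations branch above is RFC 2440's iterated-and-salted S2K formula. As a sanity check, the byte 0x60 that write_tag_3_packet() later emits decodes to 65536 iterations:

#include <stdio.h>

/* RFC 2440 S2K coded count: (16 + (c & 15)) << ((c >> 4) + 6) */
int main(void)
{
	unsigned char c = 0x60;
	unsigned long iterations = (16UL + (c & 15)) << ((c >> 4) + 6);

	printf("coded byte 0x%02x -> %lu iterations\n", c, iterations);
	return 0;	/* prints 65536 */
}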
341
342/**
343 * parse_tag_11_packet
344 * @data: The raw bytes of the packet
345 * @contents: This function writes the data contents of the literal
346 * packet into this memory location
347 * @max_contents_bytes: The maximum number of bytes that this function
348 * is allowed to write into contents
349 * @tag_11_contents_size: This function writes the size of the parsed
350 * contents into this memory location; zero on
351 * error
352 * @packet_size: This function writes the size of the parsed packet
353 * into this memory location; zero on error
354 * @max_packet_size: maximum number of bytes to parse
355 *
356 * Returns zero on success; non-zero on error.
357 */
358static int
359parse_tag_11_packet(unsigned char *data, unsigned char *contents,
360 size_t max_contents_bytes, size_t *tag_11_contents_size,
361 size_t *packet_size, size_t max_packet_size)
362{
363 int rc = 0;
364 size_t body_size;
365 size_t length_size;
366
367 (*packet_size) = 0;
368 (*tag_11_contents_size) = 0;
369
370 /* check that:
371 * one byte for the Tag 11 ID flag
372 * two bytes for the Tag 11 length
373 * do not exceed the maximum_packet_size
374 */
375 if (unlikely((*packet_size) + 3 > max_packet_size)) {
376 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
377 rc = -EINVAL;
378 goto out;
379 }
380
381 /* check for Tag 11 identifier - one byte */
382 if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) {
383 ecryptfs_printk(KERN_WARNING,
384 "Invalid tag 11 packet format\n");
385 rc = -EINVAL;
386 goto out;
387 }
388
389 /* get Tag 11 content length - one or two bytes */
390 rc = parse_packet_length(&data[(*packet_size)], &body_size,
391 &length_size);
392 if (rc) {
393 ecryptfs_printk(KERN_WARNING,
394 "Invalid tag 11 packet format\n");
395 goto out;
396 }
397 (*packet_size) += length_size;
398
399 if (body_size < 13) {
400 ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
401 body_size);
402 rc = -EINVAL;
403 goto out;
404 }
405 /* We have 13 bytes of surrounding packet values */
406 (*tag_11_contents_size) = (body_size - 13);
407
408 /* now we know the length of the remaining Tag 11 packet size:
409 * 14 fixed bytes for: special flag one, special flag two,
410 * 12 skipped bytes
411 * body_size bytes minus the stuff above is the Tag 11 content
412 */
413 /* FIXME why is the body size one byte smaller than the actual
414 * size of the body?
415 * this seems to be an error here as well as in
416 * write_tag_11_packet() */
417 if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) {
418 ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
419 rc = -EINVAL;
420 goto out;
421 }
422
423 /* special flag one - one byte */
424 if (data[(*packet_size)++] != 0x62) {
425 ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
426 rc = -EINVAL;
427 goto out;
428 }
429
430 /* special flag two - one byte */
431 if (data[(*packet_size)++] != 0x08) {
432 ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
433 rc = -EINVAL;
434 goto out;
435 }
436
437 /* skip the next 12 bytes */
438 (*packet_size) += 12; /* We don't care about the filename or
439 * the timestamp */
440
441 /* get the Tag 11 contents - tag_11_contents_size bytes */
442 memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size));
443 (*packet_size) += (*tag_11_contents_size);
444
445out:
446 if (rc) {
447 (*packet_size) = 0;
448 (*tag_11_contents_size) = 0;
449 }
450 return rc;
451}
452
453/**
454 * decrypt_session_key - Decrypt the session key with the given auth_tok.
455 *
456 * Returns Zero on success; non-zero error otherwise.
457 */
458static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
459 struct ecryptfs_crypt_stat *crypt_stat)
460{
461 int rc = 0;
462 struct ecryptfs_password *password_s_ptr;
463 struct crypto_tfm *tfm = NULL;
464 struct scatterlist src_sg[2], dst_sg[2];
465 struct mutex *tfm_mutex = NULL;
466 /* TODO: Use virt_to_scatterlist for these */
467 char *encrypted_session_key;
468 char *session_key;
469
470 password_s_ptr = &auth_tok->token.password;
471 if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags,
472 ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET))
473 ecryptfs_printk(KERN_DEBUG, "Session key encryption key "
474 "set; skipping key generation\n");
475 ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])"
476 ":\n",
477 password_s_ptr->session_key_encryption_key_bytes);
478 if (ecryptfs_verbosity > 0)
479 ecryptfs_dump_hex(password_s_ptr->session_key_encryption_key,
480 password_s_ptr->
481 session_key_encryption_key_bytes);
482 if (!strcmp(crypt_stat->cipher,
483 crypt_stat->mount_crypt_stat->global_default_cipher_name)
484 && crypt_stat->mount_crypt_stat->global_key_tfm) {
485 tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
486 tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
487 } else {
488 tfm = crypto_alloc_tfm(crypt_stat->cipher,
489 CRYPTO_TFM_REQ_WEAK_KEY);
490 if (!tfm) {
491 printk(KERN_ERR "Error allocating crypto context\n");
492 rc = -ENOMEM;
493 goto out;
494 }
495 }
496 if (password_s_ptr->session_key_encryption_key_bytes
497 < crypto_tfm_alg_min_keysize(tfm)) {
498 printk(KERN_WARNING "Session key encryption key is [%d] bytes; "
499 "minimum keysize for selected cipher is [%d] bytes.\n",
500 password_s_ptr->session_key_encryption_key_bytes,
501 crypto_tfm_alg_min_keysize(tfm));
502 rc = -EINVAL;
503 goto out;
504 }
505 if (tfm_mutex)
506 mutex_lock(tfm_mutex);
507 crypto_cipher_setkey(tfm, password_s_ptr->session_key_encryption_key,
508 crypt_stat->key_size);
509 /* TODO: virt_to_scatterlist */
510 encrypted_session_key = (char *)__get_free_page(GFP_KERNEL);
511 if (!encrypted_session_key) {
512 ecryptfs_printk(KERN_ERR, "Out of memory\n");
513 rc = -ENOMEM;
514 goto out_free_tfm;
515 }
516 session_key = (char *)__get_free_page(GFP_KERNEL);
517 if (!session_key) {
518 kfree(encrypted_session_key);
519 ecryptfs_printk(KERN_ERR, "Out of memory\n");
520 rc = -ENOMEM;
521 goto out_free_tfm;
522 }
523 memcpy(encrypted_session_key, auth_tok->session_key.encrypted_key,
524 auth_tok->session_key.encrypted_key_size);
525 src_sg[0].page = virt_to_page(encrypted_session_key);
526 src_sg[0].offset = 0;
527 BUG_ON(auth_tok->session_key.encrypted_key_size > PAGE_CACHE_SIZE);
528 src_sg[0].length = auth_tok->session_key.encrypted_key_size;
529 dst_sg[0].page = virt_to_page(session_key);
530 dst_sg[0].offset = 0;
531 auth_tok->session_key.decrypted_key_size =
532 auth_tok->session_key.encrypted_key_size;
533 dst_sg[0].length = auth_tok->session_key.encrypted_key_size;
534 /* TODO: Handle error condition */
535 crypto_cipher_decrypt(tfm, dst_sg, src_sg,
536 auth_tok->session_key.encrypted_key_size);
537 auth_tok->session_key.decrypted_key_size =
538 auth_tok->session_key.encrypted_key_size;
539 memcpy(auth_tok->session_key.decrypted_key, session_key,
540 auth_tok->session_key.decrypted_key_size);
541 auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
542 memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
543 auth_tok->session_key.decrypted_key_size);
544 ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
545 ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n");
546 if (ecryptfs_verbosity > 0)
547 ecryptfs_dump_hex(crypt_stat->key,
548 crypt_stat->key_size);
549 memset(encrypted_session_key, 0, PAGE_CACHE_SIZE);
550 free_page((unsigned long)encrypted_session_key);
551 memset(session_key, 0, PAGE_CACHE_SIZE);
552 free_page((unsigned long)session_key);
553out_free_tfm:
554 if (tfm_mutex)
555 mutex_unlock(tfm_mutex);
556 else
557 crypto_free_tfm(tfm);
558out:
559 return rc;
560}
561
562/**
563 * ecryptfs_parse_packet_set
564 * @dest: The header page in memory
565 * @version: Version of file format, to guide parsing behavior
566 *
567 * Get crypt_stat to have the file's session key if the requisite key
568 * is available to decrypt the session key.
569 *
570 * Returns Zero if a valid authentication token was retrieved and
571 * processed; negative value for file not encrypted or for error
572 * conditions.
573 */
574int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
575 unsigned char *src,
576 struct dentry *ecryptfs_dentry)
577{
578 size_t i = 0;
579 int rc = 0;
580 size_t found_auth_tok = 0;
581 size_t next_packet_is_auth_tok_packet;
582 char sig[ECRYPTFS_SIG_SIZE_HEX];
583 struct list_head auth_tok_list;
584 struct list_head *walker;
585 struct ecryptfs_auth_tok *chosen_auth_tok = NULL;
586 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
587 &ecryptfs_superblock_to_private(
588 ecryptfs_dentry->d_sb)->mount_crypt_stat;
589 struct ecryptfs_auth_tok *candidate_auth_tok = NULL;
590 size_t packet_size;
591 struct ecryptfs_auth_tok *new_auth_tok;
592 unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE];
593 size_t tag_11_contents_size;
594 size_t tag_11_packet_size;
595
596 INIT_LIST_HEAD(&auth_tok_list);
597 /* Parse the header to find as many packets as we can; these will
598 * be added to our &auth_tok_list */
599 next_packet_is_auth_tok_packet = 1;
600 while (next_packet_is_auth_tok_packet) {
601 size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
602
603 switch (src[i]) {
604 case ECRYPTFS_TAG_3_PACKET_TYPE:
605 rc = parse_tag_3_packet(crypt_stat,
606 (unsigned char *)&src[i],
607 &auth_tok_list, &new_auth_tok,
608 &packet_size, max_packet_size);
609 if (rc) {
610 ecryptfs_printk(KERN_ERR, "Error parsing "
611 "tag 3 packet\n");
612 rc = -EIO;
613 goto out_wipe_list;
614 }
615 i += packet_size;
616 rc = parse_tag_11_packet((unsigned char *)&src[i],
617 sig_tmp_space,
618 ECRYPTFS_SIG_SIZE,
619 &tag_11_contents_size,
620 &tag_11_packet_size,
621 max_packet_size);
622 if (rc) {
623 ecryptfs_printk(KERN_ERR, "No valid "
624 "(ecryptfs-specific) literal "
625 "packet containing "
626 "authentication token "
627 "signature found after "
628 "tag 3 packet\n");
629 rc = -EIO;
630 goto out_wipe_list;
631 }
632 i += tag_11_packet_size;
633 if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
634 ecryptfs_printk(KERN_ERR, "Expected "
635 "signature of size [%d]; "
636 "read size [%d]\n",
637 ECRYPTFS_SIG_SIZE,
638 tag_11_contents_size);
639 rc = -EIO;
640 goto out_wipe_list;
641 }
642 ecryptfs_to_hex(new_auth_tok->token.password.signature,
643 sig_tmp_space, tag_11_contents_size);
644 new_auth_tok->token.password.signature[
645 ECRYPTFS_PASSWORD_SIG_SIZE] = '\0';
646 ECRYPTFS_SET_FLAG(crypt_stat->flags,
647 ECRYPTFS_ENCRYPTED);
648 break;
649 case ECRYPTFS_TAG_11_PACKET_TYPE:
650 ecryptfs_printk(KERN_WARNING, "Invalid packet set "
651 "(Tag 11 not allowed by itself)\n");
652 rc = -EIO;
653 goto out_wipe_list;
654 break;
655 default:
656 ecryptfs_printk(KERN_DEBUG, "No packet at offset "
657 "[%d] of the file header; hex value of "
658 "character is [0x%.2x]\n", i, src[i]);
659 next_packet_is_auth_tok_packet = 0;
660 }
661 }
662 if (list_empty(&auth_tok_list)) {
663 rc = -EINVAL; /* Do not support non-encrypted files in
664 * the 0.1 release */
665 goto out;
666 }
667 /* If we have a global auth tok, then we should try to use
668 * it */
669 if (mount_crypt_stat->global_auth_tok) {
670 memcpy(sig, mount_crypt_stat->global_auth_tok_sig,
671 ECRYPTFS_SIG_SIZE_HEX);
672 chosen_auth_tok = mount_crypt_stat->global_auth_tok;
673 } else
674 BUG(); /* We should always have a global auth tok in
675 * the 0.1 release */
676 /* Scan list to see if our chosen_auth_tok works */
677 list_for_each(walker, &auth_tok_list) {
678 struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
679 auth_tok_list_item =
680 list_entry(walker, struct ecryptfs_auth_tok_list_item,
681 list);
682 candidate_auth_tok = &auth_tok_list_item->auth_tok;
683 if (unlikely(ecryptfs_verbosity > 0)) {
684 ecryptfs_printk(KERN_DEBUG,
685 "Considering cadidate auth tok:\n");
686 ecryptfs_dump_auth_tok(candidate_auth_tok);
687 }
688 /* TODO: Replace ECRYPTFS_SIG_SIZE_HEX w/ dynamic value */
689 if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD
690 && !strncmp(candidate_auth_tok->token.password.signature,
691 sig, ECRYPTFS_SIG_SIZE_HEX)) {
692 found_auth_tok = 1;
693 goto leave_list;
694 /* TODO: Transfer the common salt into the
695 * crypt_stat salt */
696 }
697 }
698leave_list:
699 if (!found_auth_tok) {
700 ecryptfs_printk(KERN_ERR, "Could not find authentication "
701 "token on temporary list for sig [%.*s]\n",
702 ECRYPTFS_SIG_SIZE_HEX, sig);
703 rc = -EIO;
704 goto out_wipe_list;
705 } else {
706 memcpy(&(candidate_auth_tok->token.password),
707 &(chosen_auth_tok->token.password),
708 sizeof(struct ecryptfs_password));
709 rc = decrypt_session_key(candidate_auth_tok, crypt_stat);
710 if (rc) {
711 ecryptfs_printk(KERN_ERR, "Error decrypting the "
712 "session key\n");
713 goto out_wipe_list;
714 }
715 rc = ecryptfs_compute_root_iv(crypt_stat);
716 if (rc) {
717 ecryptfs_printk(KERN_ERR, "Error computing "
718 "the root IV\n");
719 goto out_wipe_list;
720 }
721 }
722 rc = ecryptfs_init_crypt_ctx(crypt_stat);
723 if (rc) {
724 ecryptfs_printk(KERN_ERR, "Error initializing crypto "
725 "context for cipher [%s]; rc = [%d]\n",
726 crypt_stat->cipher, rc);
727 }
728out_wipe_list:
729 wipe_auth_tok_list(&auth_tok_list);
730out:
731 return rc;
732}
733
734/**
735 * write_tag_11_packet
736 * @dest: Target into which Tag 11 packet is to be written
737 * @max: Maximum packet length
738 * @contents: Byte array of contents to copy in
739 * @contents_length: Number of bytes in contents
740 * @packet_length: Length of the Tag 11 packet written; zero on error
741 *
742 * Returns zero on success; non-zero on error.
743 */
744static int
745write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length,
746 size_t *packet_length)
747{
748 int rc = 0;
749 size_t packet_size_length;
750
751 (*packet_length) = 0;
752 if ((13 + contents_length) > max) {
753 rc = -EINVAL;
754 ecryptfs_printk(KERN_ERR, "Packet length larger than "
755 "maximum allowable\n");
756 goto out;
757 }
758 /* General packet header */
759 /* Packet tag */
760 dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE;
761 /* Packet length */
762 rc = write_packet_length(&dest[(*packet_length)],
763 (13 + contents_length), &packet_size_length);
764 if (rc) {
765 ecryptfs_printk(KERN_ERR, "Error generating tag 11 packet "
766 "header; cannot generate packet length\n");
767 goto out;
768 }
769 (*packet_length) += packet_size_length;
770 /* Tag 11 specific */
771 /* One-octet field that describes how the data is formatted */
772 dest[(*packet_length)++] = 0x62; /* binary data */
773 /* One-octet filename length followed by filename */
774 dest[(*packet_length)++] = 8;
775 memcpy(&dest[(*packet_length)], "_CONSOLE", 8);
776 (*packet_length) += 8;
777 /* Four-octet number indicating modification date */
778 memset(&dest[(*packet_length)], 0x00, 4);
779 (*packet_length) += 4;
780 /* Remainder is literal data */
781 memcpy(&dest[(*packet_length)], contents, contents_length);
782 (*packet_length) += contents_length;
783 out:
784 if (rc)
785 (*packet_length) = 0;
786 return rc;
787}
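
A userspace mock-up of the packet this function produces, for an 8-byte payload (the tag value is an assumption about ECRYPTFS_TAG_11_PACKET_TYPE, and the payload bytes are invented). Note that it writes 14 framing bytes after the length while declaring only 13 + contents_length, matching the off-by-one flagged in the FIXME in parse_tag_11_packet():

#include <stdio.h>
#include <string.h>

#define TAG_11_TYPE 0xED	/* assumed ECRYPTFS_TAG_11_PACKET_TYPE value */

int main(void)
{
	unsigned char pkt[64];
	const unsigned char sig[8] = { 0xde, 0xad, 0xbe, 0xef, 0, 1, 2, 3 };
	size_t n = 0;

	pkt[n++] = TAG_11_TYPE;
	pkt[n++] = 13 + sizeof(sig);	/* declared body length, one-byte form */
	pkt[n++] = 0x62;		/* 'b': binary literal data */
	pkt[n++] = 8;			/* filename length */
	memcpy(&pkt[n], "_CONSOLE", 8);
	n += 8;
	memset(&pkt[n], 0x00, 4);	/* four-octet modification date */
	n += 4;
	memcpy(&pkt[n], sig, sizeof(sig));
	n += sizeof(sig);
	printf("wrote %zu bytes, declared body %u\n", n, pkt[1]);
	return 0;	/* wrote 24 bytes, declared body 21 */
}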
788
789/**
790 * write_tag_3_packet
791 * @dest: Buffer into which to write the packet
792 * @max: Maximum number of bytes that can be written
793 * @auth_tok: Authentication token
794 * @crypt_stat: The cryptographic context
795 * @key_rec: encrypted key
796 * @packet_size: This function will write the number of bytes that end
797 * up constituting the packet; set to zero on error
798 *
799 * Returns zero on success; non-zero on error.
800 */
801static int
802write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok,
803 struct ecryptfs_crypt_stat *crypt_stat,
804 struct ecryptfs_key_record *key_rec, size_t *packet_size)
805{
806 int rc = 0;
807
808 size_t i;
809 size_t signature_is_valid = 0;
810 size_t encrypted_session_key_valid = 0;
811 char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
812 struct scatterlist dest_sg[2];
813 struct scatterlist src_sg[2];
814 struct crypto_tfm *tfm = NULL;
815 struct mutex *tfm_mutex = NULL;
816 size_t key_rec_size;
817 size_t packet_size_length;
818 size_t cipher_code;
819
820 (*packet_size) = 0;
821 /* Check for a valid signature on the auth_tok */
822 for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++)
823 signature_is_valid |= auth_tok->token.password.signature[i];
824 if (!signature_is_valid)
825 BUG();
826 ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature,
827 ECRYPTFS_SIG_SIZE);
828 encrypted_session_key_valid = 0;
829 for (i = 0; i < crypt_stat->key_size; i++)
830 encrypted_session_key_valid |=
831 auth_tok->session_key.encrypted_key[i];
832 if (encrypted_session_key_valid) {
833 memcpy((*key_rec).enc_key,
834 auth_tok->session_key.encrypted_key,
835 auth_tok->session_key.encrypted_key_size);
836 goto encrypted_session_key_set;
837 }
838 if (auth_tok->session_key.encrypted_key_size == 0)
839 auth_tok->session_key.encrypted_key_size =
840 crypt_stat->key_size;
841 if (crypt_stat->key_size == 24
842 && strcmp("aes", crypt_stat->cipher) == 0) {
843 memset((crypt_stat->key + 24), 0, 8);
844 auth_tok->session_key.encrypted_key_size = 32;
845 }
846 (*key_rec).enc_key_size =
847 auth_tok->session_key.encrypted_key_size;
848 if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
849 ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) {
850 ecryptfs_printk(KERN_DEBUG, "Using previously generated "
851 "session key encryption key of size [%d]\n",
852 auth_tok->token.password.
853 session_key_encryption_key_bytes);
854 memcpy(session_key_encryption_key,
855 auth_tok->token.password.session_key_encryption_key,
856 crypt_stat->key_size);
857 ecryptfs_printk(KERN_DEBUG,
858 "Cached session key " "encryption key: \n");
859 if (ecryptfs_verbosity > 0)
860 ecryptfs_dump_hex(session_key_encryption_key, 16);
861 }
862 if (unlikely(ecryptfs_verbosity > 0)) {
863 ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n");
864 ecryptfs_dump_hex(session_key_encryption_key, 16);
865 }
866 rc = virt_to_scatterlist(crypt_stat->key,
867 (*key_rec).enc_key_size, src_sg, 2);
868 if (!rc) {
869 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
870 "for crypt_stat session key\n");
871 rc = -ENOMEM;
872 goto out;
873 }
874 rc = virt_to_scatterlist((*key_rec).enc_key,
875 (*key_rec).enc_key_size, dest_sg, 2);
876 if (!rc) {
877 ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
878 "for crypt_stat encrypted session key\n");
879 rc = -ENOMEM;
880 goto out;
881 }
882 if (!strcmp(crypt_stat->cipher,
883 crypt_stat->mount_crypt_stat->global_default_cipher_name)
884 && crypt_stat->mount_crypt_stat->global_key_tfm) {
885 tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
886 tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
887 } else
888 tfm = crypto_alloc_tfm(crypt_stat->cipher, 0);
889 if (!tfm) {
890 ecryptfs_printk(KERN_ERR, "Could not initialize crypto "
891 "context for cipher [%s]\n",
892 crypt_stat->cipher);
893 rc = -EINVAL;
894 goto out;
895 }
896 if (tfm_mutex)
897 mutex_lock(tfm_mutex);
898 rc = crypto_cipher_setkey(tfm, session_key_encryption_key,
899 crypt_stat->key_size);
900 if (rc < 0) {
901 if (tfm_mutex)
902 mutex_unlock(tfm_mutex);
903 ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
904 "context\n");
905 goto out;
906 }
907 rc = 0;
908 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
909 crypt_stat->key_size);
910 crypto_cipher_encrypt(tfm, dest_sg, src_sg,
911 (*key_rec).enc_key_size);
912 if (tfm_mutex)
913 mutex_unlock(tfm_mutex);
914 ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
915 if (ecryptfs_verbosity > 0)
916 ecryptfs_dump_hex((*key_rec).enc_key,
917 (*key_rec).enc_key_size);
918encrypted_session_key_set:
919 /* Now we have a valid key_rec. Append it to the
920 * key_rec set. */
921 key_rec_size = (sizeof(struct ecryptfs_key_record)
922 - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES
923 + ((*key_rec).enc_key_size));
924 /* TODO: Include a packet size limit as a parameter to this
925 * function once we have multi-packet headers (for versions
926 * later than 0.1) */
927 if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) {
928 ecryptfs_printk(KERN_ERR, "Keyset too large\n");
929 rc = -EINVAL;
930 goto out;
931 }
932 /* TODO: Packet size limit */
933 /* We have 5 bytes of surrounding packet data */
934 if ((0x05 + ECRYPTFS_SALT_SIZE
935 + (*key_rec).enc_key_size) >= max) {
936 ecryptfs_printk(KERN_ERR, "Authentication token is too "
937 "large\n");
938 rc = -EINVAL;
939 goto out;
940 }
941 /* This format is inspired by OpenPGP; see RFC 2440
942 * packet tag 3 */
943 dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE;
944 /* ver+cipher+s2k+hash+salt+iter+enc_key */
945 rc = write_packet_length(&dest[(*packet_size)],
946 (0x05 + ECRYPTFS_SALT_SIZE
947 + (*key_rec).enc_key_size),
948 &packet_size_length);
949 if (rc) {
950 ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet "
951 "header; cannot generate packet length\n");
952 goto out;
953 }
954 (*packet_size) += packet_size_length;
955 dest[(*packet_size)++] = 0x04; /* version 4 */
956 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
957 if (cipher_code == 0) {
958 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
959 "cipher [%s]\n", crypt_stat->cipher);
960 rc = -EINVAL;
961 goto out;
962 }
963 dest[(*packet_size)++] = cipher_code;
964 dest[(*packet_size)++] = 0x03; /* S2K */
965 dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */
966 memcpy(&dest[(*packet_size)], auth_tok->token.password.salt,
967 ECRYPTFS_SALT_SIZE);
968 (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */
969 dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */
970 memcpy(&dest[(*packet_size)], (*key_rec).enc_key,
971 (*key_rec).enc_key_size);
972 (*packet_size) += (*key_rec).enc_key_size;
973out:
974 if (tfm && !tfm_mutex)
975 crypto_free_tfm(tfm);
976 if (rc)
977 (*packet_size) = 0;
978 return rc;
979}
980
981/**
982 * ecryptfs_generate_key_packet_set
983 * @dest: Virtual address from which to write the key record set
984 * @crypt_stat: The cryptographic context from which the
985 * authentication tokens will be retrieved
986 * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat
987 * for the global parameters
988 * @len: The amount written
989 * @max: The maximum amount of data allowed to be written
990 *
991 * Generates a key packet set and writes it to the virtual address
992 * passed in.
993 *
994 * Returns zero on success; non-zero on error.
995 */
996int
997ecryptfs_generate_key_packet_set(char *dest_base,
998 struct ecryptfs_crypt_stat *crypt_stat,
999 struct dentry *ecryptfs_dentry, size_t *len,
1000 size_t max)
1001{
1002 int rc = 0;
1003 struct ecryptfs_auth_tok *auth_tok;
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1005 &ecryptfs_superblock_to_private(
1006 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1007 size_t written;
1008 struct ecryptfs_key_record key_rec;
1009
1010 (*len) = 0;
1011 if (mount_crypt_stat->global_auth_tok) {
1012 auth_tok = mount_crypt_stat->global_auth_tok;
1013 if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
1014 rc = write_tag_3_packet((dest_base + (*len)),
1015 max, auth_tok,
1016 crypt_stat, &key_rec,
1017 &written);
1018 if (rc) {
1019 ecryptfs_printk(KERN_WARNING, "Error "
1020 "writing tag 3 packet\n");
1021 goto out;
1022 }
1023 (*len) += written;
1024 /* Write auth tok signature packet */
1025 rc = write_tag_11_packet(
1026 (dest_base + (*len)),
1027 (max - (*len)),
1028 key_rec.sig, ECRYPTFS_SIG_SIZE, &written);
1029 if (rc) {
1030 ecryptfs_printk(KERN_ERR, "Error writing "
1031 "auth tok signature packet\n");
1032 goto out;
1033 }
1034 (*len) += written;
1035 } else {
1036 ecryptfs_printk(KERN_WARNING, "Unsupported "
1037 "authentication token type\n");
1038 rc = -EINVAL;
1039 goto out;
1040 }
1041 if (rc) {
1042 ecryptfs_printk(KERN_WARNING, "Error writing "
1043 "authentication token packet with sig "
1044 "= [%s]\n",
1045 mount_crypt_stat->global_auth_tok_sig);
1046 rc = -EIO;
1047 goto out;
1048 }
1049 } else
1050 BUG();
1051 if (likely((max - (*len)) > 0)) {
1052 dest_base[(*len)] = 0x00;
1053 } else {
1054 ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
1055 rc = -EIO;
1056 }
1057out:
1058 if (rc)
1059 (*len) = 0;
1060 return rc;
1061}
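
Putting write_tag_3_packet() and write_tag_11_packet() together, the key packet set at the front of each lower file in this release has the following byte layout (a summary of the code above, not a normative specification):

Tag 3:  [type] [length] [0x04 version] [cipher code] [0x03 S2K] [0x01 MD5]
        [salt: ECRYPTFS_SALT_SIZE bytes] [0x60 iterations] [encrypted key]
Tag 11: [type] [length] [0x62] [0x08] ["_CONSOLE"] [4 x 0x00 date]
        [key signature: ECRYPTFS_SIG_SIZE bytes]
Final:  [0x00 boundary byte]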
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
new file mode 100644
index 000000000000..5938a232d11b
--- /dev/null
+++ b/fs/ecryptfs/main.c
@@ -0,0 +1,828 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/dcache.h>
27#include <linux/file.h>
28#include <linux/module.h>
29#include <linux/namei.h>
30#include <linux/skbuff.h>
31#include <linux/crypto.h>
32#include <linux/netlink.h>
33#include <linux/mount.h>
34#include <linux/dcache.h>
35#include <linux/pagemap.h>
36#include <linux/key.h>
37#include <linux/parser.h>
38#include "ecryptfs_kernel.h"
39
40/**
41 * Module parameter that defines the ecryptfs_verbosity level.
42 */
43int ecryptfs_verbosity = 0;
44
45module_param(ecryptfs_verbosity, int, 0);
46MODULE_PARM_DESC(ecryptfs_verbosity,
47 "Initial verbosity level (0 or 1; defaults to "
48 "0, which is Quiet)");
49
50void __ecryptfs_printk(const char *fmt, ...)
51{
52 va_list args;
53 va_start(args, fmt);
54 if (fmt[1] == '7') { /* KERN_DEBUG */
55 if (ecryptfs_verbosity >= 1)
56 vprintk(fmt, args);
57 } else
58 vprintk(fmt, args);
59 va_end(args);
60}
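
The fmt[1] == '7' test relies on the KERN_* prefixes of this kernel generation being literal strings pasted onto the format ("<7>" for KERN_DEBUG); for example:

#include <stdio.h>

#define KERN_DEBUG "<7>"	/* historical definition assumed by the check */

int main(void)
{
	const char *fmt = KERN_DEBUG "eCryptfs: example message\n";

	printf("debug-level message: %s\n", fmt[1] == '7' ? "yes" : "no");
	return 0;
}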
61
62/**
63 * ecryptfs_interpose
64 * @lower_dentry: Existing dentry in the lower filesystem
65 * @dentry: ecryptfs' dentry
66 * @sb: ecryptfs's super_block
67 * @flag: If set to true, then d_add is called, else d_instantiate is called
68 *
69 * Interposes upper and lower dentries.
70 *
71 * Returns zero on success; non-zero otherwise
72 */
73int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
74 struct super_block *sb, int flag)
75{
76 struct inode *lower_inode;
77 struct inode *inode;
78 int rc = 0;
79
80 lower_inode = lower_dentry->d_inode;
81 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
82 rc = -EXDEV;
83 goto out;
84 }
85 if (!igrab(lower_inode)) {
86 rc = -ESTALE;
87 goto out;
88 }
89 inode = iget5_locked(sb, (unsigned long)lower_inode,
90 ecryptfs_inode_test, ecryptfs_inode_set,
91 lower_inode);
92 if (!inode) {
93 rc = -EACCES;
94 iput(lower_inode);
95 goto out;
96 }
97 if (inode->i_state & I_NEW)
98 unlock_new_inode(inode);
99 else
100 iput(lower_inode);
101 if (S_ISLNK(lower_inode->i_mode))
102 inode->i_op = &ecryptfs_symlink_iops;
103 else if (S_ISDIR(lower_inode->i_mode))
104 inode->i_op = &ecryptfs_dir_iops;
105 if (S_ISDIR(lower_inode->i_mode))
106 inode->i_fop = &ecryptfs_dir_fops;
107 if (special_file(lower_inode->i_mode))
108 init_special_inode(inode, lower_inode->i_mode,
109 lower_inode->i_rdev);
110 dentry->d_op = &ecryptfs_dops;
111 if (flag)
112 d_add(dentry, inode);
113 else
114 d_instantiate(dentry, inode);
115 ecryptfs_copy_attr_all(inode, lower_inode);
116 /* This size will be overwritten for real files w/ headers and
117 * other metadata */
118 ecryptfs_copy_inode_size(inode, lower_inode);
119out:
120 return rc;
121}
122
123enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug,
124 ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher,
125 ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes,
126 ecryptfs_opt_passthrough, ecryptfs_opt_err };
127
128static match_table_t tokens = {
129 {ecryptfs_opt_sig, "sig=%s"},
130 {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
131 {ecryptfs_opt_debug, "debug=%u"},
132 {ecryptfs_opt_ecryptfs_debug, "ecryptfs_debug=%u"},
133 {ecryptfs_opt_cipher, "cipher=%s"},
134 {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"},
135 {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"},
136 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
137 {ecryptfs_opt_err, NULL}
138};
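For illustration, an options string that this match table accepts could look like the following (the sig value is a placeholder, not a real key description):

        sig=0123456789abcdef,cipher=aes,ecryptfs_key_bytes=16,ecryptfs_passthrough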
139
140/**
141 * ecryptfs_verify_version
142 * @version: The version number to confirm
143 *
144 * Returns zero on good version; non-zero otherwise
145 */
146static int ecryptfs_verify_version(u16 version)
147{
148 int rc = 0;
149 unsigned char major;
150 unsigned char minor;
151
152 major = ((version >> 8) & 0xFF);
153 minor = (version & 0xFF);
154 if (major != ECRYPTFS_VERSION_MAJOR) {
155 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
156 "Expected [%d]; got [%d]\n",
157 ECRYPTFS_VERSION_MAJOR, major);
158 rc = -EINVAL;
159 goto out;
160 }
161 if (minor != ECRYPTFS_VERSION_MINOR) {
162 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
163 "Expected [%d]; got [%d]\n",
164 ECRYPTFS_VERSION_MINOR, minor);
165 rc = -EINVAL;
166 goto out;
167 }
168out:
169 return rc;
170}
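The check above assumes the 16-bit version word carries the major number in the high byte and the minor number in the low byte; a matching userspace tool would pack it as in this sketch:

        u16 version = (ECRYPTFS_VERSION_MAJOR << 8) | ECRYPTFS_VERSION_MINOR;
        unsigned char major = (version >> 8) & 0xFF;    /* high byte */
        unsigned char minor = version & 0xFF;           /* low byte  */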
171
172/**
173 * ecryptfs_parse_options
174 * @sb: The ecryptfs super block
175 * @options: The options passed to the kernel
176 *
177 * Parse mount options:
178 * debug=N - ecryptfs_verbosity level for debug output
179 * sig=XXX - description(signature) of the key to use
180 *
181 * The lower-level (lower/interposed) directory is named by the mount
182 * source (dev_name); we mount our stackable file system on top of
183 * that lower directory (see ecryptfs_read_super()).
184 *
185 * The signature of the key to use must be the description of a key
186 * already in the keyring. Mounting will fail if the key cannot be
187 * found.
188 *
189 * Returns zero on success; non-zero on error
190 */
191static int ecryptfs_parse_options(struct super_block *sb, char *options)
192{
193 char *p;
194 int rc = 0;
195 int sig_set = 0;
196 int cipher_name_set = 0;
197 int cipher_key_bytes;
198 int cipher_key_bytes_set = 0;
199 struct key *auth_tok_key = NULL;
200 struct ecryptfs_auth_tok *auth_tok = NULL;
201 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
202 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
203 substring_t args[MAX_OPT_ARGS];
204 int token;
205 char *sig_src;
206 char *sig_dst;
207 char *debug_src;
208 char *cipher_name_dst;
209 char *cipher_name_src;
210 char *cipher_key_bytes_src;
211 struct crypto_tfm *tmp_tfm;
212 int cipher_name_len;
213
214 if (!options) {
215 rc = -EINVAL;
216 goto out;
217 }
218 while ((p = strsep(&options, ",")) != NULL) {
219 if (!*p)
220 continue;
221 token = match_token(p, tokens, args);
222 switch (token) {
223 case ecryptfs_opt_sig:
224 case ecryptfs_opt_ecryptfs_sig:
225 sig_src = args[0].from;
226 sig_dst =
227 mount_crypt_stat->global_auth_tok_sig;
228 memcpy(sig_dst, sig_src, ECRYPTFS_SIG_SIZE_HEX);
229 sig_dst[ECRYPTFS_SIG_SIZE_HEX] = '\0';
230 ecryptfs_printk(KERN_DEBUG,
231 "The mount_crypt_stat "
232 "global_auth_tok_sig set to: "
233 "[%s]\n", sig_dst);
234 sig_set = 1;
235 break;
236 case ecryptfs_opt_debug:
237 case ecryptfs_opt_ecryptfs_debug:
238 debug_src = args[0].from;
239 ecryptfs_verbosity =
240 (int)simple_strtol(debug_src, &debug_src,
241 0);
242 ecryptfs_printk(KERN_DEBUG,
243 "Verbosity set to [%d]\n",
244 ecryptfs_verbosity);
245 break;
246 case ecryptfs_opt_cipher:
247 case ecryptfs_opt_ecryptfs_cipher:
248 cipher_name_src = args[0].from;
249 cipher_name_dst =
250 mount_crypt_stat->
251 global_default_cipher_name;
252 strncpy(cipher_name_dst, cipher_name_src,
253 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
254 ecryptfs_printk(KERN_DEBUG,
255 "The mount_crypt_stat "
256 "global_default_cipher_name set to: "
257 "[%s]\n", cipher_name_dst);
258 cipher_name_set = 1;
259 break;
260 case ecryptfs_opt_ecryptfs_key_bytes:
261 cipher_key_bytes_src = args[0].from;
262 cipher_key_bytes =
263 (int)simple_strtol(cipher_key_bytes_src,
264 &cipher_key_bytes_src, 0);
265 mount_crypt_stat->global_default_cipher_key_size =
266 cipher_key_bytes;
267 ecryptfs_printk(KERN_DEBUG,
268 "The mount_crypt_stat "
269 "global_default_cipher_key_size "
270 "set to: [%d]\n", mount_crypt_stat->
271 global_default_cipher_key_size);
272 cipher_key_bytes_set = 1;
273 break;
274 case ecryptfs_opt_passthrough:
275 mount_crypt_stat->flags |=
276 ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED;
277 break;
278 case ecryptfs_opt_err:
279 default:
280 ecryptfs_printk(KERN_WARNING,
281 "eCryptfs: unrecognized option '%s'\n",
282 p);
283 }
284 }
285 /* A mount-wide passphrase signature is mandatory in the 0.1
286 * release */
287 if (!sig_set) {
288 rc = -EINVAL;
289 ecryptfs_printk(KERN_ERR, "You must supply a valid "
290 "passphrase auth tok signature as a mount "
291 "parameter; see the eCryptfs README\n");
292 goto out;
293 }
294 if (!cipher_name_set) {
295 cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
296 if (unlikely(cipher_name_len
297 >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
298 rc = -EINVAL;
299 BUG();
300 goto out;
301 }
302 memcpy(mount_crypt_stat->global_default_cipher_name,
303 ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
304 mount_crypt_stat->global_default_cipher_name[cipher_name_len]
305 = '\0';
306 }
307 if (!cipher_key_bytes_set) {
308 mount_crypt_stat->global_default_cipher_key_size =
309 ECRYPTFS_DEFAULT_KEY_BYTES;
310 ecryptfs_printk(KERN_DEBUG, "Cipher key size was not "
311 "specified. Defaulting to [%d]\n",
312 mount_crypt_stat->
313 global_default_cipher_key_size);
314 }
315 rc = ecryptfs_process_cipher(
316 &tmp_tfm,
317 &mount_crypt_stat->global_key_tfm,
318 mount_crypt_stat->global_default_cipher_name,
319 mount_crypt_stat->global_default_cipher_key_size);
320 if (tmp_tfm)
321 crypto_free_tfm(tmp_tfm);
322 if (rc) {
323 printk(KERN_ERR "Error attempting to initialize cipher [%s] "
324 "with key size [%Zd] bytes; rc = [%d]\n",
325 mount_crypt_stat->global_default_cipher_name,
326 mount_crypt_stat->global_default_cipher_key_size, rc);
327 rc = -EINVAL;
328 goto out;
329 }
330 mutex_init(&mount_crypt_stat->global_key_tfm_mutex);
331 ecryptfs_printk(KERN_DEBUG, "Requesting the key with description: "
332 "[%s]\n", mount_crypt_stat->global_auth_tok_sig);
333 /* The reference to this key is held until umount; the
334 * call to key_put() is done in ecryptfs_put_super() */
335 auth_tok_key = request_key(&key_type_user,
336 mount_crypt_stat->global_auth_tok_sig,
337 NULL);
338 if (!auth_tok_key || IS_ERR(auth_tok_key)) {
339 ecryptfs_printk(KERN_ERR, "Could not find key with "
340 "description: [%s]\n",
341 mount_crypt_stat->global_auth_tok_sig);
342 process_request_key_err(PTR_ERR(auth_tok_key));
343 rc = -EINVAL;
344 goto out;
345 }
346 auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
347 if (ecryptfs_verify_version(auth_tok->version)) {
348 ecryptfs_printk(KERN_ERR, "Data structure version mismatch. "
349 "Userspace tools must match eCryptfs kernel "
350 "module with major version [%d] and minor "
351 "version [%d]\n", ECRYPTFS_VERSION_MAJOR,
352 ECRYPTFS_VERSION_MINOR);
353 rc = -EINVAL;
354 goto out;
355 }
356 if (auth_tok->token_type != ECRYPTFS_PASSWORD) {
357 ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure "
358 "returned from key\n");
359 rc = -EINVAL;
360 goto out;
361 }
362 mount_crypt_stat->global_auth_tok_key = auth_tok_key;
363 mount_crypt_stat->global_auth_tok = auth_tok;
364out:
365 return rc;
366}
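Assuming a passphrase auth tok has already been added to the user keyring under the (made-up) description "0123456789abcdef", a mount(2) call that exercises this parser might look like the following sketch; paths and the sig value are hypothetical:

        /* The options string is what ecryptfs_parse_options() receives
         * as its raw_data-derived argument. */
        mount("/lower/dir", "/mnt/ecryptfs", "ecryptfs", 0,
              "sig=0123456789abcdef,cipher=aes,ecryptfs_key_bytes=16");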
367
368struct kmem_cache *ecryptfs_sb_info_cache;
369
370/**
371 * ecryptfs_fill_super
372 * @sb: The ecryptfs super block
373 * @raw_data: The options passed to mount
374 * @silent: Not used but required by function prototype
375 *
376 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
377 *
378 * Returns zero on success; non-zero otherwise
379 */
380static int
381ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
382{
383 int rc = 0;
384
385 /* Released in ecryptfs_put_super() */
386 ecryptfs_set_superblock_private(sb,
387 kmem_cache_alloc(ecryptfs_sb_info_cache,
388 SLAB_KERNEL));
389 if (!ecryptfs_superblock_to_private(sb)) {
390 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
391 rc = -ENOMEM;
392 goto out;
393 }
394 memset(ecryptfs_superblock_to_private(sb), 0,
395 sizeof(struct ecryptfs_sb_info));
396 sb->s_op = &ecryptfs_sops;
397 /* Released through deactivate_super(sb) from get_sb_nodev */
398 sb->s_root = d_alloc(NULL, &(const struct qstr) {
399 .hash = 0, .name = "/", .len = 1 });
400 if (!sb->s_root) {
401 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
402 rc = -ENOMEM;
403 goto out;
404 }
405 sb->s_root->d_op = &ecryptfs_dops;
406 sb->s_root->d_sb = sb;
407 sb->s_root->d_parent = sb->s_root;
408 /* Released in d_release when dput(sb->s_root) is called */
409 /* through deactivate_super(sb) from get_sb_nodev() */
410 ecryptfs_set_dentry_private(sb->s_root,
411 kmem_cache_alloc(ecryptfs_dentry_info_cache,
412 SLAB_KERNEL));
413 if (!ecryptfs_dentry_to_private(sb->s_root)) {
414 ecryptfs_printk(KERN_ERR,
415 "dentry_info_cache alloc failed\n");
416 rc = -ENOMEM;
417 goto out;
418 }
419 memset(ecryptfs_dentry_to_private(sb->s_root), 0,
420 sizeof(struct ecryptfs_dentry_info));
421 rc = 0;
422out:
423 /* Should be able to rely on deactivate_super called from
424 * get_sb_nodev */
425 return rc;
426}
427
428/**
429 * ecryptfs_read_super
430 * @sb: The ecryptfs super block
431 * @dev_name: The path to mount over
432 *
433 * Read the super block of the lower filesystem, and use
434 * ecryptfs_interpose to create our initial inode and super block
435 * struct.
436 */
437static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
438{
439 int rc;
440 struct nameidata nd;
441 struct dentry *lower_root;
442 struct vfsmount *lower_mnt;
443
444 memset(&nd, 0, sizeof(struct nameidata));
445 rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
446 if (rc) {
447 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
448 goto out_free;
449 }
450 lower_root = nd.dentry;
451 if (!lower_root->d_inode) {
452 ecryptfs_printk(KERN_WARNING,
453 "No directory to interpose on\n");
454 rc = -ENOENT;
455 goto out_free;
456 }
457 lower_mnt = nd.mnt;
458 ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
459 sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
460 ecryptfs_set_dentry_lower(sb->s_root, lower_root);
461 ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
462 if ((rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0)))
463 goto out_free;
464 rc = 0;
465 goto out;
466out_free:
467 path_release(&nd);
468out:
469 return rc;
470}
471
472/**
473 * ecryptfs_get_sb
474 * @fs_type: The filesystem type
475 * @flags: Mount flags
476 * @dev_name: The path to mount over
477 * @raw_data: The options passed into the kernel
478 *
479 * The whole ecryptfs_get_sb process is broken into 4 functions:
480 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
481 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
482 * with as much information as it can before needing
483 * the lower filesystem.
484 * ecryptfs_read_super(): this accesses the lower filesystem and uses
485 * ecryptfs_interpose to perform most of the linking
486 * ecryptfs_interpose(): links the lower filesystem into ecryptfs
487 */
488static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *raw_data,
490 struct vfsmount *mnt)
491{
492 int rc;
493 struct super_block *sb;
494
495 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
496 if (rc < 0) {
497 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
498 goto out;
499 }
500 sb = mnt->mnt_sb;
501 rc = ecryptfs_parse_options(sb, raw_data);
502 if (rc) {
503 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
504 goto out_abort;
505 }
506 rc = ecryptfs_read_super(sb, dev_name);
507 if (rc) {
508 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
509 goto out_abort;
510 }
511 goto out;
512out_abort:
513 dput(sb->s_root);
514 up_write(&sb->s_umount);
515 deactivate_super(sb);
516out:
517 return rc;
518}
519
520/**
521 * ecryptfs_kill_block_super
522 * @sb: The ecryptfs super block
523 *
524 * Used to bring the superblock down and free the private data.
525 * Private data is freed in ecryptfs_put_super()
526 */
527static void ecryptfs_kill_block_super(struct super_block *sb)
528{
529 generic_shutdown_super(sb);
530}
531
532static struct file_system_type ecryptfs_fs_type = {
533 .owner = THIS_MODULE,
534 .name = "ecryptfs",
535 .get_sb = ecryptfs_get_sb,
536 .kill_sb = ecryptfs_kill_block_super,
537 .fs_flags = 0
538};
539
540/**
541 * inode_info_init_once
542 *
543 * Initializes the ecryptfs_inode_info_cache when it is created
544 */
545static void
546inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
547{
548 struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
549
550 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
551 SLAB_CTOR_CONSTRUCTOR)
552 inode_init_once(&ei->vfs_inode);
553}
554
555static struct ecryptfs_cache_info {
556 kmem_cache_t **cache;
557 const char *name;
558 size_t size;
559 void (*ctor)(void*, struct kmem_cache *, unsigned long);
560} ecryptfs_cache_infos[] = {
561 {
562 .cache = &ecryptfs_auth_tok_list_item_cache,
563 .name = "ecryptfs_auth_tok_list_item",
564 .size = sizeof(struct ecryptfs_auth_tok_list_item),
565 },
566 {
567 .cache = &ecryptfs_file_info_cache,
568 .name = "ecryptfs_file_cache",
569 .size = sizeof(struct ecryptfs_file_info),
570 },
571 {
572 .cache = &ecryptfs_dentry_info_cache,
573 .name = "ecryptfs_dentry_info_cache",
574 .size = sizeof(struct ecryptfs_dentry_info),
575 },
576 {
577 .cache = &ecryptfs_inode_info_cache,
578 .name = "ecryptfs_inode_cache",
579 .size = sizeof(struct ecryptfs_inode_info),
580 .ctor = inode_info_init_once,
581 },
582 {
583 .cache = &ecryptfs_sb_info_cache,
584 .name = "ecryptfs_sb_cache",
585 .size = sizeof(struct ecryptfs_sb_info),
586 },
587 {
588 .cache = &ecryptfs_header_cache_0,
589 .name = "ecryptfs_headers_0",
590 .size = PAGE_CACHE_SIZE,
591 },
592 {
593 .cache = &ecryptfs_header_cache_1,
594 .name = "ecryptfs_headers_1",
595 .size = PAGE_CACHE_SIZE,
596 },
597 {
598 .cache = &ecryptfs_header_cache_2,
599 .name = "ecryptfs_headers_2",
600 .size = PAGE_CACHE_SIZE,
601 },
602 {
603 .cache = &ecryptfs_lower_page_cache,
604 .name = "ecryptfs_lower_page_cache",
605 .size = PAGE_CACHE_SIZE,
606 },
607};
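Extending this table-driven registry is a one-entry change; a hypothetical example entry follows (both the cache pointer and the struct are made up for illustration):

        {
                .cache = &ecryptfs_example_info_cache,  /* hypothetical */
                .name = "ecryptfs_example_cache",
                .size = sizeof(struct ecryptfs_example_info),
        },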
608
609static void ecryptfs_free_kmem_caches(void)
610{
611 int i;
612
613 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
614 struct ecryptfs_cache_info *info;
615
616 info = &ecryptfs_cache_infos[i];
617 if (*(info->cache))
618 kmem_cache_destroy(*(info->cache));
619 }
620}
621
622/**
623 * ecryptfs_init_kmem_caches
624 *
625 * Returns zero on success; non-zero otherwise
626 */
627static int ecryptfs_init_kmem_caches(void)
628{
629 int i;
630
631 for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
632 struct ecryptfs_cache_info *info;
633
634 info = &ecryptfs_cache_infos[i];
635 *(info->cache) = kmem_cache_create(info->name, info->size,
636 0, SLAB_HWCACHE_ALIGN, info->ctor, NULL);
637 if (!*(info->cache)) {
638 ecryptfs_free_kmem_caches();
639 ecryptfs_printk(KERN_WARNING, "%s: "
640 "kmem_cache_create failed\n",
641 info->name);
642 return -ENOMEM;
643 }
644 }
645 return 0;
646}
647
648struct ecryptfs_obj {
649 char *name;
650 struct list_head slot_list;
651 struct kobject kobj;
652};
653
654struct ecryptfs_attribute {
655 struct attribute attr;
656 ssize_t(*show) (struct ecryptfs_obj *, char *);
657 ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
658};
659
660static ssize_t
661ecryptfs_attr_store(struct kobject *kobj,
662 struct attribute *attr, const char *buf, size_t len)
663{
664 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
665 kobj);
666 struct ecryptfs_attribute *attribute =
667 container_of(attr, struct ecryptfs_attribute, attr);
668
669 return (attribute->store ? attribute->store(obj, buf, len) : 0);
670}
671
672static ssize_t
673ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
674{
675 struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
676 kobj);
677 struct ecryptfs_attribute *attribute =
678 container_of(attr, struct ecryptfs_attribute, attr);
679
680 return (attribute->show ? attribute->show(obj, buf) : 0);
681}
682
683static struct sysfs_ops ecryptfs_sysfs_ops = {
684 .show = ecryptfs_attr_show,
685 .store = ecryptfs_attr_store
686};
687
688static struct kobj_type ecryptfs_ktype = {
689 .sysfs_ops = &ecryptfs_sysfs_ops
690};
691
692static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
693
694static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
695{
696 return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
697}
698
699static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
700
701struct ecryptfs_version_str_map_elem {
702 u32 flag;
703 char *str;
704} ecryptfs_version_str_map[] = {
705 {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
706 {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
707 {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
708 {ECRYPTFS_VERSIONING_POLICY, "policy"}
709};
710
711static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
712{
713 int i;
714 int remaining = PAGE_SIZE;
715 int total_written = 0;
716
717 buff[0] = '\0';
718 for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
719 int entry_size;
720
721 if (!(ECRYPTFS_VERSIONING_MASK
722 & ecryptfs_version_str_map[i].flag))
723 continue;
724 entry_size = strlen(ecryptfs_version_str_map[i].str);
725 if ((entry_size + 2) > remaining)
726 goto out;
727 memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
728 buff[entry_size++] = '\n';
729 buff[entry_size] = '\0';
730 buff += entry_size;
731 total_written += entry_size;
732 remaining -= entry_size;
733 }
734out:
735 return total_written;
736}
737
738static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
739
740static int do_sysfs_registration(void)
741{
742 int rc;
743
744 if ((rc = subsystem_register(&ecryptfs_subsys))) {
745 printk(KERN_ERR
746 "Unable to register ecryptfs sysfs subsystem\n");
747 goto out;
748 }
749 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
750 &sysfs_attr_version.attr);
751 if (rc) {
752 printk(KERN_ERR
753 "Unable to create ecryptfs version attribute\n");
754 subsystem_unregister(&ecryptfs_subsys);
755 goto out;
756 }
757 rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
758 &sysfs_attr_version_str.attr);
759 if (rc) {
760 printk(KERN_ERR
761 "Unable to create ecryptfs version_str attribute\n");
762 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
763 &sysfs_attr_version.attr);
764 subsystem_unregister(&ecryptfs_subsys);
765 goto out;
766 }
767out:
768 return rc;
769}
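Given the decl_subsys()/fs_subsys wiring in this file, the two attributes should surface under /sys/fs/ecryptfs once registration succeeds (paths assumed from that wiring):

        /sys/fs/ecryptfs/version      (numeric ECRYPTFS_VERSIONING_MASK)
        /sys/fs/ecryptfs/version_str  (one supported feature name per line)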
770
771static int __init ecryptfs_init(void)
772{
773 int rc;
774
775 if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
776 rc = -EINVAL;
777 ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
778 "larger than the host's page size, and so "
779 "eCryptfs cannot run on this system. The "
780 "default eCryptfs extent size is [%d] bytes; "
781 "the page size is [%d] bytes.\n",
782 ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
783 goto out;
784 }
785 rc = ecryptfs_init_kmem_caches();
786 if (rc) {
787 printk(KERN_ERR
788 "Failed to allocate one or more kmem_cache objects\n");
789 goto out;
790 }
791 rc = register_filesystem(&ecryptfs_fs_type);
792 if (rc) {
793 printk(KERN_ERR "Failed to register filesystem\n");
794 ecryptfs_free_kmem_caches();
795 goto out;
796 }
797 kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
798 sysfs_attr_version.attr.owner = THIS_MODULE;
799 sysfs_attr_version_str.attr.owner = THIS_MODULE;
800 rc = do_sysfs_registration();
801 if (rc) {
802 printk(KERN_ERR "sysfs registration failed\n");
803 unregister_filesystem(&ecryptfs_fs_type);
804 ecryptfs_free_kmem_caches();
805 goto out;
806 }
807out:
808 return rc;
809}
810
811static void __exit ecryptfs_exit(void)
812{
813 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
814 &sysfs_attr_version.attr);
815 sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
816 &sysfs_attr_version_str.attr);
817 subsystem_unregister(&ecryptfs_subsys);
818 unregister_filesystem(&ecryptfs_fs_type);
819 ecryptfs_free_kmem_caches();
820}
821
822MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>");
823MODULE_DESCRIPTION("eCryptfs");
824
825MODULE_LICENSE("GPL");
826
827module_init(ecryptfs_init)
828module_exit(ecryptfs_exit)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
new file mode 100644
index 000000000000..924dd90a4cf5
--- /dev/null
+++ b/fs/ecryptfs/mmap.c
@@ -0,0 +1,788 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 * This is where eCryptfs coordinates the symmetric encryption and
4 * decryption of the file data as it passes between the lower
5 * encrypted file and the upper decrypted file.
6 *
7 * Copyright (C) 1997-2003 Erez Zadok
8 * Copyright (C) 2001-2003 Stony Brook University
9 * Copyright (C) 2004-2006 International Business Machines Corp.
10 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful, but
18 * WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
25 * 02111-1307, USA.
26 */
27
28#include <linux/pagemap.h>
29#include <linux/writeback.h>
30#include <linux/page-flags.h>
31#include <linux/mount.h>
32#include <linux/file.h>
33#include <linux/crypto.h>
34#include <linux/scatterlist.h>
35#include "ecryptfs_kernel.h"
36
37struct kmem_cache *ecryptfs_lower_page_cache;
38
39/**
40 * ecryptfs_get1page
41 *
42 * Get one page from the page cache or the lower f/s; return an error
43 * pointer otherwise.
44 *
45 * Returns an unlocked, up-to-date page (if ok) with its refcount incremented.
46 */
47static struct page *ecryptfs_get1page(struct file *file, int index)
48{
49 struct page *page;
50 struct dentry *dentry;
51 struct inode *inode;
52 struct address_space *mapping;
53
54 dentry = file->f_dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_cache_page(mapping, index,
58 (filler_t *)mapping->a_ops->readpage,
59 (void *)file);
60 if (IS_ERR(page))
61 goto out;
62 wait_on_page_locked(page);
63out:
64 return page;
65}
66
67static
68int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros);
69
70/**
71 * ecryptfs_fill_zeros
72 * @file: The ecryptfs file
73 * @new_length: The new length of the data in the underlying file;
74 * everything between the prior end of the file and the
75 * new end of the file will be filled with zeros.
76 * new_length must be greater than the current length.
77 *
78 * Function for handling lseek-ing past the end of the file.
79 *
80 * This function does not support shrinking, only growing a file.
81 *
82 * Returns zero on success; non-zero otherwise.
83 */
84int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
85{
86 int rc = 0;
87 struct dentry *dentry = file->f_dentry;
88 struct inode *inode = dentry->d_inode;
89 pgoff_t old_end_page_index = 0;
90 pgoff_t index = old_end_page_index;
91 int old_end_pos_in_page = -1;
92 pgoff_t new_end_page_index;
93 int new_end_pos_in_page;
94 loff_t cur_length = i_size_read(inode);
95
96 if (cur_length != 0) {
97 index = old_end_page_index =
98 ((cur_length - 1) >> PAGE_CACHE_SHIFT);
99 old_end_pos_in_page = ((cur_length - 1) & ~PAGE_CACHE_MASK);
100 }
101 new_end_page_index = ((new_length - 1) >> PAGE_CACHE_SHIFT);
102 new_end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK);
103 ecryptfs_printk(KERN_DEBUG, "old_end_page_index = [0x%.16x]; "
104 "old_end_pos_in_page = [%d]; "
105 "new_end_page_index = [0x%.16x]; "
106 "new_end_pos_in_page = [%d]\n",
107 old_end_page_index, old_end_pos_in_page,
108 new_end_page_index, new_end_pos_in_page);
109 if (old_end_page_index == new_end_page_index) {
110 /* Start and end are in the same page; we just need to
111 * set a portion of the existing page to zeros */
112 rc = write_zeros(file, index, (old_end_pos_in_page + 1),
113 (new_end_pos_in_page - old_end_pos_in_page));
114 if (rc)
115 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
116 "index=[0x%.16x], "
117 "old_end_pos_in_page=[%d], "
118 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
119 "=[%d]"
120 ")=[%d]) returned [%d]\n", file, index,
121 old_end_pos_in_page,
122 new_end_pos_in_page,
123 (PAGE_CACHE_SIZE - new_end_pos_in_page),
124 rc);
125 goto out;
126 }
127 /* Fill the remainder of the previous last page with zeros */
128 rc = write_zeros(file, index, (old_end_pos_in_page + 1),
129 ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page));
130 if (rc) {
131 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
132 "index=[0x%.16x], old_end_pos_in_page=[%d], "
133 "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[%d]) "
134 "returned [%d]\n", file, index,
135 old_end_pos_in_page,
136 (PAGE_CACHE_SIZE - old_end_pos_in_page), rc);
137 goto out;
138 }
139 index++;
140 while (index < new_end_page_index) {
141 /* Fill all intermediate pages with zeros */
142 rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE);
143 if (rc) {
144 ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
145 "index=[0x%.16x], "
146 "old_end_pos_in_page=[%d], "
147 "(PAGE_CACHE_SIZE - new_end_pos_in_page"
148 "=[%d]"
149 ")=[%d]) returned [%d]\n", file, index,
150 old_end_pos_in_page,
151 new_end_pos_in_page,
152 (PAGE_CACHE_SIZE - new_end_pos_in_page),
153 rc);
154 goto out;
155 }
156 index++;
157 }
158 /* Fill the portion at the beginning of the last new page with
159 * zero's */
160 rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1));
161 if (rc) {
162 ecryptfs_printk(KERN_ERR, "write_zeros(file="
163 "[%p], index=[0x%.16x], 0, "
164 "new_end_pos_in_page=[%d]) "
165 "returned [%d]\n", file, index,
166 new_end_pos_in_page, rc);
167 goto out;
168 }
169out:
170 return rc;
171}
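The index/offset arithmetic above can be sanity-checked with a worked example, assuming 4096-byte pages (PAGE_CACHE_SHIFT == 12, so ~PAGE_CACHE_MASK == 4095):

        /* For a file length of 5000 bytes, the last byte is offset 4999 */
        pgoff_t last_index = (5000 - 1) >> 12;   /* == 1 (second page)   */
        int last_pos       = (5000 - 1) & 4095;  /* == 903 within page 1 */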
172
173/**
174 * ecryptfs_writepage
175 * @page: Page that is locked before this call is made
176 *
177 * Returns zero on success; non-zero otherwise
178 */
179static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
180{
181 struct ecryptfs_page_crypt_context ctx;
182 int rc;
183
184 ctx.page = page;
185 ctx.mode = ECRYPTFS_WRITEPAGE_MODE;
186 ctx.param.wbc = wbc;
187 rc = ecryptfs_encrypt_page(&ctx);
188 if (rc) {
189 ecryptfs_printk(KERN_WARNING, "Error encrypting "
190 "page (upper index [0x%.16x])\n", page->index);
191 ClearPageUptodate(page);
192 goto out;
193 }
194 SetPageUptodate(page);
195 unlock_page(page);
196out:
197 return rc;
198}
199
200/**
201 * ecryptfs_do_readpage
202 * @page: Page to fill
203 * @lower_page_index: Index of the page in the lower file to get
204 *
205 * Reads the data from the lower file at lower_page_index into page.
206 */
207int ecryptfs_do_readpage(struct file *file, struct page *page,
208 pgoff_t lower_page_index)
209{
210 int rc;
211 struct dentry *dentry;
212 struct file *lower_file;
213 struct dentry *lower_dentry;
214 struct inode *inode;
215 struct inode *lower_inode;
216 char *page_data;
217 struct page *lower_page = NULL;
218 char *lower_page_data;
219 const struct address_space_operations *lower_a_ops;
220
221 dentry = file->f_dentry;
222 lower_file = ecryptfs_file_to_lower(file);
223 lower_dentry = ecryptfs_dentry_to_lower(dentry);
224 inode = dentry->d_inode;
225 lower_inode = ecryptfs_inode_to_lower(inode);
226 lower_a_ops = lower_inode->i_mapping->a_ops;
227 lower_page = read_cache_page(lower_inode->i_mapping, lower_page_index,
228 (filler_t *)lower_a_ops->readpage,
229 (void *)lower_file);
230 if (IS_ERR(lower_page)) {
231 rc = PTR_ERR(lower_page);
232 lower_page = NULL;
233 ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
234 goto out;
235 }
236 wait_on_page_locked(lower_page);
237 page_data = (char *)kmap(page);
238 if (!page_data) {
239 rc = -ENOMEM;
240 ecryptfs_printk(KERN_ERR, "Error mapping page\n");
241 goto out;
242 }
243 lower_page_data = (char *)kmap(lower_page);
244 if (!lower_page_data) {
245 rc = -ENOMEM;
246 ecryptfs_printk(KERN_ERR, "Error mapping page\n");
247 kunmap(page);
248 goto out;
249 }
250 memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
251 kunmap(lower_page);
252 kunmap(page);
253 rc = 0;
254out:
255 if (likely(lower_page))
256 page_cache_release(lower_page);
257 if (rc == 0)
258 SetPageUptodate(page);
259 else
260 ClearPageUptodate(page);
261 return rc;
262}
263
264/**
265 * ecryptfs_readpage
266 * @file: This is an ecryptfs file
267 * @page: ecryptfs associated page to stick the read data into
268 *
269 * Read in a page, decrypting if necessary.
270 *
271 * Returns zero on success; non-zero on error.
272 */
273static int ecryptfs_readpage(struct file *file, struct page *page)
274{
275 int rc = 0;
276 struct ecryptfs_crypt_stat *crypt_stat;
277
278 BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode));
279 crypt_stat =
280 &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
281 if (!crypt_stat
282 || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)
283 || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
284 ecryptfs_printk(KERN_DEBUG,
285 "Passing through unencrypted page\n");
286 rc = ecryptfs_do_readpage(file, page, page->index);
287 if (rc) {
288 ecryptfs_printk(KERN_ERR, "Error reading page; rc = "
289 "[%d]\n", rc);
290 goto out;
291 }
292 } else {
293 rc = ecryptfs_decrypt_page(file, page);
294 if (rc) {
296 ecryptfs_printk(KERN_ERR, "Error decrypting page; "
297 "rc = [%d]\n", rc);
298 goto out;
299 }
300 }
301 SetPageUptodate(page);
302out:
303 if (rc)
304 ClearPageUptodate(page);
305 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
306 page->index);
307 unlock_page(page);
308 return rc;
309}
310
311static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
312{
313 struct inode *inode = page->mapping->host;
314 int end_byte_in_page;
315 int rc = 0;
316 char *page_virt;
317
318 if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) {
319 end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
320 if (to > end_byte_in_page)
321 end_byte_in_page = to;
322 page_virt = kmap(page);
323 if (!page_virt) {
324 rc = -ENOMEM;
325 ecryptfs_printk(KERN_WARNING,
326 "Could not map page\n");
327 goto out;
328 }
329 memset((page_virt + end_byte_in_page), 0,
330 (PAGE_CACHE_SIZE - end_byte_in_page));
331 kunmap(page);
332 }
333out:
334 return rc;
335}
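A worked example of the zero-fill above, assuming PAGE_CACHE_SIZE == 4096: for i_size == 8200, only page index 2 (8200 / 4096) is the final page, end_byte_in_page == 8200 % 4096 == 8, and bytes 8..4095 of that page are zeroed, unless the write end `to' already reaches past byte 8, in which case zeroing starts at `to' instead.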
336
337static int ecryptfs_prepare_write(struct file *file, struct page *page,
338 unsigned from, unsigned to)
339{
340 int rc = 0;
341
342 kmap(page);
343 if (from == 0 && to == PAGE_CACHE_SIZE)
344 goto out; /* If we are writing a full page, it will be
345 up to date. */
346 if (!PageUptodate(page))
347 rc = ecryptfs_do_readpage(file, page, page->index);
348out:
349 return rc;
350}
351
352int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
353 char **lower_virt,
354 struct inode *lower_inode,
355 unsigned long lower_page_index)
356{
357 int rc = 0;
358
359 (*lower_page) = grab_cache_page(lower_inode->i_mapping,
360 lower_page_index);
361 if (!(*lower_page)) {
362 ecryptfs_printk(KERN_ERR, "grab_cache_page for "
363 "lower_page_index = [0x%.16x] failed\n",
364 lower_page_index);
365 rc = -EINVAL;
366 goto out;
367 }
368 if (lower_virt)
369 (*lower_virt) = kmap((*lower_page));
370 else
371 kmap((*lower_page));
372out:
373 return rc;
374}
375
376int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
377 struct inode *lower_inode,
378 struct writeback_control *wbc)
379{
380 int rc = 0;
381
382 rc = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
383 if (rc) {
384 ecryptfs_printk(KERN_ERR, "Error calling lower writepage(); "
385 "rc = [%d]\n", rc);
386 goto out;
387 }
388 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
389 page_cache_release(lower_page);
390out:
391 return rc;
392}
393
394static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page)
395{
396 kunmap(lower_page);
397 ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = "
398 "[0x%.16x]\n", lower_page->index);
399 unlock_page(lower_page);
400 page_cache_release(lower_page);
401}
402
403/**
404 * ecryptfs_write_inode_size_to_header
405 *
406 * Writes the inode size into the first 8 bytes of the lower file's header.
407 *
408 * Returns zero on success; non-zero on error.
409 */
410int
411ecryptfs_write_inode_size_to_header(struct file *lower_file,
412 struct inode *lower_inode,
413 struct inode *inode)
414{
415 int rc = 0;
416 struct page *header_page;
417 char *header_virt;
418 const struct address_space_operations *lower_a_ops;
419 u64 file_size;
420
421 rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt,
422 lower_inode, 0);
423 if (rc) {
424 ecryptfs_printk(KERN_ERR, "grab_cache_page for header page "
425 "failed\n");
426 goto out;
427 }
428 lower_a_ops = lower_inode->i_mapping->a_ops;
429 rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8);
430 file_size = (u64)i_size_read(inode);
431 ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size);
432 file_size = cpu_to_be64(file_size);
433 memcpy(header_virt, &file_size, sizeof(u64));
434 rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8);
435 if (rc < 0)
436 ecryptfs_printk(KERN_ERR, "Error committing header page "
437 "write\n");
438 ecryptfs_unmap_and_release_lower_page(header_page);
439 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
440 mark_inode_dirty_sync(inode);
441out:
442 return rc;
443}
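The size field written above is a big-endian u64 occupying the first 8 bytes of the lower file; a reader recovers it with the inverse conversion, as in this sketch (header_virt is assumed to point at the mapped header page):

        u64 file_size;
        memcpy(&file_size, header_virt, sizeof(u64));
        file_size = be64_to_cpu(file_size);     /* undo the cpu_to_be64() */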
444
445int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
446 struct file *lower_file,
447 unsigned long lower_page_index, int byte_offset,
448 int region_bytes)
449{
450 int rc = 0;
451
452 rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode,
453 lower_page_index);
454 if (rc) {
455 ecryptfs_printk(KERN_ERR, "Error attempting to grab and map "
456 "lower page with index [0x%.16x]\n",
457 lower_page_index);
458 goto out;
459 }
460 rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file,
461 (*lower_page),
462 byte_offset,
463 region_bytes);
464 if (rc) {
465 ecryptfs_printk(KERN_ERR, "prepare_write for "
466 "lower_page_index = [0x%.16x] failed; rc = "
467 "[%d]\n", lower_page_index, rc);
468 }
469out:
470 if (rc && (*lower_page)) {
471 ecryptfs_unmap_and_release_lower_page(*lower_page);
472 (*lower_page) = NULL;
473 }
474 return rc;
475}
476
477/**
478 * ecryptfs_commit_lower_page
479 *
480 * Returns zero on success; non-zero on error
481 */
482int
483ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
484 struct file *lower_file, int byte_offset,
485 int region_size)
486{
487 int rc = 0;
488
489 rc = lower_inode->i_mapping->a_ops->commit_write(
490 lower_file, lower_page, byte_offset, region_size);
491 if (rc < 0) {
492 ecryptfs_printk(KERN_ERR,
493 "Error committing write; rc = [%d]\n", rc);
494 } else
495 rc = 0;
496 ecryptfs_unmap_and_release_lower_page(lower_page);
497 return rc;
498}
499
500/**
501 * ecryptfs_copy_page_to_lower
502 *
503 * Used for plaintext pass-through; no page index interpolation
504 * required.
505 */
506int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
507 struct file *lower_file)
508{
509 int rc = 0;
510 struct page *lower_page;
511
512 rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file,
513 page->index, 0, PAGE_CACHE_SIZE);
514 if (rc) {
515 ecryptfs_printk(KERN_ERR, "Error attempting to get page "
516 "at index [0x%.16x]\n", page->index);
517 goto out;
518 }
519 /* TODO: aops */
520 memcpy((char *)page_address(lower_page), page_address(page),
521 PAGE_CACHE_SIZE);
522 rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file,
523 0, PAGE_CACHE_SIZE);
524 if (rc)
525 ecryptfs_printk(KERN_ERR, "Error attempting to commit page "
526 "at index [0x%.16x]\n", page->index);
527out:
528 return rc;
529}
530
531static int
532process_new_file(struct ecryptfs_crypt_stat *crypt_stat,
533 struct file *file, struct inode *inode)
534{
535 struct page *header_page;
536 const struct address_space_operations *lower_a_ops;
537 struct inode *lower_inode;
538 struct file *lower_file;
539 char *header_virt;
540 int rc = 0;
541 int current_header_page = 0;
542 int header_pages;
543 int more_header_data_to_be_written = 1;
544
545 lower_inode = ecryptfs_inode_to_lower(inode);
546 lower_file = ecryptfs_file_to_lower(file);
547 lower_a_ops = lower_inode->i_mapping->a_ops;
548 header_pages = ((crypt_stat->header_extent_size
549 * crypt_stat->num_header_extents_at_front)
550 / PAGE_CACHE_SIZE);
551 BUG_ON(header_pages < 1);
552 while (current_header_page < header_pages) {
553 rc = ecryptfs_grab_and_map_lower_page(&header_page,
554 &header_virt,
555 lower_inode,
556 current_header_page);
557 if (rc) {
558 ecryptfs_printk(KERN_ERR, "grab_cache_page for "
559 "header page [%d] failed; rc = [%d]\n",
560 current_header_page, rc);
561 goto out;
562 }
563 rc = lower_a_ops->prepare_write(lower_file, header_page, 0,
564 PAGE_CACHE_SIZE);
565 if (rc) {
566 ecryptfs_printk(KERN_ERR, "Error preparing to write "
567 "header page out; rc = [%d]\n", rc);
568 goto out;
569 }
570 memset(header_virt, 0, PAGE_CACHE_SIZE);
571 if (more_header_data_to_be_written) {
572 rc = ecryptfs_write_headers_virt(header_virt,
573 crypt_stat,
574 file->f_dentry);
575 if (rc) {
576 ecryptfs_printk(KERN_WARNING, "Error "
577 "generating header; rc = "
578 "[%d]\n", rc);
579 rc = -EIO;
580 memset(header_virt, 0, PAGE_CACHE_SIZE);
581 ecryptfs_unmap_and_release_lower_page(
582 header_page);
583 goto out;
584 }
585 if (current_header_page == 0)
586 memset(header_virt, 0, 8);
587 more_header_data_to_be_written = 0;
588 }
589 rc = lower_a_ops->commit_write(lower_file, header_page, 0,
590 PAGE_CACHE_SIZE);
591 ecryptfs_unmap_and_release_lower_page(header_page);
592 if (rc < 0) {
593 ecryptfs_printk(KERN_ERR,
594 "Error committing header page write; "
595 "rc = [%d]\n", rc);
596 break;
597 }
598 current_header_page++;
599 }
600 if (rc >= 0) {
601 rc = 0;
602 ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = "
603 "[0x%.16x]\n", lower_inode->i_blocks);
604 i_size_write(inode, 0);
605 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
606 mark_inode_dirty_sync(inode);
607 }
608 ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in "
609 "crypt_stat at memory location [%p]\n", crypt_stat);
610 ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
611out:
612 return rc;
613}
614
615/**
616 * ecryptfs_commit_write
617 * @file: The eCryptfs file object
618 * @page: The eCryptfs page
619 * @from: Ignored (we rotate the page IV on each write)
620 * @to: Ignored
621 *
622 * This is where we encrypt the data and pass the encrypted data to
623 * the lower filesystem. In OpenPGP-compatible mode, we operate on
624 * entire underlying packets.
625 */
626static int ecryptfs_commit_write(struct file *file, struct page *page,
627 unsigned from, unsigned to)
628{
629 struct ecryptfs_page_crypt_context ctx;
630 loff_t pos;
631 struct inode *inode;
632 struct inode *lower_inode;
633 struct file *lower_file;
634 struct ecryptfs_crypt_stat *crypt_stat;
635 int rc;
636
637 inode = page->mapping->host;
638 lower_inode = ecryptfs_inode_to_lower(inode);
639 lower_file = ecryptfs_file_to_lower(file);
640 mutex_lock(&lower_inode->i_mutex);
641 crypt_stat =
642 &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
643 if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
644 ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
645 "crypt_stat at memory location [%p]\n", crypt_stat);
646 rc = process_new_file(crypt_stat, file, inode);
647 if (rc) {
648 ecryptfs_printk(KERN_ERR, "Error processing new "
649 "file; rc = [%d]\n", rc);
650 goto out;
651 }
652 } else
653 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
654 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
655 "(page w/ index = [0x%.16x], to = [%d])\n", page->index,
656 to);
657 rc = fill_zeros_to_end_of_page(page, to);
658 if (rc) {
659 ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
660 "zeros in page with index = [0x%.16x]\n",
661 page->index);
662 goto out;
663 }
664 ctx.page = page;
665 ctx.mode = ECRYPTFS_PREPARE_COMMIT_MODE;
666 ctx.param.lower_file = lower_file;
667 rc = ecryptfs_encrypt_page(&ctx);
668 if (rc) {
669 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
670 "index [0x%.16x])\n", page->index);
671 goto out;
672 }
673 rc = 0;
674 inode->i_blocks = lower_inode->i_blocks;
675 pos = (page->index << PAGE_CACHE_SHIFT) + to;
676 if (pos > i_size_read(inode)) {
677 i_size_write(inode, pos);
678 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
679 "[0x%.16x]\n", i_size_read(inode));
680 }
681 ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
682 lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
683 mark_inode_dirty_sync(inode);
684out:
685 kunmap(page); /* mapped in prior call (prepare_write) */
686 if (rc < 0)
687 ClearPageUptodate(page);
688 else
689 SetPageUptodate(page);
690 mutex_unlock(&lower_inode->i_mutex);
691 return rc;
692}
693
694/**
695 * write_zeros
696 * @file: The ecryptfs file
697 * @index: The index in which we are writing
698 * @start: The position after the last block of data
699 * @num_zeros: The number of zeros to write
700 *
701 * Write a specified number of zeros to a page.
702 *
703 * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE
704 */
705static
706int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
707{
708 int rc = 0;
709 struct page *tmp_page;
710
711 tmp_page = ecryptfs_get1page(file, index);
712 if (IS_ERR(tmp_page)) {
713 ecryptfs_printk(KERN_ERR, "Error getting page at index "
714 "[0x%.16x]\n", index);
715 rc = PTR_ERR(tmp_page);
716 goto out;
717 }
718 kmap(tmp_page);
719 rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros);
720 if (rc) {
721 ecryptfs_printk(KERN_ERR, "Error preparing to write zeros "
722 "to remainder of page at index [0x%.16x]\n",
723 index);
724 kunmap(tmp_page);
725 page_cache_release(tmp_page);
726 goto out;
727 }
728 memset(((char *)page_address(tmp_page) + start), 0, num_zeros);
729 rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros);
730 if (rc < 0) {
731 ecryptfs_printk(KERN_ERR, "Error attempting to write zeros "
732 "to remainder of page at index [0x%.16x]\n",
733 index);
734 kunmap(tmp_page);
735 page_cache_release(tmp_page);
736 goto out;
737 }
738 rc = 0;
739 kunmap(tmp_page);
740 page_cache_release(tmp_page);
741out:
742 return rc;
743}
744
745static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
746{
747 int rc = 0;
748 struct inode *inode;
749 struct inode *lower_inode;
750
751 inode = (struct inode *)mapping->host;
752 lower_inode = ecryptfs_inode_to_lower(inode);
753 if (lower_inode->i_mapping->a_ops->bmap)
754 rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping,
755 block);
756 return rc;
757}
758
759static void ecryptfs_sync_page(struct page *page)
760{
761 struct inode *inode;
762 struct inode *lower_inode;
763 struct page *lower_page;
764
765 inode = page->mapping->host;
766 lower_inode = ecryptfs_inode_to_lower(inode);
767 /* NOTE: find_lock_page() recently replaced grab_cache_page() here,
768 * since sync_page() just makes sure that pending I/O gets done. */
769 lower_page = find_lock_page(lower_inode->i_mapping, page->index);
770 if (!lower_page) {
771 ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n");
772 return;
773 }
774 lower_page->mapping->a_ops->sync_page(lower_page);
775 ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
776 lower_page->index);
777 unlock_page(lower_page);
778 page_cache_release(lower_page);
779}
780
781struct address_space_operations ecryptfs_aops = {
782 .writepage = ecryptfs_writepage,
783 .readpage = ecryptfs_readpage,
784 .prepare_write = ecryptfs_prepare_write,
785 .commit_write = ecryptfs_commit_write,
786 .bmap = ecryptfs_bmap,
787 .sync_page = ecryptfs_sync_page,
788};
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
new file mode 100644
index 000000000000..c337c0410fb1
--- /dev/null
+++ b/fs/ecryptfs/super.c
@@ -0,0 +1,198 @@
1/**
2 * eCryptfs: Linux filesystem encryption layer
3 *
4 * Copyright (C) 1997-2003 Erez Zadok
5 * Copyright (C) 2001-2003 Stony Brook University
6 * Copyright (C) 2004-2006 International Business Machines Corp.
7 * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
8 * Michael C. Thompson <mcthomps@us.ibm.com>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/mount.h>
28#include <linux/key.h>
29#include <linux/seq_file.h>
30#include <linux/crypto.h>
31#include "ecryptfs_kernel.h"
32
33struct kmem_cache *ecryptfs_inode_info_cache;
34
35/**
36 * ecryptfs_alloc_inode - allocate an ecryptfs inode
37 * @sb: Pointer to the ecryptfs super block
38 *
39 * Called to bring an inode into existence.
40 *
41 * Only handles allocation; setting up structures should be done in
42 * ecryptfs_read_inode. This is because the kernel, between now and
43 * then, will zero out the private data pointer.
44 *
45 * Returns a pointer to a newly allocated inode, NULL otherwise
46 */
47static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
48{
49 struct ecryptfs_inode_info *ecryptfs_inode;
50 struct inode *inode = NULL;
51
52 ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache,
53 SLAB_KERNEL);
54 if (unlikely(!ecryptfs_inode))
55 goto out;
56 ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat);
57 inode = &ecryptfs_inode->vfs_inode;
58out:
59 return inode;
60}
61
62/**
63 * ecryptfs_destroy_inode
64 * @inode: The ecryptfs inode
65 *
66 * This is used during the final destruction of the inode.
67 * All allocation of memory related to the inode, including allocated
68 * memory in the crypt_stat struct, will be released here.
69 * There should be no chance that this deallocation will be missed.
70 */
71static void ecryptfs_destroy_inode(struct inode *inode)
72{
73 struct ecryptfs_inode_info *inode_info;
74
75 inode_info = ecryptfs_inode_to_private(inode);
76 ecryptfs_destruct_crypt_stat(&inode_info->crypt_stat);
77 kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
78}
79
80/**
81 * ecryptfs_init_inode
82 * @inode: The ecryptfs inode
83 *
84 * Set up the ecryptfs inode.
85 */
86void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
87{
88 ecryptfs_set_inode_lower(inode, lower_inode);
89 inode->i_ino = lower_inode->i_ino;
90 inode->i_version++;
91 inode->i_op = &ecryptfs_main_iops;
92 inode->i_fop = &ecryptfs_main_fops;
93 inode->i_mapping->a_ops = &ecryptfs_aops;
94}
95
96/**
97 * ecryptfs_put_super
98 * @sb: Pointer to the ecryptfs super block
99 *
100 * Final actions when unmounting a file system.
101 * This will handle deallocation and release of our private data.
102 */
103static void ecryptfs_put_super(struct super_block *sb)
104{
105 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
106
107 ecryptfs_destruct_mount_crypt_stat(&sb_info->mount_crypt_stat);
108 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
109 ecryptfs_set_superblock_private(sb, NULL);
110}
111
112/**
113 * ecryptfs_statfs
114 * @sb: The ecryptfs super block
115 * @buf: The struct kstatfs to fill in with stats
116 *
117 * Get the filesystem statistics. Currently, we let this pass right through
118 * to the lower filesystem and take no action ourselves.
119 */
120static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
121{
122 return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
123}
124
125/**
126 * ecryptfs_clear_inode
127 * @inode: The ecryptfs inode
128 *
129 * Called by iput() when the inode reference count reaches zero
130 * and the inode is not hashed anywhere. Used to clear anything
131 * that needs to be, before the inode is completely destroyed and put
132 * on the inode free list. We use this to drop our reference to the
133 * lower inode.
134 */
135static void ecryptfs_clear_inode(struct inode *inode)
136{
137 iput(ecryptfs_inode_to_lower(inode));
138}
139
140/**
141 * ecryptfs_umount_begin
142 *
143 * Called in do_umount().
144 */
145static void ecryptfs_umount_begin(struct vfsmount *vfsmnt, int flags)
146{
147 struct vfsmount *lower_mnt =
148 ecryptfs_dentry_to_lower_mnt(vfsmnt->mnt_sb->s_root);
149 struct super_block *lower_sb;
150
151 mntput(lower_mnt);
152 lower_sb = lower_mnt->mnt_sb;
153 if (lower_sb->s_op->umount_begin)
154 lower_sb->s_op->umount_begin(lower_mnt, flags);
155}
156
157/**
158 * ecryptfs_show_options
159 *
160 * Prints the directory we are currently mounted over.
161 * Returns zero on success; non-zero otherwise
162 */
163static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
164{
165 struct super_block *sb = mnt->mnt_sb;
166 struct dentry *lower_root_dentry = ecryptfs_dentry_to_lower(sb->s_root);
167 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(sb->s_root);
168 char *tmp_page;
169 char *path;
170 int rc = 0;
171
172 tmp_page = (char *)__get_free_page(GFP_KERNEL);
173 if (!tmp_page) {
174 rc = -ENOMEM;
175 goto out;
176 }
177 path = d_path(lower_root_dentry, lower_mnt, tmp_page, PAGE_SIZE);
178 if (IS_ERR(path)) {
179 rc = PTR_ERR(path);
180 goto out;
181 }
182 seq_printf(m, ",dir=%s", path);
183 free_page((unsigned long)tmp_page);
184out:
185 return rc;
186}
187
188struct super_operations ecryptfs_sops = {
189 .alloc_inode = ecryptfs_alloc_inode,
190 .destroy_inode = ecryptfs_destroy_inode,
191 .drop_inode = generic_delete_inode,
192 .put_super = ecryptfs_put_super,
193 .statfs = ecryptfs_statfs,
194 .remount_fs = NULL,
195 .clear_inode = ecryptfs_clear_inode,
196 .umount_begin = ecryptfs_umount_begin,
197 .show_options = ecryptfs_show_options
198};
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 557d5b614fae..ae228ec54e94 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -105,6 +105,8 @@
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
 
 struct epoll_filefd {
 	struct file *file;
@@ -497,7 +499,7 @@ void eventpoll_release_file(struct file *file)
  */
 asmlinkage long sys_epoll_create(int size)
 {
-	int error, fd;
+	int error, fd = -1;
 	struct eventpoll *ep;
 	struct inode *inode;
 	struct file *file;
@@ -640,7 +642,6 @@ eexit_1:
 	return error;
 }
 
-#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
@@ -657,7 +658,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
 		     current, epfd, events, maxevents, timeout));
 
 	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > MAX_EVENTS)
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
@@ -699,6 +700,55 @@ eexit_1:
 }
 
 
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+		int maxevents, int timeout, const sigset_t __user *sigmask,
+		size_t sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
+
 /*
  * Creates the file descriptor to be used by the epoll interface.
  */
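For context, the added syscall backs the userspace epoll_pwait(2) interface; a minimal usage sketch follows (epfd is assumed to come from a successful epoll_create() call):

        #include <sys/epoll.h>
        #include <signal.h>

        struct epoll_event events[16];
        sigset_t mask;

        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);
        /* Block SIGINT atomically for the duration of the wait only */
        int n = epoll_pwait(epfd, events, 16, -1 /* no timeout */, &mask);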
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 513cd421ac0b..d8b9abd95d07 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -364,7 +364,6 @@ static int parse_options (char * options,
 {
 	char * p;
 	substring_t args[MAX_OPT_ARGS];
-	unsigned long kind = EXT2_MOUNT_ERRORS_CONT;
 	int option;
 
 	if (!options)
@@ -404,13 +403,19 @@ static int parse_options (char * options,
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			kind = EXT2_MOUNT_ERRORS_PANIC;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			kind = EXT2_MOUNT_ERRORS_RO;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			kind = EXT2_MOUNT_ERRORS_CONT;
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
 			break;
 		case Opt_nouid32:
 			set_opt (sbi->s_mount_opt, NO_UID32);
@@ -489,7 +494,6 @@ static int parse_options (char * options,
 			return 0;
 		}
 	}
-	sbi->s_mount_opt |= kind;
 	return 1;
 }
 
@@ -715,6 +719,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
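The effect of this change is that the three error-behavior mount options now explicitly clear one another, so the last one given wins and the flags can never be set simultaneously. With ext2's usual option names (errors=continue, errors=remount-ro and errors=panic; assumed here, since the match table is outside this hunk), a mount such as the following ends up with only ERRORS_PANIC set:

        mount -t ext2 -o errors=continue,errors=panic /dev/sdb1 /mnt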
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8bfd56ef18ca..afc2d4f42d77 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1470,6 +1470,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the linux ext4-filesystem routines.
3#
4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
9
10ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
12ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
1/*
2 * linux/fs/ext4/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15#include "acl.h"
16
17/*
18 * Convert from filesystem to in-memory representation.
19 */
20static struct posix_acl *
21ext4_acl_from_disk(const void *value, size_t size)
22{
23 const char *end = (char *)value + size;
24 int n, count;
25 struct posix_acl *acl;
26
27 if (!value)
28 return NULL;
29 if (size < sizeof(ext4_acl_header))
30 return ERR_PTR(-EINVAL);
31 if (((ext4_acl_header *)value)->a_version !=
32 cpu_to_le32(EXT4_ACL_VERSION))
33 return ERR_PTR(-EINVAL);
34 value = (char *)value + sizeof(ext4_acl_header);
35 count = ext4_acl_count(size);
36 if (count < 0)
37 return ERR_PTR(-EINVAL);
38 if (count == 0)
39 return NULL;
40 acl = posix_acl_alloc(count, GFP_KERNEL);
41 if (!acl)
42 return ERR_PTR(-ENOMEM);
43 for (n=0; n < count; n++) {
44 ext4_acl_entry *entry =
45 (ext4_acl_entry *)value;
46 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
47 goto fail;
48 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
49 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
50 switch(acl->a_entries[n].e_tag) {
51 case ACL_USER_OBJ:
52 case ACL_GROUP_OBJ:
53 case ACL_MASK:
54 case ACL_OTHER:
55 value = (char *)value +
56 sizeof(ext4_acl_entry_short);
57 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
58 break;
59
60 case ACL_USER:
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext4_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_id =
66 le32_to_cpu(entry->e_id);
67 break;
68
69 default:
70 goto fail;
71 }
72 }
73 if (value != end)
74 goto fail;
75 return acl;
76
77fail:
78 posix_acl_release(acl);
79 return ERR_PTR(-EINVAL);
80}
81
82/*
83 * Convert from in-memory to filesystem representation.
84 */
85static void *
86ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
87{
88 ext4_acl_header *ext_acl;
89 char *e;
90 size_t n;
91
92 *size = ext4_acl_size(acl->a_count);
93 ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
94 sizeof(ext4_acl_entry), GFP_KERNEL);
95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
98 e = (char *)ext_acl + sizeof(ext4_acl_header);
99 for (n=0; n < acl->a_count; n++) {
100 ext4_acl_entry *entry = (ext4_acl_entry *)e;
101 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
102 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
103 switch(acl->a_entries[n].e_tag) {
104 case ACL_USER:
105 case ACL_GROUP:
106 entry->e_id =
107 cpu_to_le32(acl->a_entries[n].e_id);
108 e += sizeof(ext4_acl_entry);
109 break;
110
111 case ACL_USER_OBJ:
112 case ACL_GROUP_OBJ:
113 case ACL_MASK:
114 case ACL_OTHER:
115 e += sizeof(ext4_acl_entry_short);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 return (char *)ext_acl;
123
124fail:
125 kfree(ext_acl);
126 return ERR_PTR(-EINVAL);
127}
128
129static inline struct posix_acl *
130ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
131{
132 struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
133
134 spin_lock(&inode->i_lock);
135 if (*i_acl != EXT4_ACL_NOT_CACHED)
136 acl = posix_acl_dup(*i_acl);
137 spin_unlock(&inode->i_lock);
138
139 return acl;
140}
141
142static inline void
143ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
144 struct posix_acl *acl)
145{
146 spin_lock(&inode->i_lock);
147 if (*i_acl != EXT4_ACL_NOT_CACHED)
148 posix_acl_release(*i_acl);
149 *i_acl = posix_acl_dup(acl);
150 spin_unlock(&inode->i_lock);
151}
152
153/*
154 * Inode operation get_posix_acl().
155 *
156 * inode->i_mutex: don't care
157 */
158static struct posix_acl *
159ext4_get_acl(struct inode *inode, int type)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 int name_index;
163 char *value = NULL;
164 struct posix_acl *acl;
165 int retval;
166
167 if (!test_opt(inode->i_sb, POSIX_ACL))
168 return NULL;
169
170 switch(type) {
171 case ACL_TYPE_ACCESS:
172 acl = ext4_iget_acl(inode, &ei->i_acl);
173 if (acl != EXT4_ACL_NOT_CACHED)
174 return acl;
175 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
176 break;
177
178 case ACL_TYPE_DEFAULT:
179 acl = ext4_iget_acl(inode, &ei->i_default_acl);
180 if (acl != EXT4_ACL_NOT_CACHED)
181 return acl;
182 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
183 break;
184
185 default:
186 return ERR_PTR(-EINVAL);
187 }
188 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
189 if (retval > 0) {
190 value = kmalloc(retval, GFP_KERNEL);
191 if (!value)
192 return ERR_PTR(-ENOMEM);
193 retval = ext4_xattr_get(inode, name_index, "", value, retval);
194 }
195 if (retval > 0)
196 acl = ext4_acl_from_disk(value, retval);
197 else if (retval == -ENODATA || retval == -ENOSYS)
198 acl = NULL;
199 else
200 acl = ERR_PTR(retval);
201 kfree(value);
202
203 if (!IS_ERR(acl)) {
204 switch(type) {
205 case ACL_TYPE_ACCESS:
206 ext4_iset_acl(inode, &ei->i_acl, acl);
207 break;
208
209 case ACL_TYPE_DEFAULT:
210 ext4_iset_acl(inode, &ei->i_default_acl, acl);
211 break;
212 }
213 }
214 return acl;
215}
216
217/*
218 * Set the access or default ACL of an inode.
219 *
220 * inode->i_mutex: down unless called from ext4_new_inode
221 */
222static int
223ext4_set_acl(handle_t *handle, struct inode *inode, int type,
224 struct posix_acl *acl)
225{
226 struct ext4_inode_info *ei = EXT4_I(inode);
227 int name_index;
228 void *value = NULL;
229 size_t size = 0;
230 int error;
231
232 if (S_ISLNK(inode->i_mode))
233 return -EOPNOTSUPP;
234
235 switch(type) {
236 case ACL_TYPE_ACCESS:
237 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
238 if (acl) {
239 mode_t mode = inode->i_mode;
240 error = posix_acl_equiv_mode(acl, &mode);
241 if (error < 0)
242 return error;
243 else {
244 inode->i_mode = mode;
245 ext4_mark_inode_dirty(handle, inode);
246 if (error == 0)
247 acl = NULL;
248 }
249 }
250 break;
251
252 case ACL_TYPE_DEFAULT:
253 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
254 if (!S_ISDIR(inode->i_mode))
255 return acl ? -EACCES : 0;
256 break;
257
258 default:
259 return -EINVAL;
260 }
261 if (acl) {
262 value = ext4_acl_to_disk(acl, &size);
263 if (IS_ERR(value))
264 return (int)PTR_ERR(value);
265 }
266
267 error = ext4_xattr_set_handle(handle, inode, name_index, "",
268 value, size, 0);
269
270 kfree(value);
271 if (!error) {
272 switch(type) {
273 case ACL_TYPE_ACCESS:
274 ext4_iset_acl(inode, &ei->i_acl, acl);
275 break;
276
277 case ACL_TYPE_DEFAULT:
278 ext4_iset_acl(inode, &ei->i_default_acl, acl);
279 break;
280 }
281 }
282 return error;
283}
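/* Editor's sketch, not part of the patch: the posix_acl_equiv_mode()
 * contract relied on above -- negative errno on error, 0 when the ACL is
 * fully representable in the mode bits (so the xattr can be dropped),
 * positive when extended entries remain and the xattr must be kept. */
#include <linux/posix_acl.h>

static int equiv_mode_example(void)
{
	mode_t mode = 0;
	struct posix_acl *acl = posix_acl_from_mode(0644, GFP_KERNEL);
	int rc;

	if (!acl)
		return -ENOMEM;
	/* an ACL synthesized from 0644 has no named entries... */
	rc = posix_acl_equiv_mode(acl, &mode);	/* ...so rc == 0, mode == 0644 */
	posix_acl_release(acl);
	return rc;
}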
284
285static int
286ext4_check_acl(struct inode *inode, int mask)
287{
288 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
289
290 if (IS_ERR(acl))
291 return PTR_ERR(acl);
292 if (acl) {
293 int error = posix_acl_permission(inode, acl, mask);
294 posix_acl_release(acl);
295 return error;
296 }
297
298 return -EAGAIN;
299}
300
301int
302ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
303{
304 return generic_permission(inode, mask, ext4_check_acl);
305}
306
307/*
308 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
309 *
310 * dir->i_mutex: down
311 * inode->i_mutex: up (access to inode is still exclusive)
312 */
313int
314ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
315{
316 struct posix_acl *acl = NULL;
317 int error = 0;
318
319 if (!S_ISLNK(inode->i_mode)) {
320 if (test_opt(dir->i_sb, POSIX_ACL)) {
321 acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
322 if (IS_ERR(acl))
323 return PTR_ERR(acl);
324 }
325 if (!acl)
326 inode->i_mode &= ~current->fs->umask;
327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone;
330 mode_t mode;
331
332 if (S_ISDIR(inode->i_mode)) {
333 error = ext4_set_acl(handle, inode,
334 ACL_TYPE_DEFAULT, acl);
335 if (error)
336 goto cleanup;
337 }
338 clone = posix_acl_clone(acl, GFP_KERNEL);
339 error = -ENOMEM;
340 if (!clone)
341 goto cleanup;
342
343 mode = inode->i_mode;
344 error = posix_acl_create_masq(clone, &mode);
345 if (error >= 0) {
346 inode->i_mode = mode;
347 if (error > 0) {
348 /* This is an extended ACL */
349 error = ext4_set_acl(handle, inode,
350 ACL_TYPE_ACCESS, clone);
351 }
352 }
353 posix_acl_release(clone);
354 }
355cleanup:
356 posix_acl_release(acl);
357 return error;
358}
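/* Editor's note, not part of the patch: this function is why a default
 * ACL on the parent directory overrides the process umask. With umask
 * 022 and no default ACL, open(..., O_CREAT, 0666) yields 0644 through
 * the "inode->i_mode &= ~current->fs->umask" path above; if the parent
 * does carry a default ACL, posix_acl_create_masq() masks the mode
 * against that ACL instead and the umask is never applied, as POSIX
 * ACL semantics require. */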
359
360/*
361 * Does chmod for an inode that may have an Access Control List. The
362 * inode->i_mode field must be updated to the desired value by the caller
363 * before calling this function.
364 * Returns 0 on success, or a negative error number.
365 *
366 * We change the ACL rather than storing some ACL entries in the file
367 * mode permission bits (which would be more efficient), because that
368 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
369 * for directories) are added. There are no more bits available in the
370 * file mode.
371 *
372 * inode->i_mutex: down
373 */
374int
375ext4_acl_chmod(struct inode *inode)
376{
377 struct posix_acl *acl, *clone;
378 int error;
379
380 if (S_ISLNK(inode->i_mode))
381 return -EOPNOTSUPP;
382 if (!test_opt(inode->i_sb, POSIX_ACL))
383 return 0;
384 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
385 if (IS_ERR(acl) || !acl)
386 return PTR_ERR(acl);
387 clone = posix_acl_clone(acl, GFP_KERNEL);
388 posix_acl_release(acl);
389 if (!clone)
390 return -ENOMEM;
391 error = posix_acl_chmod_masq(clone, inode->i_mode);
392 if (!error) {
393 handle_t *handle;
394 int retries = 0;
395
396 retry:
397 handle = ext4_journal_start(inode,
398 EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
399 if (IS_ERR(handle)) {
400 error = PTR_ERR(handle);
401 ext4_std_error(inode->i_sb, error);
402 goto out;
403 }
404 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
405 ext4_journal_stop(handle);
406 if (error == -ENOSPC &&
407 ext4_should_retry_alloc(inode->i_sb, &retries))
408 goto retry;
409 }
410out:
411 posix_acl_release(clone);
412 return error;
413}
414
415/*
416 * Extended attribute handlers
417 */
418static size_t
419ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
420 const char *name, size_t name_len)
421{
422 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
423
424 if (!test_opt(inode->i_sb, POSIX_ACL))
425 return 0;
426 if (list && size <= list_len)
427 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
428 return size;
429}
430
431static size_t
432ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
433 const char *name, size_t name_len)
434{
435 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
436
437 if (!test_opt(inode->i_sb, POSIX_ACL))
438 return 0;
439 if (list && size <= list_len)
440 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
441 return size;
442}
443
444static int
445ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
446{
447 struct posix_acl *acl;
448 int error;
449
450 if (!test_opt(inode->i_sb, POSIX_ACL))
451 return -EOPNOTSUPP;
452
453 acl = ext4_get_acl(inode, type);
454 if (IS_ERR(acl))
455 return PTR_ERR(acl);
456 if (acl == NULL)
457 return -ENODATA;
458 error = posix_acl_to_xattr(acl, buffer, size);
459 posix_acl_release(acl);
460
461 return error;
462}
463
464static int
465ext4_xattr_get_acl_access(struct inode *inode, const char *name,
466 void *buffer, size_t size)
467{
468 if (strcmp(name, "") != 0)
469 return -EINVAL;
470 return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
471}
472
473static int
474ext4_xattr_get_acl_default(struct inode *inode, const char *name,
475 void *buffer, size_t size)
476{
477 if (strcmp(name, "") != 0)
478 return -EINVAL;
479 return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
480}
481
482static int
483ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
484 size_t size)
485{
486 handle_t *handle;
487 struct posix_acl *acl;
488 int error, retries = 0;
489
490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
493 return -EPERM;
494
495 if (value) {
496 acl = posix_acl_from_xattr(value, size);
497 if (IS_ERR(acl))
498 return PTR_ERR(acl);
499 else if (acl) {
500 error = posix_acl_valid(acl);
501 if (error)
502 goto release_and_out;
503 }
504 } else
505 acl = NULL;
506
507retry:
508 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
509 if (IS_ERR(handle))
510 return PTR_ERR(handle);
511 error = ext4_set_acl(handle, inode, type, acl);
512 ext4_journal_stop(handle);
513 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
514 goto retry;
515
516release_and_out:
517 posix_acl_release(acl);
518 return error;
519}
520
521static int
522ext4_xattr_set_acl_access(struct inode *inode, const char *name,
523 const void *value, size_t size, int flags)
524{
525 if (strcmp(name, "") != 0)
526 return -EINVAL;
527 return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
528}
529
530static int
531ext4_xattr_set_acl_default(struct inode *inode, const char *name,
532 const void *value, size_t size, int flags)
533{
534 if (strcmp(name, "") != 0)
535 return -EINVAL;
536 return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
537}
538
539struct xattr_handler ext4_xattr_acl_access_handler = {
540 .prefix = POSIX_ACL_XATTR_ACCESS,
541 .list = ext4_xattr_list_acl_access,
542 .get = ext4_xattr_get_acl_access,
543 .set = ext4_xattr_set_acl_access,
544};
545
546struct xattr_handler ext4_xattr_acl_default_handler = {
547 .prefix = POSIX_ACL_XATTR_DEFAULT,
548 .list = ext4_xattr_list_acl_default,
549 .get = ext4_xattr_get_acl_default,
550 .set = ext4_xattr_set_acl_default,
551};
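/* Editor's aside, not part of the patch: a userspace sketch (assuming
 * libacl) of what feeds these handlers. The named g:staff entry makes
 * the ACL "extended", so it is stored through the
 * system.posix_acl_access handler above rather than in i_mode alone. */
#include <sys/types.h>
#include <sys/acl.h>

static int set_example_acl(const char *path)
{
	acl_t acl = acl_from_text("u::rwx,g::r-x,o::---,g:staff:r-x,m::r-x");
	int rc = -1;

	if (acl) {
		rc = acl_set_file(path, ACL_TYPE_ACCESS, acl);
		acl_free(acl);
	}
	return rc;
}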
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
1/*
2 File: fs/ext4/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT4_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext4_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext4_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext4_acl_header;
25
26static inline size_t ext4_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext4_acl_header) +
30 count * sizeof(ext4_acl_entry_short);
31 } else {
32 return sizeof(ext4_acl_header) +
33 4 * sizeof(ext4_acl_entry_short) +
34 (count - 4) * sizeof(ext4_acl_entry);
35 }
36}
37
38static inline int ext4_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext4_acl_header);
42 s = size - 4 * sizeof(ext4_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext4_acl_entry_short))
45 return -1;
46 return size / sizeof(ext4_acl_entry_short);
47 } else {
48 if (s % sizeof(ext4_acl_entry))
49 return -1;
50 return s / sizeof(ext4_acl_entry) + 4;
51 }
52}
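/* Editor's worked example, not part of the patch (compiled as plain C
 * against the definitions above): the header is 4 bytes, a short entry
 * 4 bytes and a full entry 8 bytes; the first four entries are stored
 * short, the rest full. */
#include <assert.h>

static void acl_size_example(void)
{
	assert(ext4_acl_size(6) == 4 + 4 * 4 + 2 * 8);	/* 36 bytes */
	assert(ext4_acl_count(36) == 6);		/* and back again */
}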
53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
55
56/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext4_permission (struct inode *, int, struct nameidata *);
62extern int ext4_acl_chmod (struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
66#include <linux/sched.h>
67#define ext4_permission NULL
68
69static inline int
70ext4_acl_chmod(struct inode *inode)
71{
72 return 0;
73}
74
75static inline int
76ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{
78 return 0;
79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
1/*
2 * linux/fs/ext4/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/time.h>
15#include <linux/capability.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * Calculate the block group number and offset, given a block number
29 */
30void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
31 unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
32{
33 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
34 ext4_grpblk_t offset;
35
36 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
37 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
38 if (offsetp)
39 *offsetp = offset;
40 if (blockgrpp)
41 *blockgrpp = blocknr;
42
43}
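/* Editor's worked example with hypothetical numbers, not part of the
 * patch: with s_first_data_block == 1 and 8192 blocks per group, block
 * 20000 lands in group 2 at offset 3615. */
static void group_no_example(void)
{
	unsigned long long blocknr = 20000 - 1;		/* minus s_first_data_block */
	unsigned int offset = do_div(blocknr, 8192);	/* do_div() leaves the quotient behind */

	/* now blocknr == 2 (the group) and offset == 3615 */
	(void)blocknr; (void)offset;
}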
44
45/*
46 * The free blocks are managed by bitmaps. A file system contains several
 47 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
48 * block for inodes, N blocks for the inode table and data blocks.
49 *
50 * The file system contains group descriptors which are located after the
51 * super block. Each descriptor contains the number of the bitmap block and
52 * the free blocks count in the block. The descriptors are loaded in memory
53 * when a file system is mounted (see ext4_read_super).
54 */
55
56
57#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
58
59/**
60 * ext4_get_group_desc() -- load group descriptor from disk
61 * @sb: super block
62 * @block_group: given block group
63 * @bh: pointer to the buffer head to store the block
64 * group descriptor
65 */
66struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
67 unsigned int block_group,
68 struct buffer_head ** bh)
69{
70 unsigned long group_desc;
71 unsigned long offset;
72 struct ext4_group_desc * desc;
73 struct ext4_sb_info *sbi = EXT4_SB(sb);
74
75 if (block_group >= sbi->s_groups_count) {
76 ext4_error (sb, "ext4_get_group_desc",
77 "block_group >= groups_count - "
78 "block_group = %d, groups_count = %lu",
79 block_group, sbi->s_groups_count);
80
81 return NULL;
82 }
83 smp_rmb();
84
85 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
86 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
87 if (!sbi->s_group_desc[group_desc]) {
88 ext4_error (sb, "ext4_get_group_desc",
89 "Group descriptor not loaded - "
90 "block_group = %d, group_desc = %lu, desc = %lu",
91 block_group, group_desc, offset);
92 return NULL;
93 }
94
95 desc = (struct ext4_group_desc *)(
96 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
97 offset * EXT4_DESC_SIZE(sb));
98 if (bh)
99 *bh = sbi->s_group_desc[group_desc];
100 return desc;
101}
102
103/**
104 * read_block_bitmap()
105 * @sb: super block
106 * @block_group: given block group
107 *
108 * Read the bitmap for a given block_group, reading into the specified
109 * slot in the superblock's bitmap cache.
110 *
111 * Return buffer_head on success or NULL in case of failure.
112 */
113static struct buffer_head *
114read_block_bitmap(struct super_block *sb, unsigned int block_group)
115{
116 struct ext4_group_desc * desc;
117 struct buffer_head * bh = NULL;
118
119 desc = ext4_get_group_desc (sb, block_group, NULL);
120 if (!desc)
121 goto error_out;
122 bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
123 if (!bh)
124 ext4_error (sb, "read_block_bitmap",
125 "Cannot read block bitmap - "
126 "block_group = %d, block_bitmap = %llu",
127 block_group,
128 ext4_block_bitmap(sb, desc));
129error_out:
130 return bh;
131}
132/*
133 * The reservation window structure operations
134 * --------------------------------------------
135 * Operations include:
136 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
137 *
138 * We use a red-black tree to represent per-filesystem reservation
139 * windows.
140 *
141 */
142
143/**
144 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
145 * @rb_root: root of per-filesystem reservation rb tree
146 * @verbose: verbose mode
147 * @fn: function which wishes to dump the reservation map
148 *
149 * If verbose is turned on, it will print the whole block reservation
150 * windows(start, end). Otherwise, it will only print out the "bad" windows,
151 * those windows that overlap with their immediate neighbors.
152 */
153#if 1
154static void __rsv_window_dump(struct rb_root *root, int verbose,
155 const char *fn)
156{
157 struct rb_node *n;
158 struct ext4_reserve_window_node *rsv, *prev;
159 int bad;
160
161restart:
162 n = rb_first(root);
163 bad = 0;
164 prev = NULL;
165
166 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
167 while (n) {
168 rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
169 if (verbose)
170 printk("reservation window 0x%p "
171 "start: %llu, end: %llu\n",
172 rsv, rsv->rsv_start, rsv->rsv_end);
173 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
174 printk("Bad reservation %p (start >= end)\n",
175 rsv);
176 bad = 1;
177 }
178 if (prev && prev->rsv_end >= rsv->rsv_start) {
179 printk("Bad reservation %p (prev->end >= start)\n",
180 rsv);
181 bad = 1;
182 }
183 if (bad) {
184 if (!verbose) {
185 printk("Restarting reservation walk in verbose mode\n");
186 verbose = 1;
187 goto restart;
188 }
189 }
190 n = rb_next(n);
191 prev = rsv;
192 }
193 printk("Window map complete.\n");
194 if (bad)
195 BUG();
196}
197#define rsv_window_dump(root, verbose) \
198 __rsv_window_dump((root), (verbose), __FUNCTION__)
199#else
200#define rsv_window_dump(root, verbose) do {} while (0)
201#endif
202
203/**
204 * goal_in_my_reservation()
205 * @rsv: inode's reservation window
206 * @grp_goal: given goal block relative to the allocation block group
207 * @group: the current allocation block group
208 * @sb: filesystem super block
209 *
210 * Test if the given goal block (group relative) is within the file's
211 * own block reservation window range.
212 *
213 * If the reservation window is outside the goal allocation group, return 0;
214 * grp_goal (given goal block) could be -1, which means no specific
215 * goal block. In this case, always return 1.
216 * If the goal block is within the reservation window, return 1;
217 * otherwise, return 0;
218 */
219static int
220goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
221 unsigned int group, struct super_block * sb)
222{
223 ext4_fsblk_t group_first_block, group_last_block;
224
225 group_first_block = ext4_group_first_block_no(sb, group);
226 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
227
228 if ((rsv->_rsv_start > group_last_block) ||
229 (rsv->_rsv_end < group_first_block))
230 return 0;
231 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
232 || (grp_goal + group_first_block > rsv->_rsv_end)))
233 return 0;
234 return 1;
235}
236
237/**
238 * search_reserve_window()
239 * @rb_root: root of reservation tree
240 * @goal: target allocation block
241 *
242 * Find the reserved window which includes the goal, or the previous one
243 * if the goal is not in any window.
244 * Returns NULL if there are no windows or if all windows start after the goal.
245 */
246static struct ext4_reserve_window_node *
247search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
248{
249 struct rb_node *n = root->rb_node;
250 struct ext4_reserve_window_node *rsv;
251
252 if (!n)
253 return NULL;
254
255 do {
256 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
257
258 if (goal < rsv->rsv_start)
259 n = n->rb_left;
260 else if (goal > rsv->rsv_end)
261 n = n->rb_right;
262 else
263 return rsv;
264 } while (n);
265 /*
266 * We've fallen off the end of the tree: the goal wasn't inside
267 * any particular node. OK, the previous node must be to one
268 * side of the interval containing the goal. If it's the RHS,
269 * we need to back up one.
270 */
271 if (rsv->rsv_start > goal) {
272 n = rb_prev(&rsv->rsv_node);
273 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
274 }
275 return rsv;
276}
277
278/**
279 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
280 * @sb: super block
281 * @rsv: reservation window to add
282 *
 283 * Must be called with rsv_lock held.
284 */
285void ext4_rsv_window_add(struct super_block *sb,
286 struct ext4_reserve_window_node *rsv)
287{
288 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
289 struct rb_node *node = &rsv->rsv_node;
290 ext4_fsblk_t start = rsv->rsv_start;
291
292 struct rb_node ** p = &root->rb_node;
293 struct rb_node * parent = NULL;
294 struct ext4_reserve_window_node *this;
295
296 while (*p)
297 {
298 parent = *p;
299 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
300
301 if (start < this->rsv_start)
302 p = &(*p)->rb_left;
303 else if (start > this->rsv_end)
304 p = &(*p)->rb_right;
305 else {
306 rsv_window_dump(root, 1);
307 BUG();
308 }
309 }
310
311 rb_link_node(node, parent, p);
312 rb_insert_color(node, root);
313}
314
315/**
316 * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
317 * @sb: super block
318 * @rsv: reservation window to remove
319 *
320 * Mark the block reservation window as not allocated, and unlink it
321 * from the filesystem reservation window rb tree. Must be called with
 322 * rsv_lock held.
323 */
324static void rsv_window_remove(struct super_block *sb,
325 struct ext4_reserve_window_node *rsv)
326{
327 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
328 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
329 rsv->rsv_alloc_hit = 0;
330 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
331}
332
333/*
 334 * rsv_is_empty() -- Check whether the reservation window is unallocated.
335 * @rsv: given reservation window to check
336 *
337 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
338 */
339static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
340{
341 /* a valid reservation end block could not be 0 */
342 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
343}
344
345/**
346 * ext4_init_block_alloc_info()
347 * @inode: file inode structure
348 *
349 * Allocate and initialize the reservation window structure, and
 350 * finally link the window to the ext4 inode structure.
351 *
 352 * The reservation window structure is only dynamically allocated
 353 * and linked to the ext4 inode the first time the open file
 354 * needs a new block. So, before every ext4_new_block(s) call, for
 355 * regular files, we should check whether the reservation window
 356 * structure exists or not. In the latter case, this function is called.
 357 * Failure to do so will result in block reservation being turned off for
 358 * that open file.
359 *
360 * This function is called from ext4_get_blocks_handle(), also called
361 * when setting the reservation window size through ioctl before the file
362 * is open for write (needs block allocation).
363 *
 364 * Needs truncate_mutex protection prior to calling this function.
365 */
366void ext4_init_block_alloc_info(struct inode *inode)
367{
368 struct ext4_inode_info *ei = EXT4_I(inode);
369 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
370 struct super_block *sb = inode->i_sb;
371
372 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
373 if (block_i) {
374 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
375
376 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
377 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
378
379 /*
380 * if filesystem is mounted with NORESERVATION, the goal
381 * reservation window size is set to zero to indicate
382 * block reservation is off
383 */
384 if (!test_opt(sb, RESERVATION))
385 rsv->rsv_goal_size = 0;
386 else
387 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
388 rsv->rsv_alloc_hit = 0;
389 block_i->last_alloc_logical_block = 0;
390 block_i->last_alloc_physical_block = 0;
391 }
392 ei->i_block_alloc_info = block_i;
393}
394
395/**
396 * ext4_discard_reservation()
397 * @inode: inode
398 *
 399 * Discard (free) the block reservation window on last file close,
 400 * on truncate, or at last iput().
 401 *
 402 * It is called in three cases:
 403 * ext4_release_file(): the last writer closes the file
 404 * ext4_clear_inode(): last iput(), when nobody links to this file.
405 * ext4_truncate(): when the block indirect map is about to change.
406 *
407 */
408void ext4_discard_reservation(struct inode *inode)
409{
410 struct ext4_inode_info *ei = EXT4_I(inode);
411 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
412 struct ext4_reserve_window_node *rsv;
413 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
414
415 if (!block_i)
416 return;
417
418 rsv = &block_i->rsv_window_node;
419 if (!rsv_is_empty(&rsv->rsv_window)) {
420 spin_lock(rsv_lock);
421 if (!rsv_is_empty(&rsv->rsv_window))
422 rsv_window_remove(inode->i_sb, rsv);
423 spin_unlock(rsv_lock);
424 }
425}
426
427/**
428 * ext4_free_blocks_sb() -- Free given blocks and update quota
429 * @handle: handle to this transaction
430 * @sb: super block
 431 * @block: start physical block to free
432 * @count: number of blocks to free
 433 * @pdquot_freed_blocks: pointer to the count of blocks freed for quota
434 */
435void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
436 ext4_fsblk_t block, unsigned long count,
437 unsigned long *pdquot_freed_blocks)
438{
439 struct buffer_head *bitmap_bh = NULL;
440 struct buffer_head *gd_bh;
441 unsigned long block_group;
442 ext4_grpblk_t bit;
443 unsigned long i;
444 unsigned long overflow;
445 struct ext4_group_desc * desc;
446 struct ext4_super_block * es;
447 struct ext4_sb_info *sbi;
448 int err = 0, ret;
449 ext4_grpblk_t group_freed;
450
451 *pdquot_freed_blocks = 0;
452 sbi = EXT4_SB(sb);
453 es = sbi->s_es;
454 if (block < le32_to_cpu(es->s_first_data_block) ||
455 block + count < block ||
456 block + count > ext4_blocks_count(es)) {
457 ext4_error (sb, "ext4_free_blocks",
458 "Freeing blocks not in datazone - "
459 "block = %llu, count = %lu", block, count);
460 goto error_return;
461 }
462
463 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
464
465do_more:
466 overflow = 0;
467 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
468 /*
469 * Check to see if we are freeing blocks across a group
470 * boundary.
471 */
472 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
473 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
474 count -= overflow;
475 }
476 brelse(bitmap_bh);
477 bitmap_bh = read_block_bitmap(sb, block_group);
478 if (!bitmap_bh)
479 goto error_return;
480 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
481 if (!desc)
482 goto error_return;
483
484 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
485 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
486 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
487 in_range(block + count - 1, ext4_inode_table(sb, desc),
488 sbi->s_itb_per_group))
489 ext4_error (sb, "ext4_free_blocks",
490 "Freeing blocks in system zones - "
491 "Block = %llu, count = %lu",
492 block, count);
493
494 /*
495 * We are about to start releasing blocks in the bitmap,
496 * so we need undo access.
497 */
498 /* @@@ check errors */
499 BUFFER_TRACE(bitmap_bh, "getting undo access");
500 err = ext4_journal_get_undo_access(handle, bitmap_bh);
501 if (err)
502 goto error_return;
503
504 /*
505 * We are about to modify some metadata. Call the journal APIs
506 * to unshare ->b_data if a currently-committing transaction is
507 * using it
508 */
509 BUFFER_TRACE(gd_bh, "get_write_access");
510 err = ext4_journal_get_write_access(handle, gd_bh);
511 if (err)
512 goto error_return;
513
514 jbd_lock_bh_state(bitmap_bh);
515
516 for (i = 0, group_freed = 0; i < count; i++) {
517 /*
518 * An HJ special. This is expensive...
519 */
520#ifdef CONFIG_JBD_DEBUG
521 jbd_unlock_bh_state(bitmap_bh);
522 {
523 struct buffer_head *debug_bh;
524 debug_bh = sb_find_get_block(sb, block + i);
525 if (debug_bh) {
526 BUFFER_TRACE(debug_bh, "Deleted!");
527 if (!bh2jh(bitmap_bh)->b_committed_data)
528 BUFFER_TRACE(debug_bh,
 529 "No committed data in bitmap");
530 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
531 __brelse(debug_bh);
532 }
533 }
534 jbd_lock_bh_state(bitmap_bh);
535#endif
536 if (need_resched()) {
537 jbd_unlock_bh_state(bitmap_bh);
538 cond_resched();
539 jbd_lock_bh_state(bitmap_bh);
540 }
541 /* @@@ This prevents newly-allocated data from being
542 * freed and then reallocated within the same
543 * transaction.
544 *
545 * Ideally we would want to allow that to happen, but to
546 * do so requires making jbd2_journal_forget() capable of
547 * revoking the queued write of a data block, which
548 * implies blocking on the journal lock. *forget()
549 * cannot block due to truncate races.
550 *
551 * Eventually we can fix this by making jbd2_journal_forget()
552 * return a status indicating whether or not it was able
553 * to revoke the buffer. On successful revoke, it is
554 * safe not to set the allocation bit in the committed
555 * bitmap, because we know that there is no outstanding
556 * activity on the buffer any more and so it is safe to
557 * reallocate it.
558 */
559 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
560 J_ASSERT_BH(bitmap_bh,
561 bh2jh(bitmap_bh)->b_committed_data != NULL);
562 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
563 bh2jh(bitmap_bh)->b_committed_data);
564
565 /*
566 * We clear the bit in the bitmap after setting the committed
567 * data bit, because this is the reverse order to that which
568 * the allocator uses.
569 */
570 BUFFER_TRACE(bitmap_bh, "clear bit");
571 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
572 bit + i, bitmap_bh->b_data)) {
573 jbd_unlock_bh_state(bitmap_bh);
574 ext4_error(sb, __FUNCTION__,
575 "bit already cleared for block %llu",
576 (ext4_fsblk_t)(block + i));
577 jbd_lock_bh_state(bitmap_bh);
578 BUFFER_TRACE(bitmap_bh, "bit already cleared");
579 } else {
580 group_freed++;
581 }
582 }
583 jbd_unlock_bh_state(bitmap_bh);
584
585 spin_lock(sb_bgl_lock(sbi, block_group));
586 desc->bg_free_blocks_count =
587 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
588 group_freed);
589 spin_unlock(sb_bgl_lock(sbi, block_group));
590 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
591
592 /* We dirtied the bitmap block */
593 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
594 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
595
596 /* And the group descriptor block */
597 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
598 ret = ext4_journal_dirty_metadata(handle, gd_bh);
599 if (!err) err = ret;
600 *pdquot_freed_blocks += group_freed;
601
602 if (overflow && !err) {
603 block += count;
604 count = overflow;
605 goto do_more;
606 }
607 sb->s_dirt = 1;
608error_return:
609 brelse(bitmap_bh);
610 ext4_std_error(sb, err);
611 return;
612}
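/* Editor's worked example, not part of the patch: with
 * EXT4_BLOCKS_PER_GROUP(sb) == 8192, freeing count = 500 blocks from
 * group-relative bit 8000 gives overflow = 8000 + 500 - 8192 = 308;
 * the first pass frees bits 8000..8191 (192 blocks) in this group,
 * then jumps back to do_more with count = 308 for the next group. */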
613
614/**
615 * ext4_free_blocks() -- Free given blocks and update quota
616 * @handle: handle for this transaction
617 * @inode: inode
618 * @block: start physical block to free
 619 * @count: number of blocks to free
620 */
621void ext4_free_blocks(handle_t *handle, struct inode *inode,
622 ext4_fsblk_t block, unsigned long count)
623{
624 struct super_block * sb;
625 unsigned long dquot_freed_blocks;
626
627 sb = inode->i_sb;
628 if (!sb) {
629 printk ("ext4_free_blocks: nonexistent device");
630 return;
631 }
632 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
633 if (dquot_freed_blocks)
634 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
635 return;
636}
637
638/**
639 * ext4_test_allocatable()
 640 * @nr: given block number (group relative) to test
 641 * @bh: bufferhead containing the bitmap of the given block group
642 *
643 * For ext4 allocations, we must not reuse any blocks which are
644 * allocated in the bitmap buffer's "last committed data" copy. This
645 * prevents deletes from freeing up the page for reuse until we have
646 * committed the delete transaction.
647 *
648 * If we didn't do this, then deleting something and reallocating it as
649 * data would allow the old block to be overwritten before the
650 * transaction committed (because we force data to disk before commit).
651 * This would lead to corruption if we crashed between overwriting the
652 * data and committing the delete.
653 *
654 * @@@ We may want to make this allocation behaviour conditional on
655 * data-writes at some point, and disable it for metadata allocations or
656 * sync-data inodes.
657 */
658static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
659{
660 int ret;
661 struct journal_head *jh = bh2jh(bh);
662
663 if (ext4_test_bit(nr, bh->b_data))
664 return 0;
665
666 jbd_lock_bh_state(bh);
667 if (!jh->b_committed_data)
668 ret = 1;
669 else
670 ret = !ext4_test_bit(nr, jh->b_committed_data);
671 jbd_unlock_bh_state(bh);
672 return ret;
673}
674
675/**
676 * bitmap_search_next_usable_block()
677 * @start: the starting block (group relative) of the search
678 * @bh: bufferhead contains the block group bitmap
679 * @maxblocks: the ending block (group relative) of the reservation
680 *
681 * The bitmap search --- search forward alternately through the actual
682 * bitmap on disk and the last-committed copy in journal, until we find a
683 * bit free in both bitmaps.
684 */
685static ext4_grpblk_t
686bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
687 ext4_grpblk_t maxblocks)
688{
689 ext4_grpblk_t next;
690 struct journal_head *jh = bh2jh(bh);
691
692 while (start < maxblocks) {
693 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
694 if (next >= maxblocks)
695 return -1;
696 if (ext4_test_allocatable(next, bh))
697 return next;
698 jbd_lock_bh_state(bh);
699 if (jh->b_committed_data)
700 start = ext4_find_next_zero_bit(jh->b_committed_data,
701 maxblocks, next);
702 jbd_unlock_bh_state(bh);
703 }
704 return -1;
705}
706
707/**
708 * find_next_usable_block()
709 * @start: the starting block (group relative) to find next
710 * allocatable block in bitmap.
711 * @bh: bufferhead contains the block group bitmap
712 * @maxblocks: the ending block (group relative) for the search
713 *
714 * Find an allocatable block in a bitmap. We honor both the bitmap and
715 * its last-committed copy (if that exists), and perform the "most
716 * appropriate allocation" algorithm of looking for a free block near
717 * the initial goal; then for a free byte somewhere in the bitmap; then
718 * for any free bit in the bitmap.
719 */
720static ext4_grpblk_t
721find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
722 ext4_grpblk_t maxblocks)
723{
724 ext4_grpblk_t here, next;
725 char *p, *r;
726
727 if (start > 0) {
728 /*
729 * The goal was occupied; search forward for a free
730 * block within the next XX blocks.
731 *
732 * end_goal is more or less random, but it has to be
733 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
734 * next 64-bit boundary is simple..
735 */
736 ext4_grpblk_t end_goal = (start + 63) & ~63;
737 if (end_goal > maxblocks)
738 end_goal = maxblocks;
739 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
740 if (here < end_goal && ext4_test_allocatable(here, bh))
741 return here;
742 ext4_debug("Bit not found near goal\n");
743 }
744
745 here = start;
746 if (here < 0)
747 here = 0;
748
749 p = ((char *)bh->b_data) + (here >> 3);
750 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
751 next = (r - ((char *)bh->b_data)) << 3;
752
753 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
754 return next;
755
756 /*
757 * The bitmap search --- search forward alternately through the actual
758 * bitmap and the last-committed copy until we find a bit free in
759 * both
760 */
761 here = bitmap_search_next_usable_block(here, bh, maxblocks);
762 return here;
763}
764
765/**
766 * claim_block()
767 * @block: the free block (group relative) to allocate
 768 * @bh: the bufferhead containing the block group bitmap
769 *
770 * We think we can allocate this block in this bitmap. Try to set the bit.
771 * If that succeeds then check that nobody has allocated and then freed the
 772 * block since we saw that it was not marked in b_committed_data. If it _was_
773 * allocated and freed then clear the bit in the bitmap again and return
774 * zero (failure).
775 */
776static inline int
777claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
778{
779 struct journal_head *jh = bh2jh(bh);
780 int ret;
781
782 if (ext4_set_bit_atomic(lock, block, bh->b_data))
783 return 0;
784 jbd_lock_bh_state(bh);
785 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
786 ext4_clear_bit_atomic(lock, block, bh->b_data);
787 ret = 0;
788 } else {
789 ret = 1;
790 }
791 jbd_unlock_bh_state(bh);
792 return ret;
793}
794
795/**
796 * ext4_try_to_allocate()
797 * @sb: superblock
798 * @handle: handle to this transaction
799 * @group: given allocation block group
800 * @bitmap_bh: bufferhead holds the block bitmap
801 * @grp_goal: given target block within the group
802 * @count: target number of blocks to allocate
803 * @my_rsv: reservation window
804 *
 805 * Attempt to allocate blocks within a given range. Set the range of allocation
 806 * first, then find the first free bit(s) from the bitmap (within the range),
 807 * and finally allocate the blocks by marking the found free bits as allocated.
808 *
809 * To set the range of this allocation:
810 * if there is a reservation window, only try to allocate block(s) from the
811 * file's own reservation window;
 812 * Otherwise, the allocation range starts from the given goal block and ends
 813 * at the block group's last block.
814 *
815 * If we failed to allocate the desired block then we may end up crossing to a
816 * new bitmap. In that case we must release write access to the old one via
817 * ext4_journal_release_buffer(), else we'll run out of credits.
818 */
819static ext4_grpblk_t
820ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
821 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
822 unsigned long *count, struct ext4_reserve_window *my_rsv)
823{
824 ext4_fsblk_t group_first_block;
825 ext4_grpblk_t start, end;
826 unsigned long num = 0;
827
828 /* we do allocation within the reservation window if we have a window */
829 if (my_rsv) {
830 group_first_block = ext4_group_first_block_no(sb, group);
831 if (my_rsv->_rsv_start >= group_first_block)
832 start = my_rsv->_rsv_start - group_first_block;
833 else
834 /* reservation window cross group boundary */
835 start = 0;
836 end = my_rsv->_rsv_end - group_first_block + 1;
837 if (end > EXT4_BLOCKS_PER_GROUP(sb))
838 /* reservation window crosses group boundary */
839 end = EXT4_BLOCKS_PER_GROUP(sb);
840 if ((start <= grp_goal) && (grp_goal < end))
841 start = grp_goal;
842 else
843 grp_goal = -1;
844 } else {
845 if (grp_goal > 0)
846 start = grp_goal;
847 else
848 start = 0;
849 end = EXT4_BLOCKS_PER_GROUP(sb);
850 }
851
852 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
853
854repeat:
855 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
856 grp_goal = find_next_usable_block(start, bitmap_bh, end);
857 if (grp_goal < 0)
858 goto fail_access;
859 if (!my_rsv) {
860 int i;
861
862 for (i = 0; i < 7 && grp_goal > start &&
863 ext4_test_allocatable(grp_goal - 1,
864 bitmap_bh);
865 i++, grp_goal--)
866 ;
867 }
868 }
869 start = grp_goal;
870
871 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
872 grp_goal, bitmap_bh)) {
873 /*
874 * The block was allocated by another thread, or it was
875 * allocated and then freed by another thread
876 */
877 start++;
878 grp_goal++;
879 if (start >= end)
880 goto fail_access;
881 goto repeat;
882 }
883 num++;
884 grp_goal++;
885 while (num < *count && grp_goal < end
886 && ext4_test_allocatable(grp_goal, bitmap_bh)
887 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
888 grp_goal, bitmap_bh)) {
889 num++;
890 grp_goal++;
891 }
892 *count = num;
893 return grp_goal - num;
894fail_access:
895 *count = num;
896 return -1;
897}
898
899/**
900 * find_next_reservable_window():
901 * find a reservable space within the given range.
902 * It does not allocate the reservation window for now:
903 * alloc_new_reservation() will do the work later.
904 *
905 * @search_head: the head of the searching list;
906 * This is not necessarily the list head of the whole filesystem
907 *
908 * We have both head and start_block to assist the search
909 * for the reservable space. The list starts from head,
910 * but we will shift to the place where start_block is,
911 * then start from there, when looking for a reservable space.
912 *
913 * @size: the target new reservation window size
914 *
915 * @group_first_block: the first block we consider to start
916 * the real search from
917 *
918 * @last_block:
919 * the maximum block number that our goal reservable space
920 * could start from. This is normally the last block in this
 921 * group. The search will end when the start of the next
 922 * possible reservable space is beyond this boundary.
 923 * This handles reservation window requests that cross the
 924 * group boundary.
925 *
 926 * Basically we search the given range (start_block, last_block),
 927 * rather than the whole reservation tree, to find a free
 928 * region that is of the requested size and has not
 929 * been reserved.
930 *
931 */
932static int find_next_reservable_window(
933 struct ext4_reserve_window_node *search_head,
934 struct ext4_reserve_window_node *my_rsv,
935 struct super_block * sb,
936 ext4_fsblk_t start_block,
937 ext4_fsblk_t last_block)
938{
939 struct rb_node *next;
940 struct ext4_reserve_window_node *rsv, *prev;
941 ext4_fsblk_t cur;
942 int size = my_rsv->rsv_goal_size;
943
944 /* TODO: make the start of the reservation window byte-aligned */
945 /* cur = *start_block & ~7;*/
946 cur = start_block;
947 rsv = search_head;
948 if (!rsv)
949 return -1;
950
951 while (1) {
952 if (cur <= rsv->rsv_end)
953 cur = rsv->rsv_end + 1;
954
955 /* TODO?
 956 * in the case we could not find a reservable space of the
 957 * size we expected, then during the re-search we could
 958 * remember the largest reservable space we saw
 959 * and return that one.
 960 *
 961 * For now it will fail if we could not find a reservable
 962 * space of the expected size (or more)...
963 */
964 if (cur > last_block)
965 return -1; /* fail */
966
967 prev = rsv;
968 next = rb_next(&rsv->rsv_node);
969 rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
970
971 /*
972 * Reached the last reservation, we can just append to the
973 * previous one.
974 */
975 if (!next)
976 break;
977
978 if (cur + size <= rsv->rsv_start) {
979 /*
 980 * Found a reservable space big enough. We could
981 * have a reservation across the group boundary here
982 */
983 break;
984 }
985 }
986 /*
 987 * We come here either:
 988 * when we reach the end of the whole list and there is
 989 * empty reservable space after the last entry in the list,
 990 * in which case we append to the end of the list;
 991 *
 992 * or when we find a reservable space in the middle of the
 993 * list, in which case we return the reservation window
 994 * that we could append to. Success.
995 */
996
997 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
998 rsv_window_remove(sb, my_rsv);
999
1000 /*
 1001 * Let's book the whole available window for now. We will check the
 1002 * disk bitmap later and, if there are free blocks, we adjust
 1003 * the window size if it's larger than requested.
 1004 * Otherwise, we will remove this node from the tree the next time
 1005 * find_next_reservable_window() is called.
1006 */
1007 my_rsv->rsv_start = cur;
1008 my_rsv->rsv_end = cur + size - 1;
1009 my_rsv->rsv_alloc_hit = 0;
1010
1011 if (prev != my_rsv)
1012 ext4_rsv_window_add(sb, my_rsv);
1013
1014 return 0;
1015}
1016
1017/**
1018 * alloc_new_reservation()--allocate a new reservation window
1019 *
1020 * To make a new reservation, we search part of the filesystem
 1021 * reservation list (the list inside the group). We try to
1022 * allocate a new reservation window near the allocation goal,
1023 * or the beginning of the group, if there is no goal.
1024 *
1025 * We first find a reservable space after the goal, then from
1026 * there, we check the bitmap for the first free block after
 1027 * it. If there is no free block up to the end of the group, the
 1028 * whole group is full and we fail. Otherwise, we check whether
 1029 * the free block is inside the expected reservable space; if so,
 1030 * we succeed.
 1031 * If the first free block is outside the reservable space, we
 1032 * search for the next available space starting from that free
 1033 * block, and go on.
 1034 *
 1035 * On success, a new reservation is found and inserted into the list.
 1036 * It contains at least one free block, and it does not overlap with other
 1037 * reservation windows.
 1038 *
 1039 * On failure, we failed to find a reservation window in this group.
1040 *
1041 * @rsv: the reservation
1042 *
1043 * @grp_goal: The goal (group-relative). It is where the search for a
1044 * free reservable space should start from.
 1045 * If we have a grp_goal (grp_goal > 0), then we start from there;
 1046 * with no grp_goal (grp_goal = -1), we start from the first block
 1047 * of the group.
1048 *
1049 * @sb: the super block
1050 * @group: the group we are trying to allocate in
1051 * @bitmap_bh: the block group block bitmap
1052 *
1053 */
1054static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1055 ext4_grpblk_t grp_goal, struct super_block *sb,
1056 unsigned int group, struct buffer_head *bitmap_bh)
1057{
1058 struct ext4_reserve_window_node *search_head;
1059 ext4_fsblk_t group_first_block, group_end_block, start_block;
1060 ext4_grpblk_t first_free_block;
1061 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1062 unsigned long size;
1063 int ret;
1064 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1065
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1068
1069 if (grp_goal < 0)
1070 start_block = group_first_block;
1071 else
1072 start_block = grp_goal + group_first_block;
1073
1074 size = my_rsv->rsv_goal_size;
1075
1076 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1077 /*
 1078 * if the old reservation crosses the group boundary
 1079 * and if the goal is inside the old reservation window,
 1080 * we will come here when we just failed to allocate from
 1081 * the first part of the window. We still have another part
 1082 * that belongs to the next group. In this case, there is no
 1083 * point in discarding our window and trying to allocate a
 1084 * new one in this group (which will fail); we should
 1085 * keep the reservation window and simply move on.
1086 *
1087 * Maybe we could shift the start block of the reservation
1088 * window to the first block of next group.
1089 */
1090
1091 if ((my_rsv->rsv_start <= group_end_block) &&
1092 (my_rsv->rsv_end > group_end_block) &&
1093 (start_block >= my_rsv->rsv_start))
1094 return -1;
1095
1096 if ((my_rsv->rsv_alloc_hit >
1097 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1098 /*
 1099 * if the previous allocation hit ratio is
1100 * greater than 1/2, then we double the size of
1101 * the reservation window the next time,
1102 * otherwise we keep the same size window
1103 */
1104 size = size * 2;
1105 if (size > EXT4_MAX_RESERVE_BLOCKS)
1106 size = EXT4_MAX_RESERVE_BLOCKS;
1107 my_rsv->rsv_goal_size= size;
1108 }
1109 }
1110
1111 spin_lock(rsv_lock);
1112 /*
1113 * shift the search start to the window near the goal block
1114 */
1115 search_head = search_reserve_window(fs_rsv_root, start_block);
1116
1117 /*
1118 * find_next_reservable_window() simply finds a reservable window
1119 * inside the given range(start_block, group_end_block).
1120 *
1121 * To make sure the reservation window has a free bit inside it, we
1122 * need to check the bitmap after we found a reservable window.
1123 */
1124retry:
1125 ret = find_next_reservable_window(search_head, my_rsv, sb,
1126 start_block, group_end_block);
1127
1128 if (ret == -1) {
1129 if (!rsv_is_empty(&my_rsv->rsv_window))
1130 rsv_window_remove(sb, my_rsv);
1131 spin_unlock(rsv_lock);
1132 return -1;
1133 }
1134
1135 /*
1136 * On success, find_next_reservable_window() returns the
1137 * reservation window where there is a reservable space after it.
1138 * Before we reserve this reservable space, we need
1139 * to make sure there is at least a free block inside this region.
1140 *
 1141 * We search for the first free bit, alternating between the block
 1142 * bitmap and the copy of the last committed bitmap, until we find
 1143 * an allocatable block. The search starts from the start block of
 1144 * the reservable space we just found.
1145 */
1146 spin_unlock(rsv_lock);
1147 first_free_block = bitmap_search_next_usable_block(
1148 my_rsv->rsv_start - group_first_block,
1149 bitmap_bh, group_end_block - group_first_block + 1);
1150
1151 if (first_free_block < 0) {
1152 /*
 1153 * no free block left on the bitmap, so there is no point
 1154 * in reserving the space. return failure.
1155 */
1156 spin_lock(rsv_lock);
1157 if (!rsv_is_empty(&my_rsv->rsv_window))
1158 rsv_window_remove(sb, my_rsv);
1159 spin_unlock(rsv_lock);
1160 return -1; /* failed */
1161 }
1162
1163 start_block = first_free_block + group_first_block;
1164 /*
1165 * check if the first free block is within the
1166 * free space we just reserved
1167 */
1168 if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
1169 return 0; /* success */
1170 /*
 1171 * if the first free bit we found is outside the reservable space,
 1172 * continue the search for the next reservable space,
 1173 * starting from where that free block is;
 1174 * we also shift the list head to where we stopped last time
1175 */
1176 search_head = my_rsv;
1177 spin_lock(rsv_lock);
1178 goto retry;
1179}
1180
1181/**
1182 * try_to_extend_reservation()
1183 * @my_rsv: given reservation window
1184 * @sb: super block
1185 * @size: the delta to extend
1186 *
 1187 * Attempt to expand the reservation window so that it is large
 1188 * enough to hold the required number of free blocks
1189 *
1190 * Since ext4_try_to_allocate() will always allocate blocks within
1191 * the reservation window range, if the window size is too small,
 1192 * a multiple-block allocation has to stop at the end of the reservation
 1193 * window. To make this more efficient, given the total number of
 1194 * blocks needed and the current size of the window, we try to
 1195 * expand the reservation window size if necessary on a best-effort
 1196 * basis before ext4_new_blocks() tries to allocate blocks.
1197 */
1198static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1199 struct super_block *sb, int size)
1200{
1201 struct ext4_reserve_window_node *next_rsv;
1202 struct rb_node *next;
1203 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1204
1205 if (!spin_trylock(rsv_lock))
1206 return;
1207
1208 next = rb_next(&my_rsv->rsv_node);
1209
1210 if (!next)
1211 my_rsv->rsv_end += size;
1212 else {
1213 next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
1214
1215 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1216 my_rsv->rsv_end += size;
1217 else
1218 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1219 }
1220 spin_unlock(rsv_lock);
1221}
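To make the three extension cases above concrete, here is a minimal user-space sketch of the gap arithmetic, using hypothetical window positions (rsv_end is inclusive, as in the kernel structures):

#include <stdio.h>

int main(void)
{
	unsigned long long rsv_end = 107;	/* window covers [100, 107] */
	unsigned long long next_start = 120;	/* neighbour starts at 120 */
	unsigned long long size = 16;		/* extra blocks wanted */

	/* mirror of the logic above: the gap to the neighbour is
	 * next_start - rsv_end - 1 = 12 blocks here */
	if (next_start - rsv_end - 1 >= size)
		rsv_end += size;		/* gap is large enough */
	else
		rsv_end = next_start - 1;	/* grow up to the neighbour */

	printf("new rsv_end = %llu\n", rsv_end);	/* prints 119 */
	return 0;
}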
1222
1223/**
1224 * ext4_try_to_allocate_with_rsv()
1225 * @sb: superblock
1226 * @handle: handle to this transaction
1227 * @group: given allocation block group
1228 * @bitmap_bh: bufferhead holds the block bitmap
1229 * @grp_goal: given target block within the group
1230 * @count: target number of blocks to allocate
1231 * @my_rsv: reservation window
1232 * @errp: pointer to store the error code
1233 *
1234 * This is the main function used to allocate a new block and its reservation
1235 * window.
1236 *
1237 * Each time a new block allocation is needed, we first try to allocate
1238 * from the file's own reservation. If the file does not have a reservation
1239 * window yet, then instead of looking for a free bit in the bitmap and
1240 * checking the reservation list to see whether that bit falls inside
1241 * somebody else's window, we try to allocate a reservation window for the
1242 * file, starting from the goal, and then allocate blocks within that window.
1243 *
1244 * This avoids searching the reservation list again and again when
1245 * somebody is looking for a free block (without a reservation)
1246 * while there are lots of free blocks, but they are all
1247 * being reserved.
1248 *
1249 * We use a red-black tree for the per-filesystem reservation list.
1250 *
1251 */
1252static ext4_grpblk_t
1253ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1254 unsigned int group, struct buffer_head *bitmap_bh,
1255 ext4_grpblk_t grp_goal,
1256 struct ext4_reserve_window_node * my_rsv,
1257 unsigned long *count, int *errp)
1258{
1259 ext4_fsblk_t group_first_block, group_last_block;
1260 ext4_grpblk_t ret = 0;
1261 int fatal;
1262 unsigned long num = *count;
1263
1264 *errp = 0;
1265
1266 /*
1267 * Make sure we use undo access for the bitmap, because it is critical
1268 * that we do the frozen_data COW on bitmap buffers in all cases even
1269 * if the buffer is in BJ_Forget state in the committing transaction.
1270 */
1271 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1272 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1273 if (fatal) {
1274 *errp = fatal;
1275 return -1;
1276 }
1277
1278 /*
1279	 * we don't deal with reservations when the
1280	 * filesystem is mounted without reservations,
1281	 * or the file is not a regular file,
1282	 * or the last attempt to allocate a block with reservations turned on failed
1283 */
1284	if (my_rsv == NULL) {
1285 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1286 grp_goal, count, NULL);
1287 goto out;
1288 }
1289 /*
1290	 * grp_goal is a group-relative block number (if there is a goal):
1291	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb).
1292	 * group_first_block is a filesystem-wide block number; it is
1293	 * the block number of the first block in this group.
1294 */
1295 group_first_block = ext4_group_first_block_no(sb, group);
1296 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1297
1298 /*
1299	 * Basically we will allocate a new block from the inode's reservation
1300	 * window.
1301	 *
1302	 * We need to allocate a new reservation window if:
1303	 * a) the inode does not have a reservation window; or
1304	 * b) the last attempt to allocate a block from the existing
1305	 * reservation failed; or
1306	 * c) we come here with a goal that lies outside the existing window.
1307	 *
1308	 * We do not need to allocate a new reservation window if we come here
1309	 * at the beginning with a goal and the goal is inside the window, or
1310	 * if we don't have a goal but already have a reservation window;
1311	 * then we can allocate from the reservation window directly.
1312 */
1313 while (1) {
1314 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1315 !goal_in_my_reservation(&my_rsv->rsv_window,
1316 grp_goal, group, sb)) {
1317 if (my_rsv->rsv_goal_size < *count)
1318 my_rsv->rsv_goal_size = *count;
1319 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1320 group, bitmap_bh);
1321 if (ret < 0)
1322 break; /* failed */
1323
1324 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1325 grp_goal, group, sb))
1326 grp_goal = -1;
1327 } else if (grp_goal > 0 &&
1328 (my_rsv->rsv_end-grp_goal+1) < *count)
1329 try_to_extend_reservation(my_rsv, sb,
1330 *count-my_rsv->rsv_end + grp_goal - 1);
1331
1332 if ((my_rsv->rsv_start > group_last_block) ||
1333 (my_rsv->rsv_end < group_first_block)) {
1334 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1335 BUG();
1336 }
1337 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1338 grp_goal, &num, &my_rsv->rsv_window);
1339 if (ret >= 0) {
1340 my_rsv->rsv_alloc_hit += num;
1341 *count = num;
1342 break; /* succeed */
1343 }
1344 num = *count;
1345 }
1346out:
1347 if (ret >= 0) {
1348 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1349 "bitmap block");
1350 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1351 if (fatal) {
1352 *errp = fatal;
1353 return -1;
1354 }
1355 return ret;
1356 }
1357
1358 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1359 ext4_journal_release_buffer(handle, bitmap_bh);
1360 return ret;
1361}
1362
1363/**
1364 * ext4_has_free_blocks()
1365 * @sbi: in-core super block structure.
1366 *
1367 * Check if filesystem has at least 1 free block available for allocation.
1368 */
1369static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1370{
1371 ext4_fsblk_t free_blocks, root_blocks;
1372
1373 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1374 root_blocks = ext4_r_blocks_count(sbi->s_es);
1375 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1376 sbi->s_resuid != current->fsuid &&
1377 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1378 return 0;
1379 }
1380 return 1;
1381}
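A small user-space sketch of the reserved-blocks check above, with hypothetical counts; "privileged" stands in for the CAP_SYS_RESOURCE/resuid/resgid tests:

#include <stdio.h>

static int has_free_blocks(unsigned long long free_blocks,
			   unsigned long long root_blocks, int privileged)
{
	/* same shape as the check above: ordinary users may not dip
	 * into the blocks reserved for root */
	if (free_blocks < root_blocks + 1 && !privileged)
		return 0;
	return 1;
}

int main(void)
{
	/* 600 free vs. 1000 reserved: refused for ordinary users
	 * (600 < 1001), still allowed for privileged ones */
	printf("%d %d\n", has_free_blocks(600, 1000, 0),
	       has_free_blocks(600, 1000, 1));
	return 0;
}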
1382
1383/**
1384 * ext4_should_retry_alloc()
1385 * @sb: super block
1386 * @retries: number of attempts that have been made
1387 *
1388 * ext4_should_retry_alloc() is called when ENOSPC is returned. If
1389 * it is profitable to retry the operation, this function will wait
1390 * for the current or committing transaction to complete, and then
1391 * return TRUE.
1392 *
1393 * If the total number of retries exceeds three, return FALSE.
1394 */
1395int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1396{
1397 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
1398 return 0;
1399
1400 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1401
1402 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1403}
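A sketch of the intended caller pattern; the allocation function named here is hypothetical:

/*
 *	int retries = 0;
 *	int err;
 * retry:
 *	err = some_ext4_allocation(handle, inode, ...);
 *	if (err == -ENOSPC && ext4_should_retry_alloc(sb, &retries))
 *		goto retry;
 */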
1404
1405/**
1406 * ext4_new_blocks() -- core block(s) allocation function
1407 * @handle: handle to this transaction
1408 * @inode: file inode
1409 * @goal: given target block(filesystem wide)
1410 * @count: target number of blocks to allocate
1411 * @errp: error code
1412 *
1413 * ext4_new_blocks uses a goal block to assist allocation. It tries to
1414 * allocate block(s) from the block group that contains the goal block first. If that
1415 * fails, it will try to allocate block(s) from other block groups without
1416 * any specific goal block.
1417 *
1418 */
1419ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1420 ext4_fsblk_t goal, unsigned long *count, int *errp)
1421{
1422 struct buffer_head *bitmap_bh = NULL;
1423 struct buffer_head *gdp_bh;
1424 unsigned long group_no;
1425 int goal_group;
1426 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1427 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1428	ext4_fsblk_t ret_block;		/* filesystem-wide allocated block */
1429 int bgi; /* blockgroup iteration index */
1430 int fatal = 0, err;
1431 int performed_allocation = 0;
1432 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1433 struct super_block *sb;
1434 struct ext4_group_desc *gdp;
1435 struct ext4_super_block *es;
1436 struct ext4_sb_info *sbi;
1437 struct ext4_reserve_window_node *my_rsv = NULL;
1438 struct ext4_block_alloc_info *block_i;
1439 unsigned short windowsz = 0;
1440#ifdef EXT4FS_DEBUG
1441 static int goal_hits, goal_attempts;
1442#endif
1443 unsigned long ngroups;
1444 unsigned long num = *count;
1445
1446 *errp = -ENOSPC;
1447 sb = inode->i_sb;
1448 if (!sb) {
1449		printk(KERN_ERR "ext4_new_block: nonexistent device\n");
1450 return 0;
1451 }
1452
1453 /*
1454 * Check quota for allocation of this block.
1455 */
1456 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1457 *errp = -EDQUOT;
1458 return 0;
1459 }
1460
1461 sbi = EXT4_SB(sb);
1462 es = EXT4_SB(sb)->s_es;
1463	ext4_debug("goal=%llu.\n", goal);
1464 /*
1465	 * Allocate a block from the reservation only when the
1466	 * filesystem is mounted with reservations (the default, -o reservation),
1467	 * it's a regular file, and
1468	 * the desired window size is greater than 0 (one could use the ioctl
1469	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1470	 * reservation on that particular file)
1471 */
1472 block_i = EXT4_I(inode)->i_block_alloc_info;
1473 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1474 my_rsv = &block_i->rsv_window_node;
1475
1476 if (!ext4_has_free_blocks(sbi)) {
1477 *errp = -ENOSPC;
1478 goto out;
1479 }
1480
1481 /*
1482 * First, test whether the goal block is free.
1483 */
1484 if (goal < le32_to_cpu(es->s_first_data_block) ||
1485 goal >= ext4_blocks_count(es))
1486 goal = le32_to_cpu(es->s_first_data_block);
1487 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1488 goal_group = group_no;
1489retry_alloc:
1490 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1491 if (!gdp)
1492 goto io_error;
1493
1494 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1495 /*
1496	 * if there are not enough free blocks to make a new reservation,
1497	 * turn off reservation for this allocation
1498 */
1499 if (my_rsv && (free_blocks < windowsz)
1500 && (rsv_is_empty(&my_rsv->rsv_window)))
1501 my_rsv = NULL;
1502
1503 if (free_blocks > 0) {
1504 bitmap_bh = read_block_bitmap(sb, group_no);
1505 if (!bitmap_bh)
1506 goto io_error;
1507 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1508 group_no, bitmap_bh, grp_target_blk,
1509 my_rsv, &num, &fatal);
1510 if (fatal)
1511 goto out;
1512 if (grp_alloc_blk >= 0)
1513 goto allocated;
1514 }
1515
1516 ngroups = EXT4_SB(sb)->s_groups_count;
1517 smp_rmb();
1518
1519 /*
1520 * Now search the rest of the groups. We assume that
1521	 * group_no and gdp correctly point to the last group visited.
1522 */
1523 for (bgi = 0; bgi < ngroups; bgi++) {
1524 group_no++;
1525 if (group_no >= ngroups)
1526 group_no = 0;
1527 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1528 if (!gdp) {
1529 *errp = -EIO;
1530 goto out;
1531 }
1532 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1533 /*
1534 * skip this group if the number of
1535 * free blocks is less than half of the reservation
1536 * window size.
1537 */
1538 if (free_blocks <= (windowsz/2))
1539 continue;
1540
1541 brelse(bitmap_bh);
1542 bitmap_bh = read_block_bitmap(sb, group_no);
1543 if (!bitmap_bh)
1544 goto io_error;
1545 /*
1546		 * try to allocate block(s) from this group, without a goal (-1).
1547 */
1548 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1549 group_no, bitmap_bh, -1, my_rsv,
1550 &num, &fatal);
1551 if (fatal)
1552 goto out;
1553 if (grp_alloc_blk >= 0)
1554 goto allocated;
1555 }
1556 /*
1557	 * We may end up with a bogus ENOSPC error because the
1558	 * filesystem is "full" of reservations, while there may
1559	 * indeed be free blocks available on disk. In this case,
1560	 * we just forget about the reservations and do the block
1561	 * allocation as if there were no reservations.
1562 */
1563 if (my_rsv) {
1564 my_rsv = NULL;
1565 group_no = goal_group;
1566 goto retry_alloc;
1567 }
1568 /* No space left on the device */
1569 *errp = -ENOSPC;
1570 goto out;
1571
1572allocated:
1573
1574	ext4_debug("using block group %lu(%d)\n",
1575			group_no, le16_to_cpu(gdp->bg_free_blocks_count));
1576
1577 BUFFER_TRACE(gdp_bh, "get_write_access");
1578 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1579 if (fatal)
1580 goto out;
1581
1582 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1583
1584 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1585	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1586 in_range(ret_block, ext4_inode_table(sb, gdp),
1587 EXT4_SB(sb)->s_itb_per_group) ||
1588 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1589 EXT4_SB(sb)->s_itb_per_group))
1590 ext4_error(sb, "ext4_new_block",
1591 "Allocating block in system zone - "
1592 "blocks from %llu, length %lu",
1593 ret_block, num);
1594
1595 performed_allocation = 1;
1596
1597#ifdef CONFIG_JBD_DEBUG
1598 {
1599 struct buffer_head *debug_bh;
1600
1601 /* Record bitmap buffer state in the newly allocated block */
1602 debug_bh = sb_find_get_block(sb, ret_block);
1603 if (debug_bh) {
1604 BUFFER_TRACE(debug_bh, "state when allocated");
1605 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1606 brelse(debug_bh);
1607 }
1608 }
1609 jbd_lock_bh_state(bitmap_bh);
1610 spin_lock(sb_bgl_lock(sbi, group_no));
1611 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1612 int i;
1613
1614 for (i = 0; i < num; i++) {
1615 if (ext4_test_bit(grp_alloc_blk+i,
1616 bh2jh(bitmap_bh)->b_committed_data)) {
1617 printk("%s: block was unexpectedly set in "
1618 "b_committed_data\n", __FUNCTION__);
1619 }
1620 }
1621 }
1622 ext4_debug("found bit %d\n", grp_alloc_blk);
1623 spin_unlock(sb_bgl_lock(sbi, group_no));
1624 jbd_unlock_bh_state(bitmap_bh);
1625#endif
1626
1627 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1628 ext4_error(sb, "ext4_new_block",
1629 "block(%llu) >= blocks count(%llu) - "
1630 "block_group = %lu, es == %p ", ret_block,
1631 ext4_blocks_count(es), group_no, es);
1632 goto out;
1633 }
1634
1635 /*
1636 * It is up to the caller to add the new buffer to a journal
1637 * list of some description. We don't know in advance whether
1638 * the caller wants to use it as metadata or data.
1639 */
1640 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1641 ret_block, goal_hits, goal_attempts);
1642
1643 spin_lock(sb_bgl_lock(sbi, group_no));
1644 gdp->bg_free_blocks_count =
1645 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1646 spin_unlock(sb_bgl_lock(sbi, group_no));
1647 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1648
1649 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1650 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1651 if (!fatal)
1652 fatal = err;
1653
1654 sb->s_dirt = 1;
1655 if (fatal)
1656 goto out;
1657
1658 *errp = 0;
1659 brelse(bitmap_bh);
1660 DQUOT_FREE_BLOCK(inode, *count-num);
1661 *count = num;
1662 return ret_block;
1663
1664io_error:
1665 *errp = -EIO;
1666out:
1667 if (fatal) {
1668 *errp = fatal;
1669 ext4_std_error(sb, fatal);
1670 }
1671 /*
1672 * Undo the block allocation
1673 */
1674 if (!performed_allocation)
1675 DQUOT_FREE_BLOCK(inode, *count);
1676 brelse(bitmap_bh);
1677 return 0;
1678}
1679
1680ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1681 ext4_fsblk_t goal, int *errp)
1682{
1683 unsigned long count = 1;
1684
1685 return ext4_new_blocks(handle, inode, goal, &count, errp);
1686}
1687
1688/**
1689 * ext4_count_free_blocks() -- count filesystem free blocks
1690 * @sb: superblock
1691 *
1692 * Adds up the number of free blocks from each block group.
1693 */
1694ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1695{
1696 ext4_fsblk_t desc_count;
1697 struct ext4_group_desc *gdp;
1698 int i;
1699 unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
1700#ifdef EXT4FS_DEBUG
1701 struct ext4_super_block *es;
1702 ext4_fsblk_t bitmap_count;
1703 unsigned long x;
1704 struct buffer_head *bitmap_bh = NULL;
1705
1706 es = EXT4_SB(sb)->s_es;
1707 desc_count = 0;
1708 bitmap_count = 0;
1709 gdp = NULL;
1710
1711 smp_rmb();
1712 for (i = 0; i < ngroups; i++) {
1713 gdp = ext4_get_group_desc(sb, i, NULL);
1714 if (!gdp)
1715 continue;
1716 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1717 brelse(bitmap_bh);
1718 bitmap_bh = read_block_bitmap(sb, i);
1719 if (bitmap_bh == NULL)
1720 continue;
1721
1722 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1723 printk("group %d: stored = %d, counted = %lu\n",
1724 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1725 bitmap_count += x;
1726 }
1727 brelse(bitmap_bh);
1728 printk("ext4_count_free_blocks: stored = %llu"
1729 ", computed = %llu, %llu\n",
1730 EXT4_FREE_BLOCKS_COUNT(es),
1731 desc_count, bitmap_count);
1732 return bitmap_count;
1733#else
1734 desc_count = 0;
1735 smp_rmb();
1736 for (i = 0; i < ngroups; i++) {
1737 gdp = ext4_get_group_desc(sb, i, NULL);
1738 if (!gdp)
1739 continue;
1740 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1741 }
1742
1743 return desc_count;
1744#endif
1745}
1746
1747static inline int
1748block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
1749{
1750 ext4_grpblk_t offset;
1751
1752 ext4_get_group_no_and_offset(sb, block, NULL, &offset);
1753 return ext4_test_bit (offset, map);
1754}
1755
1756static inline int test_root(int a, int b)
1757{
1758 int num = b;
1759
1760 while (a > num)
1761 num *= b;
1762 return num == a;
1763}
1764
1765static int ext4_group_sparse(int group)
1766{
1767 if (group <= 1)
1768 return 1;
1769 if (!(group & 1))
1770 return 0;
1771 return (test_root(group, 7) || test_root(group, 5) ||
1772 test_root(group, 3));
1773}
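The placement rule above is easy to check outside the kernel; this user-space sketch copies the two helpers above (ext4_group_sparse renamed group_sparse) and prints which of the first 100 groups carry a superblock backup:

#include <stdio.h>

static int test_root(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}

static int group_sparse(int group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return test_root(group, 7) || test_root(group, 5) ||
	       test_root(group, 3);
}

int main(void)
{
	int g;

	/* prints: 0 1 3 5 7 9 25 27 49 81 */
	for (g = 0; g < 100; g++)
		if (group_sparse(g))
			printf("%d ", g);
	printf("\n");
	return 0;
}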
1774
1775/**
1776 * ext4_bg_has_super - number of blocks used by the superblock in group
1777 * @sb: superblock for filesystem
1778 * @group: group number to check
1779 *
1780 * Return the number of blocks used by the superblock (primary or backup)
1781 * in this group. Currently this will be only 0 or 1.
1782 */
1783int ext4_bg_has_super(struct super_block *sb, int group)
1784{
1785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1786 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1787 !ext4_group_sparse(group))
1788 return 0;
1789 return 1;
1790}
1791
1792static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
1793{
1794 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1795 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1796 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1797
1798 if (group == first || group == first + 1 || group == last)
1799 return 1;
1800 return 0;
1801}
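A worked example for the META_BG case above, assuming a 4KiB block size with 32-byte group descriptors (so EXT4_DESC_PER_BLOCK(sb) == 128):

/*
 * Metagroup 0 spans groups 0..127; its single descriptor block is
 * replicated in groups 0, 1 and 127, and every other group in the
 * metagroup holds no descriptor blocks at all.
 */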
1802
1803static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1804{
1805 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1806 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1807 !ext4_group_sparse(group))
1808 return 0;
1809 return EXT4_SB(sb)->s_gdb_count;
1810}
1811
1812/**
1813 * ext4_bg_num_gdb - number of blocks used by the group table in group
1814 * @sb: superblock for filesystem
1815 * @group: group number to check
1816 *
1817 * Return the number of blocks used by the group descriptor table
1818 * (primary or backup) in this group. In the future there may be a
1819 * different number of descriptor blocks in each group.
1820 */
1821unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
1822{
1823 unsigned long first_meta_bg =
1824 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
1825 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1826
1827 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
1828 metagroup < first_meta_bg)
1829 return ext4_bg_num_gdb_nometa(sb,group);
1830
1831 return ext4_bg_num_gdb_meta(sb,group);
1832
1833}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
1/*
2 * linux/fs/ext4/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include <linux/ext4_fs.h>
13
14#ifdef EXT4FS_DEBUG
15
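/*
 * nibblemap[n] is the number of zero bits in the 4-bit value n, e.g.
 * nibblemap[0x5] == 2 (binary 0101 has two clear bits); counting free
 * blocks then costs two table lookups per bitmap byte.
 */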
16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
19{
20 unsigned int i;
21 unsigned long sum = 0;
22
23 if (!map)
24 return (0);
25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum);
29}
30
31#endif /* EXT4FS_DEBUG */
32
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
1/*
2 * linux/fs/ext4/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/ext4_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32static unsigned char ext4_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext4_readdir(struct file *, void *, filldir_t);
37static int ext4_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext4_release_dir (struct inode * inode,
40 struct file * filp);
41
42const struct file_operations ext4_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45	.readdir	= ext4_readdir,		/* we take BKL. needed? */
46 .ioctl = ext4_ioctl, /* BKL held */
47#ifdef CONFIG_COMPAT
48 .compat_ioctl = ext4_compat_ioctl,
49#endif
50 .fsync = ext4_sync_file, /* BKL held */
51#ifdef CONFIG_EXT4_INDEX
52 .release = ext4_release_dir,
53#endif
54};
55
56
57static unsigned char get_dtype(struct super_block *sb, int filetype)
58{
59 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
60 (filetype >= EXT4_FT_MAX))
61 return DT_UNKNOWN;
62
63 return (ext4_filetype_table[filetype]);
64}
65
66
67int ext4_check_dir_entry (const char * function, struct inode * dir,
68 struct ext4_dir_entry_2 * de,
69 struct buffer_head * bh,
70 unsigned long offset)
71{
72 const char * error_msg = NULL;
73 const int rlen = le16_to_cpu(de->rec_len);
74
75 if (rlen < EXT4_DIR_REC_LEN(1))
76 error_msg = "rec_len is smaller than minimal";
77 else if (rlen % 4 != 0)
78 error_msg = "rec_len % 4 != 0";
79 else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
80 error_msg = "rec_len is too small for name_len";
81 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
82 error_msg = "directory entry across blocks";
83 else if (le32_to_cpu(de->inode) >
84 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
85 error_msg = "inode out of bounds";
86
87 if (error_msg != NULL)
88 ext4_error (dir->i_sb, function,
89 "bad entry in directory #%lu: %s - "
90 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
91 dir->i_ino, error_msg, offset,
92 (unsigned long) le32_to_cpu(de->inode),
93 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0;
95}
96
97static int ext4_readdir(struct file * filp,
98 void * dirent, filldir_t filldir)
99{
100 int error = 0;
101 unsigned long offset;
102 int i, stored;
103 struct ext4_dir_entry_2 *de;
104 struct super_block *sb;
105 int err;
106 struct inode *inode = filp->f_dentry->d_inode;
107 int ret = 0;
108
109 sb = inode->i_sb;
110
111#ifdef CONFIG_EXT4_INDEX
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) {
118 ret = err;
119 goto out;
120 }
121 /*
122 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk.
124 */
125 EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
126 }
127#endif
128 stored = 0;
129 offset = filp->f_pos & (sb->s_blocksize - 1);
130
131 while (!error && !stored && filp->f_pos < inode->i_size) {
132 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
133 struct buffer_head map_bh;
134 struct buffer_head *bh = NULL;
135
136 map_bh.b_state = 0;
137 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
138 if (err > 0) {
139 page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
140 &filp->f_ra,
141 filp,
142 map_bh.b_blocknr >>
143 (PAGE_CACHE_SHIFT - inode->i_blkbits),
144 1);
145 bh = ext4_bread(NULL, inode, blk, 0, &err);
146 }
147
148 /*
149 * We ignore I/O errors on directories so users have a chance
150 * of recovering data when there's a bad sector
151 */
152 if (!bh) {
153 ext4_error (sb, "ext4_readdir",
154 "directory #%lu contains a hole at offset %lu",
155 inode->i_ino, (unsigned long)filp->f_pos);
156 filp->f_pos += sb->s_blocksize - offset;
157 continue;
158 }
159
160revalidate:
161 /* If the dir block has changed since the last call to
162 * readdir(2), then we might be pointing to an invalid
163 * dirent right now. Scan from the start of the block
164 * to make sure. */
165 if (filp->f_version != inode->i_version) {
166 for (i = 0; i < sb->s_blocksize && i < offset; ) {
167 de = (struct ext4_dir_entry_2 *)
168 (bh->b_data + i);
169 /* It's too expensive to do a full
170 * dirent test each time round this
171 * loop, but we do have to test at
172 * least that it is non-zero. A
173 * failure will be detected in the
174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) <
176 EXT4_DIR_REC_LEN(1))
177 break;
178 i += le16_to_cpu(de->rec_len);
179 }
180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
182 | offset;
183 filp->f_version = inode->i_version;
184 }
185
186 while (!error && filp->f_pos < inode->i_size
187 && offset < sb->s_blocksize) {
188 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
189 if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
190 bh, offset)) {
191 /*
192 * On error, skip the f_pos to the next block
193 */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT4_INDEX
232/*
233 * These functions convert from the major/minor hash to an f_pos
234 * value.
235 *
236 * Currently we only use the major hash number. This is unfortunate, but
237 * on 32-bit machines, the same VFS interface is used for lseek and
238 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
239 * lseek/telldir/seekdir will blow out spectacularly, and from within
240 * the ext2 low-level routine, we don't know if we're being called by
241 * a 64-bit version of the system call or the 32-bit version of the
242 * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
243 * cookie. Sigh.
244 */
245#define hash2pos(major, minor) (major >> 1)
246#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
247#define pos2min_hash(pos) (0)
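A worked example of the conversion macros above:

/*
 * Worked example: major hash 0x12345678 maps to f_pos 0x091a2b3c, and
 * pos2maj_hash() recovers 0x12345678 from it. An odd major hash would
 * lose its low bit in the round trip.
 */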
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entry in hash order.
252 */
253struct fname {
254 __u32 hash;
255 __u32 minor_hash;
256 struct rb_node rb_hash;
257 struct fname *next;
258 __u32 inode;
259 __u8 name_len;
260 __u8 file_type;
261 char name[0];
262};
263
264/*
265 * This function implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext4_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext4_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kzalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 new_fn->hash = hash;
353 new_fn->minor_hash = minor_hash;
354 new_fn->inode = le32_to_cpu(dirent->inode);
355 new_fn->name_len = dirent->name_len;
356 new_fn->file_type = dirent->file_type;
357 memcpy(new_fn->name, dirent->name, dirent->name_len);
358 new_fn->name[dirent->name_len] = 0;
359
360 while (*p) {
361 parent = *p;
362 fname = rb_entry(parent, struct fname, rb_hash);
363
364 /*
365 * If the hash and minor hash match up, then we put
366 * them on a linked list. This rarely happens...
367 */
368 if ((new_fn->hash == fname->hash) &&
369 (new_fn->minor_hash == fname->minor_hash)) {
370 new_fn->next = fname->next;
371 fname->next = new_fn;
372 return 0;
373 }
374
375 if (new_fn->hash < fname->hash)
376 p = &(*p)->rb_left;
377 else if (new_fn->hash > fname->hash)
378 p = &(*p)->rb_right;
379 else if (new_fn->minor_hash < fname->minor_hash)
380 p = &(*p)->rb_left;
381 else /* if (new_fn->minor_hash > fname->minor_hash) */
382 p = &(*p)->rb_right;
383 }
384
385 rb_link_node(&new_fn->rb_hash, parent, p);
386 rb_insert_color(&new_fn->rb_hash, &info->root);
387 return 0;
388}
389
390
391
392/*
393 * This is a helper function for ext4_dx_readdir. It calls filldir
394 * for all entries on the fname linked list. (Normally there is only
395 * one entry on the linked list, unless there are 62 bit hash collisions.)
396 */
397static int call_filldir(struct file * filp, void * dirent,
398 filldir_t filldir, struct fname *fname)
399{
400 struct dir_private_info *info = filp->private_data;
401 loff_t curr_pos;
402 struct inode *inode = filp->f_dentry->d_inode;
403 struct super_block * sb;
404 int error;
405
406 sb = inode->i_sb;
407
408 if (!fname) {
409 printk("call_filldir: called with null fname?!?\n");
410 return 0;
411 }
412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
413 while (fname) {
414 error = filldir(dirent, fname->name,
415 fname->name_len, curr_pos,
416 fname->inode,
417 get_dtype(sb, fname->file_type));
418 if (error) {
419 filp->f_pos = curr_pos;
420 info->extra_fname = fname->next;
421 return error;
422 }
423 fname = fname->next;
424 }
425 return 0;
426}
427
428static int ext4_dx_readdir(struct file * filp,
429 void * dirent, filldir_t filldir)
430{
431 struct dir_private_info *info = filp->private_data;
432 struct inode *inode = filp->f_dentry->d_inode;
433 struct fname *fname;
434 int ret;
435
436 if (!info) {
437 info = create_dir_info(filp->f_pos);
438 if (!info)
439 return -ENOMEM;
440 filp->private_data = info;
441 }
442
443 if (filp->f_pos == EXT4_HTREE_EOF)
444 return 0; /* EOF */
445
446	/* Someone has messed with f_pos; reset the world */
447 if (info->last_pos != filp->f_pos) {
448 free_rb_tree_fname(&info->root);
449 info->curr_node = NULL;
450 info->extra_fname = NULL;
451 info->curr_hash = pos2maj_hash(filp->f_pos);
452 info->curr_minor_hash = pos2min_hash(filp->f_pos);
453 }
454
455 /*
456 * If there are any leftover names on the hash collision
457 * chain, return them first.
458 */
459 if (info->extra_fname &&
460 call_filldir(filp, dirent, filldir, info->extra_fname))
461 goto finished;
462
463 if (!info->curr_node)
464 info->curr_node = rb_first(&info->root);
465
466 while (1) {
467 /*
468 * Fill the rbtree if we have no more entries,
469 * or the inode has changed since we last read in the
470 * cached entries.
471 */
472 if ((!info->curr_node) ||
473 (filp->f_version != inode->i_version)) {
474 info->curr_node = NULL;
475 free_rb_tree_fname(&info->root);
476 filp->f_version = inode->i_version;
477 ret = ext4_htree_fill_tree(filp, info->curr_hash,
478 info->curr_minor_hash,
479 &info->next_hash);
480 if (ret < 0)
481 return ret;
482 if (ret == 0) {
483 filp->f_pos = EXT4_HTREE_EOF;
484 break;
485 }
486 info->curr_node = rb_first(&info->root);
487 }
488
489 fname = rb_entry(info->curr_node, struct fname, rb_hash);
490 info->curr_hash = fname->hash;
491 info->curr_minor_hash = fname->minor_hash;
492 if (call_filldir(filp, dirent, filldir, fname))
493 break;
494
495 info->curr_node = rb_next(info->curr_node);
496 if (!info->curr_node) {
497 if (info->next_hash == ~0) {
498 filp->f_pos = EXT4_HTREE_EOF;
499 break;
500 }
501 info->curr_hash = info->next_hash;
502 info->curr_minor_hash = 0;
503 }
504 }
505finished:
506 info->last_pos = filp->f_pos;
507 return 0;
508}
509
510static int ext4_release_dir (struct inode * inode, struct file * filp)
511{
512 if (filp->private_data)
513 ext4_htree_free_dir_info(filp->private_data);
514
515 return 0;
516}
517
518#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 * Copyright (c) 2005, Bull S.A.
7 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 * - ext4*_error() should be used in some situations
28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 * - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/ext4_jbd2.h>
36#include <linux/jbd.h>
37#include <linux/smp_lock.h>
38#include <linux/highuid.h>
39#include <linux/pagemap.h>
40#include <linux/quotaops.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/ext4_fs_extents.h>
44#include <asm/uaccess.h>
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53 ext4_fsblk_t block;
54
55 block = le32_to_cpu(ex->ee_start);
56 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57 return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66 ext4_fsblk_t block;
67
68 block = le32_to_cpu(ix->ei_leaf);
69 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70 return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80 ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91 ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
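A worked example of the 48-bit split performed by the helpers above; the two-step "<< 31 << 1" is presumably there so the shift stays defined even if ext4_fsblk_t were only 32 bits wide:

/*
 * Worked example: physical block 0x123456789abc is stored as
 * ee_start = 0x56789abc (low 32 bits) and ee_start_hi = 0x1234
 * (high 16 bits); ext_pblock() reassembles 0x123456789abc.
 */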
94
95static int ext4_ext_check_header(const char *function, struct inode *inode,
96 struct ext4_extent_header *eh)
97{
98 const char *error_msg = NULL;
99
100 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
101 error_msg = "invalid magic";
102 goto corrupted;
103 }
104 if (unlikely(eh->eh_max == 0)) {
105 error_msg = "invalid eh_max";
106 goto corrupted;
107 }
108 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
109 error_msg = "invalid eh_entries";
110 goto corrupted;
111 }
112 return 0;
113
114corrupted:
115 ext4_error(inode->i_sb, function,
116 "bad header in inode #%lu: %s - magic %x, "
117 "entries %u, max %u, depth %u",
118 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
119 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
120 le16_to_cpu(eh->eh_depth));
121
122 return -EIO;
123}
124
125static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
126{
127 int err;
128
129 if (handle->h_buffer_credits > needed)
130 return handle;
131 if (!ext4_journal_extend(handle, needed))
132 return handle;
133 err = ext4_journal_restart(handle, needed);
134
135 return handle;
136}
137
138/*
139 * could return:
140 * - EROFS
141 * - ENOMEM
142 */
143static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
144 struct ext4_ext_path *path)
145{
146 if (path->p_bh) {
147 /* path points to block */
148 return ext4_journal_get_write_access(handle, path->p_bh);
149 }
150 /* path points to leaf/index in inode body */
151 /* we use in-core data, no need to protect them */
152 return 0;
153}
154
155/*
156 * could return:
157 * - EROFS
158 * - ENOMEM
159 * - EIO
160 */
161static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
162 struct ext4_ext_path *path)
163{
164 int err;
165 if (path->p_bh) {
166 /* path points to block */
167 err = ext4_journal_dirty_metadata(handle, path->p_bh);
168 } else {
169 /* path points to leaf/index in inode body */
170 err = ext4_mark_inode_dirty(handle, inode);
171 }
172 return err;
173}
174
175static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
176 struct ext4_ext_path *path,
177 ext4_fsblk_t block)
178{
179 struct ext4_inode_info *ei = EXT4_I(inode);
180 ext4_fsblk_t bg_start;
181 ext4_grpblk_t colour;
182 int depth;
183
184 if (path) {
185 struct ext4_extent *ex;
186 depth = path->p_depth;
187
188 /* try to predict block placement */
189 if ((ex = path[depth].p_ext))
190 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
191
192 /* it looks like index is empty;
193 * try to find starting block from index itself */
194 if (path[depth].p_bh)
195 return path[depth].p_bh->b_blocknr;
196 }
197
198 /* OK. use inode's group */
199 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
200 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
201 colour = (current->pid % 16) *
202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
203 return bg_start + colour + block;
204}
205
206static ext4_fsblk_t
207ext4_ext_new_block(handle_t *handle, struct inode *inode,
208 struct ext4_ext_path *path,
209 struct ext4_extent *ex, int *err)
210{
211 ext4_fsblk_t goal, newblock;
212
213 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
214 newblock = ext4_new_block(handle, inode, goal, err);
215 return newblock;
216}
217
218static inline int ext4_ext_space_block(struct inode *inode)
219{
220 int size;
221
222 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
223 / sizeof(struct ext4_extent);
224#ifdef AGRESSIVE_TEST
225 if (size > 6)
226 size = 6;
227#endif
228 return size;
229}
230
231static inline int ext4_ext_space_block_idx(struct inode *inode)
232{
233 int size;
234
235 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
236 / sizeof(struct ext4_extent_idx);
237#ifdef AGRESSIVE_TEST
238 if (size > 5)
239 size = 5;
240#endif
241 return size;
242}
243
244static inline int ext4_ext_space_root(struct inode *inode)
245{
246 int size;
247
248 size = sizeof(EXT4_I(inode)->i_data);
249 size -= sizeof(struct ext4_extent_header);
250 size /= sizeof(struct ext4_extent);
251#ifdef AGRESSIVE_TEST
252 if (size > 3)
253 size = 3;
254#endif
255 return size;
256}
257
258static inline int ext4_ext_space_root_idx(struct inode *inode)
259{
260 int size;
261
262 size = sizeof(EXT4_I(inode)->i_data);
263 size -= sizeof(struct ext4_extent_header);
264 size /= sizeof(struct ext4_extent_idx);
265#ifdef AGRESSIVE_TEST
266 if (size > 4)
267 size = 4;
268#endif
269 return size;
270}
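A worked example of the root capacities computed above (on-disk sizes: i_data is 60 bytes; the extent header, struct ext4_extent and struct ext4_extent_idx are 12 bytes each):

/*
 * (60 - 12) / 12 = 4, so the root in the inode body holds at most
 * four entries, whether extents or indexes (ignoring the
 * AGRESSIVE_TEST caps used for testing deep trees).
 */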
271
272#ifdef EXT_DEBUG
273static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
274{
275 int k, l = path->p_depth;
276
277 ext_debug("path:");
278 for (k = 0; k <= l; k++, path++) {
279 if (path->p_idx) {
280 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
281 idx_pblock(path->p_idx));
282 } else if (path->p_ext) {
283 ext_debug(" %d:%d:%llu ",
284 le32_to_cpu(path->p_ext->ee_block),
285 le16_to_cpu(path->p_ext->ee_len),
286 ext_pblock(path->p_ext));
287 } else
288 ext_debug(" []");
289 }
290 ext_debug("\n");
291}
292
293static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
294{
295 int depth = ext_depth(inode);
296 struct ext4_extent_header *eh;
297 struct ext4_extent *ex;
298 int i;
299
300 if (!path)
301 return;
302
303 eh = path[depth].p_hdr;
304 ex = EXT_FIRST_EXTENT(eh);
305
306 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
307 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
308 le16_to_cpu(ex->ee_len), ext_pblock(ex));
309 }
310 ext_debug("\n");
311}
312#else
313#define ext4_ext_show_path(inode,path)
314#define ext4_ext_show_leaf(inode,path)
315#endif
316
317static void ext4_ext_drop_refs(struct ext4_ext_path *path)
318{
319 int depth = path->p_depth;
320 int i;
321
322 for (i = 0; i <= depth; i++, path++)
323 if (path->p_bh) {
324 brelse(path->p_bh);
325 path->p_bh = NULL;
326 }
327}
328
329/*
330 * ext4_ext_binsearch_idx:
331 * binary search for the closest index of the given block
332 */
333static void
334ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
335{
336 struct ext4_extent_header *eh = path->p_hdr;
337 struct ext4_extent_idx *r, *l, *m;
338
339 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
340 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
341 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
342
343 ext_debug("binsearch for %d(idx): ", block);
344
345 l = EXT_FIRST_INDEX(eh) + 1;
346 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
347 while (l <= r) {
348 m = l + (r - l) / 2;
349 if (block < le32_to_cpu(m->ei_block))
350 r = m - 1;
351 else
352 l = m + 1;
353 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
354 m, m->ei_block, r, r->ei_block);
355 }
356
357 path->p_idx = l - 1;
358 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
359			idx_pblock(path->p_idx));
360
361#ifdef CHECK_BINSEARCH
362 {
363 struct ext4_extent_idx *chix, *ix;
364 int k;
365
366 chix = ix = EXT_FIRST_INDEX(eh);
367 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
368 if (k != 0 &&
369 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
370 printk("k=%d, ix=0x%p, first=0x%p\n", k,
371 ix, EXT_FIRST_INDEX(eh));
372 printk("%u <= %u\n",
373 le32_to_cpu(ix->ei_block),
374 le32_to_cpu(ix[-1].ei_block));
375 }
376 BUG_ON(k && le32_to_cpu(ix->ei_block)
377 <= le32_to_cpu(ix[-1].ei_block));
378 if (block < le32_to_cpu(ix->ei_block))
379 break;
380 chix = ix;
381 }
382 BUG_ON(chix != path->p_idx);
383 }
384#endif
385
386}
387
388/*
389 * ext4_ext_binsearch:
390 * binary search for closest extent of the given block
391 */
392static void
393ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
394{
395 struct ext4_extent_header *eh = path->p_hdr;
396 struct ext4_extent *r, *l, *m;
397
398 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
399 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
400
401 if (eh->eh_entries == 0) {
402 /*
403 * this leaf is empty:
404 * we get such a leaf in split/add case
405 */
406 return;
407 }
408
409 ext_debug("binsearch for %d: ", block);
410
411 l = EXT_FIRST_EXTENT(eh) + 1;
412 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
413
414 while (l <= r) {
415 m = l + (r - l) / 2;
416 if (block < le32_to_cpu(m->ee_block))
417 r = m - 1;
418 else
419 l = m + 1;
420 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
421 m, m->ee_block, r, r->ee_block);
422 }
423
424 path->p_ext = l - 1;
425 ext_debug(" -> %d:%llu:%d ",
426 le32_to_cpu(path->p_ext->ee_block),
427 ext_pblock(path->p_ext),
428 le16_to_cpu(path->p_ext->ee_len));
429
430#ifdef CHECK_BINSEARCH
431 {
432 struct ext4_extent *chex, *ex;
433 int k;
434
435 chex = ex = EXT_FIRST_EXTENT(eh);
436 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
437 BUG_ON(k && le32_to_cpu(ex->ee_block)
438 <= le32_to_cpu(ex[-1].ee_block));
439 if (block < le32_to_cpu(ex->ee_block))
440 break;
441 chex = ex;
442 }
443 BUG_ON(chex != path->p_ext);
444 }
445#endif
446
447}
448
449int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
450{
451 struct ext4_extent_header *eh;
452
453 eh = ext_inode_hdr(inode);
454 eh->eh_depth = 0;
455 eh->eh_entries = 0;
456 eh->eh_magic = EXT4_EXT_MAGIC;
457 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
458 ext4_mark_inode_dirty(handle, inode);
459 ext4_ext_invalidate_cache(inode);
460 return 0;
461}
462
463struct ext4_ext_path *
464ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
465{
466 struct ext4_extent_header *eh;
467 struct buffer_head *bh;
468 short int depth, i, ppos = 0, alloc = 0;
469
470 eh = ext_inode_hdr(inode);
471 BUG_ON(eh == NULL);
472 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
473 return ERR_PTR(-EIO);
474
475 i = depth = ext_depth(inode);
476
477 /* account possible depth increase */
478 if (!path) {
479 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
480 GFP_NOFS);
481 if (!path)
482 return ERR_PTR(-ENOMEM);
483 alloc = 1;
484 }
485 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
486 path[0].p_hdr = eh;
487
488 /* walk through the tree */
489 while (i) {
490 ext_debug("depth %d: num %d, max %d\n",
491 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
492 ext4_ext_binsearch_idx(inode, path + ppos, block);
493 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
494 path[ppos].p_depth = i;
495 path[ppos].p_ext = NULL;
496
497 bh = sb_bread(inode->i_sb, path[ppos].p_block);
498 if (!bh)
499 goto err;
500
501 eh = ext_block_hdr(bh);
502 ppos++;
503 BUG_ON(ppos > depth);
504 path[ppos].p_bh = bh;
505 path[ppos].p_hdr = eh;
506 i--;
507
508 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
509 goto err;
510 }
511
512 path[ppos].p_depth = i;
513 path[ppos].p_hdr = eh;
514 path[ppos].p_ext = NULL;
515 path[ppos].p_idx = NULL;
516
517 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
518 goto err;
519
520 /* find extent */
521 ext4_ext_binsearch(inode, path + ppos, block);
522
523 ext4_ext_show_path(inode, path);
524
525 return path;
526
527err:
528 ext4_ext_drop_refs(path);
529 if (alloc)
530 kfree(path);
531 return ERR_PTR(-EIO);
532}
533
534/*
535 * ext4_ext_insert_index:
536 * insert new index [@logical;@ptr] into the block at @curp;
537 * check where to insert: before @curp or after @curp
538 */
539static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
540 struct ext4_ext_path *curp,
541 int logical, ext4_fsblk_t ptr)
542{
543 struct ext4_extent_idx *ix;
544 int len, err;
545
546 if ((err = ext4_ext_get_access(handle, inode, curp)))
547 return err;
548
549 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
550 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
551 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
552 /* insert after */
553 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
554 len = (len - 1) * sizeof(struct ext4_extent_idx);
555 len = len < 0 ? 0 : len;
556 ext_debug("insert new index %d after: %d. "
557 "move %d from 0x%p to 0x%p\n",
558 logical, ptr, len,
559 (curp->p_idx + 1), (curp->p_idx + 2));
560 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
561 }
562 ix = curp->p_idx + 1;
563 } else {
564 /* insert before */
565 len = len * sizeof(struct ext4_extent_idx);
566 len = len < 0 ? 0 : len;
567 ext_debug("insert new index %d before: %d. "
568 "move %d from 0x%p to 0x%p\n",
569 logical, ptr, len,
570 curp->p_idx, (curp->p_idx + 1));
571 memmove(curp->p_idx + 1, curp->p_idx, len);
572 ix = curp->p_idx;
573 }
574
575 ix->ei_block = cpu_to_le32(logical);
576 ext4_idx_store_pblock(ix, ptr);
577 curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
578
579 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
580 > le16_to_cpu(curp->p_hdr->eh_max));
581 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
582
583 err = ext4_ext_dirty(handle, inode, curp);
584 ext4_std_error(inode->i_sb, err);
585
586 return err;
587}
588
589/*
590 * ext4_ext_split:
591 * inserts new subtree into the path, using free index entry
592 * at depth @at:
593 * - allocates all needed blocks (new leaf and all intermediate index blocks)
594 * - makes decision where to split
595 * - moves remaining extents and index entries (right to the split point)
596 * into the newly allocated blocks
597 * - initializes subtree
598 */
599static int ext4_ext_split(handle_t *handle, struct inode *inode,
600 struct ext4_ext_path *path,
601 struct ext4_extent *newext, int at)
602{
603 struct buffer_head *bh = NULL;
604 int depth = ext_depth(inode);
605 struct ext4_extent_header *neh;
606 struct ext4_extent_idx *fidx;
607 struct ext4_extent *ex;
608 int i = at, k, m, a;
609 ext4_fsblk_t newblock, oldblock;
610 __le32 border;
611 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
612 int err = 0;
613
614 /* make decision: where to split? */
615 /* FIXME: now decision is simplest: at current extent */
616
617 /* if current leaf will be split, then we should use
618 * border from split point */
619 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
620 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
621 border = path[depth].p_ext[1].ee_block;
622 ext_debug("leaf will be split."
623 " next leaf starts at %d\n",
624 le32_to_cpu(border));
625 } else {
626 border = newext->ee_block;
627 ext_debug("leaf will be added."
628 " next leaf starts at %d\n",
629 le32_to_cpu(border));
630 }
631
632 /*
633	 * If an error occurs, we break processing
634	 * and mark the filesystem read-only. The index won't
635	 * be inserted and the tree will remain in a consistent
636	 * state. The next mount will repair the buffers, too.
637 */
638
639 /*
640 * Get array to track all allocated blocks.
641 * We need this to handle errors and free blocks
642 * upon them.
643 */
644 ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
645 if (!ablocks)
646 return -ENOMEM;
647 memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
648
649 /* allocate all needed blocks */
650 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
651 for (a = 0; a < depth - at; a++) {
652 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
653 if (newblock == 0)
654 goto cleanup;
655 ablocks[a] = newblock;
656 }
657
658 /* initialize new leaf */
659 newblock = ablocks[--a];
660 BUG_ON(newblock == 0);
661 bh = sb_getblk(inode->i_sb, newblock);
662 if (!bh) {
663 err = -EIO;
664 goto cleanup;
665 }
666 lock_buffer(bh);
667
668 if ((err = ext4_journal_get_create_access(handle, bh)))
669 goto cleanup;
670
671 neh = ext_block_hdr(bh);
672 neh->eh_entries = 0;
673 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
674 neh->eh_magic = EXT4_EXT_MAGIC;
675 neh->eh_depth = 0;
676 ex = EXT_FIRST_EXTENT(neh);
677
678 /* move remainder of path[depth] to the new leaf */
679 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
680 /* start copy from next extent */
681 /* TODO: we could do it by single memmove */
682 m = 0;
683 path[depth].p_ext++;
684 while (path[depth].p_ext <=
685 EXT_MAX_EXTENT(path[depth].p_hdr)) {
686 ext_debug("move %d:%llu:%d in new leaf %llu\n",
687 le32_to_cpu(path[depth].p_ext->ee_block),
688 ext_pblock(path[depth].p_ext),
689 le16_to_cpu(path[depth].p_ext->ee_len),
690 newblock);
691 /*memmove(ex++, path[depth].p_ext++,
692 sizeof(struct ext4_extent));
693 neh->eh_entries++;*/
694 path[depth].p_ext++;
695 m++;
696 }
697 if (m) {
698 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
699 neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
700 }
701
702 set_buffer_uptodate(bh);
703 unlock_buffer(bh);
704
705 if ((err = ext4_journal_dirty_metadata(handle, bh)))
706 goto cleanup;
707 brelse(bh);
708 bh = NULL;
709
710 /* correct old leaf */
711 if (m) {
712 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
713 goto cleanup;
714 path[depth].p_hdr->eh_entries =
715 cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
716 if ((err = ext4_ext_dirty(handle, inode, path + depth)))
717 goto cleanup;
718
719 }
720
721 /* create intermediate indexes */
722 k = depth - at - 1;
723 BUG_ON(k < 0);
724 if (k)
725 ext_debug("create %d intermediate indices\n", k);
726 /* insert new index into current index block */
727 /* current depth stored in i var */
728 i = depth - 1;
729 while (k--) {
730 oldblock = newblock;
731 newblock = ablocks[--a];
732 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
733 if (!bh) {
734 err = -EIO;
735 goto cleanup;
736 }
737 lock_buffer(bh);
738
739 if ((err = ext4_journal_get_create_access(handle, bh)))
740 goto cleanup;
741
742 neh = ext_block_hdr(bh);
743 neh->eh_entries = cpu_to_le16(1);
744 neh->eh_magic = EXT4_EXT_MAGIC;
745 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
746 neh->eh_depth = cpu_to_le16(depth - i);
747 fidx = EXT_FIRST_INDEX(neh);
748 fidx->ei_block = border;
749 ext4_idx_store_pblock(fidx, oldblock);
750
751 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
752 newblock, (unsigned long) le32_to_cpu(border),
753 oldblock);
754 /* copy indexes */
755 m = 0;
756 path[i].p_idx++;
757
758 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
759 EXT_MAX_INDEX(path[i].p_hdr));
760 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
761 EXT_LAST_INDEX(path[i].p_hdr));
762 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
763 ext_debug("%d: move %d:%d in new index %llu\n", i,
764 le32_to_cpu(path[i].p_idx->ei_block),
765 idx_pblock(path[i].p_idx),
766 newblock);
767 /*memmove(++fidx, path[i].p_idx++,
768 sizeof(struct ext4_extent_idx));
769 neh->eh_entries++;
770 BUG_ON(neh->eh_entries > neh->eh_max);*/
771 path[i].p_idx++;
772 m++;
773 }
774 if (m) {
775 memmove(++fidx, path[i].p_idx - m,
776 sizeof(struct ext4_extent_idx) * m);
777 neh->eh_entries =
778 cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
779 }
780 set_buffer_uptodate(bh);
781 unlock_buffer(bh);
782
783 if ((err = ext4_journal_dirty_metadata(handle, bh)))
784 goto cleanup;
785 brelse(bh);
786 bh = NULL;
787
788 /* correct old index */
789 if (m) {
790 err = ext4_ext_get_access(handle, inode, path + i);
791 if (err)
792 goto cleanup;
793 path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
794 err = ext4_ext_dirty(handle, inode, path + i);
795 if (err)
796 goto cleanup;
797 }
798
799 i--;
800 }
801
802 /* insert new index */
803 if (err)
804 goto cleanup;
805
806 err = ext4_ext_insert_index(handle, inode, path + at,
807 le32_to_cpu(border), newblock);
808
809cleanup:
810 if (bh) {
811 if (buffer_locked(bh))
812 unlock_buffer(bh);
813 brelse(bh);
814 }
815
816 if (err) {
817 /* free all allocated blocks in error case */
818 for (i = 0; i < depth; i++) {
819 if (!ablocks[i])
820 continue;
821 ext4_free_blocks(handle, inode, ablocks[i], 1);
822 }
823 }
824 kfree(ablocks);
825
826 return err;
827}
828
829/*
830 * ext4_ext_grow_indepth:
831 * implements tree growing procedure:
832 * - allocates new block
833 * - moves top-level data (index block or leaf) into the new block
834 * - initializes new top-level, creating index that points to the
835 * just created block
836 */
837static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
838 struct ext4_ext_path *path,
839 struct ext4_extent *newext)
840{
841 struct ext4_ext_path *curp = path;
842 struct ext4_extent_header *neh;
843 struct ext4_extent_idx *fidx;
844 struct buffer_head *bh;
845 ext4_fsblk_t newblock;
846 int err = 0;
847
848 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
849 if (newblock == 0)
850 return err;
851
852 bh = sb_getblk(inode->i_sb, newblock);
853 if (!bh) {
854 err = -EIO;
855 ext4_std_error(inode->i_sb, err);
856 return err;
857 }
858 lock_buffer(bh);
859
860 if ((err = ext4_journal_get_create_access(handle, bh))) {
861 unlock_buffer(bh);
862 goto out;
863 }
864
865 /* move top-level index/leaf into new block */
866 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
867
868 /* set size of new block */
869 neh = ext_block_hdr(bh);
870 /* old root could have indexes or leaves,
871 * so calculate eh_max the right way */
872 if (ext_depth(inode))
873 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
874 else
875 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
876 neh->eh_magic = EXT4_EXT_MAGIC;
877 set_buffer_uptodate(bh);
878 unlock_buffer(bh);
879
880 if ((err = ext4_journal_dirty_metadata(handle, bh)))
881 goto out;
882
883 /* create index in new top-level index: num,max,pointer */
884 if ((err = ext4_ext_get_access(handle, inode, curp)))
885 goto out;
886
887 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
888 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
889 curp->p_hdr->eh_entries = cpu_to_le16(1);
890 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
891 /* FIXME: it works, but actually path[0] can be index */
892 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
893 ext4_idx_store_pblock(curp->p_idx, newblock);
894
895 neh = ext_inode_hdr(inode);
896 fidx = EXT_FIRST_INDEX(neh);
897 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
898 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
899 le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
900
901 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
902 err = ext4_ext_dirty(handle, inode, curp);
903out:
904 brelse(bh);
905
906 return err;
907}
908
909/*
910 * ext4_ext_create_new_leaf:
911 * finds empty index and adds new leaf.
912 * if no free index is found, then it requests growing the tree in depth.
913 */
914static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
915 struct ext4_ext_path *path,
916 struct ext4_extent *newext)
917{
918 struct ext4_ext_path *curp;
919 int depth, i, err = 0;
920
921repeat:
922 i = depth = ext_depth(inode);
923
924 /* walk up to the tree and look for free index entry */
925 curp = path + depth;
926 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
927 i--;
928 curp--;
929 }
930
931 /* we use already allocated block for index block,
932 * so subsequent data blocks should be contiguous */
933 if (EXT_HAS_FREE_INDEX(curp)) {
934 /* if we found index with free entry, then use that
935 * entry: create all needed subtree and add new leaf */
936 err = ext4_ext_split(handle, inode, path, newext, i);
937
938 /* refill path */
939 ext4_ext_drop_refs(path);
940 path = ext4_ext_find_extent(inode,
941 le32_to_cpu(newext->ee_block),
942 path);
943 if (IS_ERR(path))
944 err = PTR_ERR(path);
945 } else {
946 /* tree is full, time to grow in depth */
947 err = ext4_ext_grow_indepth(handle, inode, path, newext);
948 if (err)
949 goto out;
950
951 /* refill path */
952 ext4_ext_drop_refs(path);
953 path = ext4_ext_find_extent(inode,
954 le32_to_cpu(newext->ee_block),
955 path);
956 if (IS_ERR(path)) {
957 err = PTR_ERR(path);
958 goto out;
959 }
960
961 /*
962 * only the first grow (depth 0 -> 1) is guaranteed to produce free space;
963 * in all other cases we have to split the grown tree
964 */
965 depth = ext_depth(inode);
966 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
967 /* now we need to split */
968 goto repeat;
969 }
970 }
971
972out:
973 return err;
974}
975
976/*
977 * ext4_ext_next_allocated_block:
978 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
979 * NOTE: it considers block number from index entry as
980 * allocated block. Thus, index entries have to be consistent
981 * with leaves.
982 */
983static unsigned long
984ext4_ext_next_allocated_block(struct ext4_ext_path *path)
985{
986 int depth;
987
988 BUG_ON(path == NULL);
989 depth = path->p_depth;
990
991 if (depth == 0 && path->p_ext == NULL)
992 return EXT_MAX_BLOCK;
993
994 while (depth >= 0) {
995 if (depth == path->p_depth) {
996 /* leaf */
997 if (path[depth].p_ext !=
998 EXT_LAST_EXTENT(path[depth].p_hdr))
999 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1000 } else {
1001 /* index */
1002 if (path[depth].p_idx !=
1003 EXT_LAST_INDEX(path[depth].p_hdr))
1004 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1005 }
1006 depth--;
1007 }
1008
1009 return EXT_MAX_BLOCK;
1010}
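
/*
 * A worked illustration (hypothetical layout) of the walk above: in a
 * depth-1 tree, when path[1].p_ext points at the last extent of its
 * leaf, the leaf level yields nothing and the loop drops to the index
 * level; if path[0].p_idx is not the last index entry, the starting
 * block of the following index entry is returned, since index entries
 * are consistent with the leaves below them.  Only when every level is
 * positioned at its last entry does the function fall through to
 * EXT_MAX_BLOCK.
 */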
1011
1012/*
1013 * ext4_ext_next_leaf_block:
1014 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1015 */
1016static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1017 struct ext4_ext_path *path)
1018{
1019 int depth;
1020
1021 BUG_ON(path == NULL);
1022 depth = path->p_depth;
1023
1024 /* a zero-depth tree has no leaf blocks at all */
1025 if (depth == 0)
1026 return EXT_MAX_BLOCK;
1027
1028 /* go to index block */
1029 depth--;
1030
1031 while (depth >= 0) {
1032 if (path[depth].p_idx !=
1033 EXT_LAST_INDEX(path[depth].p_hdr))
1034 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1035 depth--;
1036 }
1037
1038 return EXT_MAX_BLOCK;
1039}
1040
1041/*
1042 * ext4_ext_correct_indexes:
1043 * if leaf gets modified and modified extent is first in the leaf,
1044 * then we have to correct all indexes above.
1045 * TODO: do we need to correct tree in all cases?
1046 */
1047int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1048 struct ext4_ext_path *path)
1049{
1050 struct ext4_extent_header *eh;
1051 int depth = ext_depth(inode);
1052 struct ext4_extent *ex;
1053 __le32 border;
1054 int k, err = 0;
1055
1056 eh = path[depth].p_hdr;
1057 ex = path[depth].p_ext;
1058 BUG_ON(ex == NULL);
1059 BUG_ON(eh == NULL);
1060
1061 if (depth == 0) {
1062 /* there is no tree at all */
1063 return 0;
1064 }
1065
1066 if (ex != EXT_FIRST_EXTENT(eh)) {
1067 /* we correct tree if first leaf got modified only */
1068 return 0;
1069 }
1070
1071 /*
1072 * TODO: we need correction if border is smaller than current one
1073 */
1074 k = depth - 1;
1075 border = path[depth].p_ext->ee_block;
1076 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1077 return err;
1078 path[k].p_idx->ei_block = border;
1079 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1080 return err;
1081
1082 while (k--) {
1083 /* change all left-side indexes */
1084 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1085 break;
1086 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1087 break;
1088 path[k].p_idx->ei_block = border;
1089 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1090 break;
1091 }
1092
1093 return err;
1094}
1095
1096static int inline
1097ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1098 struct ext4_extent *ex2)
1099{
1100 if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
1101 le32_to_cpu(ex2->ee_block))
1102 return 0;
1103
1104 /*
1105 * To allow future support for preallocated extents to be added
1106 * as an RO_COMPAT feature, refuse to merge two extents if
1107 * this can result in the top bit of ee_len being set.
1108 */
1109 if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
1110 return 0;
1111#ifdef AGRESSIVE_TEST
1112 if (le16_to_cpu(ex1->ee_len) >= 4)
1113 return 0;
1114#endif
1115
1116 if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
1117 return 1;
1118 return 0;
1119}
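
/*
 * A worked example of the merge test above (hypothetical numbers):
 * ex1 = {ee_block 100, ee_len 8} mapped at physical block 500 and
 * ex2 = {ee_block 108, ee_len 4} mapped at physical block 508 satisfy
 *   100 + 8 == 108           (logically contiguous)
 *   8 + 4 <= EXT_MAX_LEN     (merged length keeps the top bit clear)
 *   500 + 8 == 508           (physically contiguous)
 * and may be merged into a single 100:500:12 extent.
 */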
1120
1121/*
1122 * ext4_ext_insert_extent:
1123 * tries to merge the requested extent into an existing extent or
1124 * inserts the requested extent as a new one into the tree,
1125 * creating new leaf in the no-space case.
1126 */
1127int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1128 struct ext4_ext_path *path,
1129 struct ext4_extent *newext)
1130{
1131 struct ext4_extent_header * eh;
1132 struct ext4_extent *ex, *fex;
1133 struct ext4_extent *nearex; /* nearest extent */
1134 struct ext4_ext_path *npath = NULL;
1135 int depth, len, err, next;
1136
1137 BUG_ON(newext->ee_len == 0);
1138 depth = ext_depth(inode);
1139 ex = path[depth].p_ext;
1140 BUG_ON(path[depth].p_hdr == NULL);
1141
1142 /* try to insert block into found extent and return */
1143 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1144 ext_debug("append %d block to %d:%d (from %llu)\n",
1145 le16_to_cpu(newext->ee_len),
1146 le32_to_cpu(ex->ee_block),
1147 le16_to_cpu(ex->ee_len), ext_pblock(ex));
1148 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1149 return err;
1150 ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
1151 + le16_to_cpu(newext->ee_len));
1152 eh = path[depth].p_hdr;
1153 nearex = ex;
1154 goto merge;
1155 }
1156
1157repeat:
1158 depth = ext_depth(inode);
1159 eh = path[depth].p_hdr;
1160 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1161 goto has_space;
1162
1163 /* probably next leaf has space for us? */
1164 fex = EXT_LAST_EXTENT(eh);
1165 next = ext4_ext_next_leaf_block(inode, path);
1166 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1167 && next != EXT_MAX_BLOCK) {
1168 ext_debug("next leaf block - %d\n", next);
1169 BUG_ON(npath != NULL);
1170 npath = ext4_ext_find_extent(inode, next, NULL);
1171 if (IS_ERR(npath))
1172 return PTR_ERR(npath);
1173 BUG_ON(npath->p_depth != path->p_depth);
1174 eh = npath[depth].p_hdr;
1175 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1176 ext_debug("next leaf isnt full(%d)\n",
1177 le16_to_cpu(eh->eh_entries));
1178 path = npath;
1179 goto repeat;
1180 }
1181 ext_debug("next leaf has no free space(%d,%d)\n",
1182 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1183 }
1184
1185 /*
1186 * There is no free space in the found leaf.
1187 * We're gonna add a new leaf in the tree.
1188 */
1189 err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1190 if (err)
1191 goto cleanup;
1192 depth = ext_depth(inode);
1193 eh = path[depth].p_hdr;
1194
1195has_space:
1196 nearex = path[depth].p_ext;
1197
1198 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1199 goto cleanup;
1200
1201 if (!nearex) {
1202 /* there is no extent in this leaf, create first one */
1203 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1204 le32_to_cpu(newext->ee_block),
1205 ext_pblock(newext),
1206 le16_to_cpu(newext->ee_len));
1207 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1208 } else if (le32_to_cpu(newext->ee_block)
1209 > le32_to_cpu(nearex->ee_block)) {
1210/* BUG_ON(newext->ee_block == nearex->ee_block); */
1211 if (nearex != EXT_LAST_EXTENT(eh)) {
1212 len = EXT_MAX_EXTENT(eh) - nearex;
1213 len = (len - 1) * sizeof(struct ext4_extent);
1214 len = len < 0 ? 0 : len;
1215 ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1216 "move %d from 0x%p to 0x%p\n",
1217 le32_to_cpu(newext->ee_block),
1218 ext_pblock(newext),
1219 le16_to_cpu(newext->ee_len),
1220 nearex, len, nearex + 1, nearex + 2);
1221 memmove(nearex + 2, nearex + 1, len);
1222 }
1223 path[depth].p_ext = nearex + 1;
1224 } else {
1225 BUG_ON(newext->ee_block == nearex->ee_block);
1226 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1227 len = len < 0 ? 0 : len;
1228 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1229 "move %d from 0x%p to 0x%p\n",
1230 le32_to_cpu(newext->ee_block),
1231 ext_pblock(newext),
1232 le16_to_cpu(newext->ee_len),
1233 nearex, len, nearex + 1, nearex + 2);
1234 memmove(nearex + 1, nearex, len);
1235 path[depth].p_ext = nearex;
1236 }
1237
1238 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
1239 nearex = path[depth].p_ext;
1240 nearex->ee_block = newext->ee_block;
1241 nearex->ee_start = newext->ee_start;
1242 nearex->ee_start_hi = newext->ee_start_hi;
1243 nearex->ee_len = newext->ee_len;
1244
1245merge:
1246 /* try to merge extents to the right */
1247 while (nearex < EXT_LAST_EXTENT(eh)) {
1248 if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
1249 break;
1250 /* merge with next extent! */
1251 nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
1252 + le16_to_cpu(nearex[1].ee_len));
1253 if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1254 len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1255 * sizeof(struct ext4_extent);
1256 memmove(nearex + 1, nearex + 2, len);
1257 }
1258 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1259 BUG_ON(eh->eh_entries == 0);
1260 }
1261
1262 /* try to merge extents to the left */
1263
1264 /* time to correct all indexes above */
1265 err = ext4_ext_correct_indexes(handle, inode, path);
1266 if (err)
1267 goto cleanup;
1268
1269 err = ext4_ext_dirty(handle, inode, path + depth);
1270
1271cleanup:
1272 if (npath) {
1273 ext4_ext_drop_refs(npath);
1274 kfree(npath);
1275 }
1276 ext4_ext_tree_changed(inode);
1277 ext4_ext_invalidate_cache(inode);
1278 return err;
1279}
1280
1281int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1282 unsigned long num, ext_prepare_callback func,
1283 void *cbdata)
1284{
1285 struct ext4_ext_path *path = NULL;
1286 struct ext4_ext_cache cbex;
1287 struct ext4_extent *ex;
1288 unsigned long next, start = 0, end = 0;
1289 unsigned long last = block + num;
1290 int depth, exists, err = 0;
1291
1292 BUG_ON(func == NULL);
1293 BUG_ON(inode == NULL);
1294
1295 while (block < last && block != EXT_MAX_BLOCK) {
1296 num = last - block;
1297 /* find extent for this block */
1298 path = ext4_ext_find_extent(inode, block, path);
1299 if (IS_ERR(path)) {
1300 err = PTR_ERR(path);
1301 path = NULL;
1302 break;
1303 }
1304
1305 depth = ext_depth(inode);
1306 BUG_ON(path[depth].p_hdr == NULL);
1307 ex = path[depth].p_ext;
1308 next = ext4_ext_next_allocated_block(path);
1309
1310 exists = 0;
1311 if (!ex) {
1312 /* there is no extent yet, so try to allocate
1313 * all requested space */
1314 start = block;
1315 end = block + num;
1316 } else if (le32_to_cpu(ex->ee_block) > block) {
1317 /* need to allocate space before found extent */
1318 start = block;
1319 end = le32_to_cpu(ex->ee_block);
1320 if (block + num < end)
1321 end = block + num;
1322 } else if (block >=
1323 le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
1324 /* need to allocate space after found extent */
1325 start = block;
1326 end = block + num;
1327 if (end >= next)
1328 end = next;
1329 } else if (block >= le32_to_cpu(ex->ee_block)) {
1330 /*
1331 * some part of requested space is covered
1332 * by found extent
1333 */
1334 start = block;
1335 end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
1336 if (block + num < end)
1337 end = block + num;
1338 exists = 1;
1339 } else {
1340 BUG();
1341 }
1342 BUG_ON(end <= start);
1343
1344 if (!exists) {
1345 cbex.ec_block = start;
1346 cbex.ec_len = end - start;
1347 cbex.ec_start = 0;
1348 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1349 } else {
1350 cbex.ec_block = le32_to_cpu(ex->ee_block);
1351 cbex.ec_len = le16_to_cpu(ex->ee_len);
1352 cbex.ec_start = ext_pblock(ex);
1353 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1354 }
1355
1356 BUG_ON(cbex.ec_len == 0);
1357 err = func(inode, path, &cbex, cbdata);
1358 ext4_ext_drop_refs(path);
1359
1360 if (err < 0)
1361 break;
1362 if (err == EXT_REPEAT)
1363 continue;
1364 else if (err == EXT_BREAK) {
1365 err = 0;
1366 break;
1367 }
1368
1369 if (ext_depth(inode) != depth) {
1370 /* depth was changed. we have to realloc path */
1371 kfree(path);
1372 path = NULL;
1373 }
1374
1375 block = cbex.ec_block + cbex.ec_len;
1376 }
1377
1378 if (path) {
1379 ext4_ext_drop_refs(path);
1380 kfree(path);
1381 }
1382
1383 return err;
1384}
1385
1386static inline void
1387ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1388 __u32 len, __u32 start, int type)
1389{
1390 struct ext4_ext_cache *cex;
1391 BUG_ON(len == 0);
1392 cex = &EXT4_I(inode)->i_cached_extent;
1393 cex->ec_type = type;
1394 cex->ec_block = block;
1395 cex->ec_len = len;
1396 cex->ec_start = start;
1397}
1398
1399/*
1400 * ext4_ext_put_gap_in_cache:
1401 * calculate boundaries of the gap that the requested block fits into
1402 * and cache this gap
1403 */
1404static inline void
1405ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1406 unsigned long block)
1407{
1408 int depth = ext_depth(inode);
1409 unsigned long lblock, len;
1410 struct ext4_extent *ex;
1411
1412 ex = path[depth].p_ext;
1413 if (ex == NULL) {
1414 /* there is no extent yet, so gap is [0;-] */
1415 lblock = 0;
1416 len = EXT_MAX_BLOCK;
1417 ext_debug("cache gap(whole file):");
1418 } else if (block < le32_to_cpu(ex->ee_block)) {
1419 lblock = block;
1420 len = le32_to_cpu(ex->ee_block) - block;
1421 ext_debug("cache gap(before): %lu [%lu:%lu]",
1422 (unsigned long) block,
1423 (unsigned long) le32_to_cpu(ex->ee_block),
1424 (unsigned long) le16_to_cpu(ex->ee_len));
1425 } else if (block >= le32_to_cpu(ex->ee_block)
1426 + le16_to_cpu(ex->ee_len)) {
1427 lblock = le32_to_cpu(ex->ee_block)
1428 + le16_to_cpu(ex->ee_len);
1429 len = ext4_ext_next_allocated_block(path);
1430 ext_debug("cache gap(after): [%lu:%lu] %lu",
1431 (unsigned long) le32_to_cpu(ex->ee_block),
1432 (unsigned long) le16_to_cpu(ex->ee_len),
1433 (unsigned long) block);
1434 BUG_ON(len == lblock);
1435 len = len - lblock;
1436 } else {
1437 lblock = len = 0;
1438 BUG();
1439 }
1440
1441 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
1442 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1443}
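
/*
 * A worked example (hypothetical numbers) of the gap computation above:
 * if the nearest extent covers logical blocks 100..107 and
 * ext4_ext_next_allocated_block() reports 200, a lookup of block 150
 * takes the "cache gap(after)" branch:
 *   lblock = 100 + 8 = 108
 *   len    = 200 - 108 = 92
 * so the gap [108..199] is cached and later lookups in this range are
 * answered without another tree walk.
 */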
1444
1445static inline int
1446ext4_ext_in_cache(struct inode *inode, unsigned long block,
1447 struct ext4_extent *ex)
1448{
1449 struct ext4_ext_cache *cex;
1450
1451 cex = &EXT4_I(inode)->i_cached_extent;
1452
1453 /* has cache valid data? */
1454 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1455 return EXT4_EXT_CACHE_NO;
1456
1457 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1458 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1459 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1460 ex->ee_block = cpu_to_le32(cex->ec_block);
1461 ext4_ext_store_pblock(ex, cex->ec_start);
1462 ex->ee_len = cpu_to_le16(cex->ec_len);
1463 ext_debug("%lu cached by %lu:%lu:%llu\n",
1464 (unsigned long) block,
1465 (unsigned long) cex->ec_block,
1466 (unsigned long) cex->ec_len,
1467 cex->ec_start);
1468 return cex->ec_type;
1469 }
1470
1471 /* not in cache */
1472 return EXT4_EXT_CACHE_NO;
1473}
1474
1475/*
1476 * ext4_ext_rm_idx:
1477 * removes index from the index block.
1478 * It's used in truncate case only, thus all requests are for
1479 * last index in the block only.
1480 */
1481int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1482 struct ext4_ext_path *path)
1483{
1484 struct buffer_head *bh;
1485 int err;
1486 ext4_fsblk_t leaf;
1487
1488 /* free index block */
1489 path--;
1490 leaf = idx_pblock(path->p_idx);
1491 BUG_ON(path->p_hdr->eh_entries == 0);
1492 if ((err = ext4_ext_get_access(handle, inode, path)))
1493 return err;
1494 path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
1495 if ((err = ext4_ext_dirty(handle, inode, path)))
1496 return err;
1497 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1498 bh = sb_find_get_block(inode->i_sb, leaf);
1499 ext4_forget(handle, 1, inode, bh, leaf);
1500 ext4_free_blocks(handle, inode, leaf, 1);
1501 return err;
1502}
1503
1504/*
1505 * ext4_ext_calc_credits_for_insert:
1506 * This routine returns the max. credits that the extent tree can consume.
1507 * It should be OK for low-performance paths like ->writepage().
1508 * To allow many writing processes to fit into a single transaction,
1509 * the caller should calculate credits under truncate_mutex and
1510 * pass the actual path.
1511 */
1512int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
1513 struct ext4_ext_path *path)
1514{
1515 int depth, needed;
1516
1517 if (path) {
1518 /* probably there is space in leaf? */
1519 depth = ext_depth(inode);
1520 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1521 < le16_to_cpu(path[depth].p_hdr->eh_max))
1522 return 1;
1523 }
1524
1525 /*
1526 * given 32-bit logical block (4294967296 blocks), max. tree
1527 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1528 * Let's also add one more level for imbalance.
1529 */
1530 depth = 5;
1531
1532 /* allocation of new data block(s) */
1533 needed = 2;
1534
1535 /*
1536 * tree can be full, so it would need to grow in depth:
1537 * allocation + old root + new root
1538 */
1539 needed += 2 + 1 + 1;
1540
1541 /*
1542 * Index split can happen, we would need:
1543 * allocate intermediate indexes (bitmap + group)
1544 * + change two blocks at each level, but root (already included)
1545 */
1546 needed += (depth * 2) + (depth * 2);
1547
1548 /* any allocation modifies superblock */
1549 needed += 1;
1550
1551 return needed;
1552}
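
/*
 * Sketch of the capacity arithmetic behind depth = 5 above, assuming
 * 4KB blocks: an extent block holds a 12-byte header plus 12-byte
 * entries, so ext4_ext_space_block() gives (4096 - 12) / 12 = 340
 * entries per block.  Four levels address roughly 340^4 ~= 1.3e10
 * entries, already beyond the 2^32 possible logical blocks, and one
 * extra level is budgeted for tree imbalance.
 */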
1553
1554static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1555 struct ext4_extent *ex,
1556 unsigned long from, unsigned long to)
1557{
1558 struct buffer_head *bh;
1559 int i;
1560
1561#ifdef EXTENTS_STATS
1562 {
1563 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1564 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1565 spin_lock(&sbi->s_ext_stats_lock);
1566 sbi->s_ext_blocks += ee_len;
1567 sbi->s_ext_extents++;
1568 if (ee_len < sbi->s_ext_min)
1569 sbi->s_ext_min = ee_len;
1570 if (ee_len > sbi->s_ext_max)
1571 sbi->s_ext_max = ee_len;
1572 if (ext_depth(inode) > sbi->s_depth_max)
1573 sbi->s_depth_max = ext_depth(inode);
1574 spin_unlock(&sbi->s_ext_stats_lock);
1575 }
1576#endif
1577 if (from >= le32_to_cpu(ex->ee_block)
1578 && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1579 /* tail removal */
1580 unsigned long num;
1581 ext4_fsblk_t start;
1582 num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
1583 start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
1584 ext_debug("free last %lu blocks starting %llu\n", num, start);
1585 for (i = 0; i < num; i++) {
1586 bh = sb_find_get_block(inode->i_sb, start + i);
1587 ext4_forget(handle, 0, inode, bh, start + i);
1588 }
1589 ext4_free_blocks(handle, inode, start, num);
1590 } else if (from == le32_to_cpu(ex->ee_block)
1591 && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1592 printk("strange request: removal %lu-%lu from %u:%u\n",
1593 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1594 } else {
1595 printk("strange request: removal(2) %lu-%lu from %u:%u\n",
1596 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1597 }
1598 return 0;
1599}
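
/*
 * A worked example of the tail-removal arithmetic above (hypothetical
 * numbers): for an extent {ee_block 100, ee_len 8} mapped at physical
 * block 500, removing from = 104 to = 107 gives
 *   num   = 100 + 8 - 104 = 4    (blocks to free)
 *   start = 500 + 8 - 4   = 504  (first physical block freed)
 * so physical blocks 504..507 are forgotten and freed while the first
 * four blocks of the extent stay allocated.
 */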
1600
1601static int
1602ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1603 struct ext4_ext_path *path, unsigned long start)
1604{
1605 int err = 0, correct_index = 0;
1606 int depth = ext_depth(inode), credits;
1607 struct ext4_extent_header *eh;
1608 unsigned a, b, block, num;
1609 unsigned long ex_ee_block;
1610 unsigned short ex_ee_len;
1611 struct ext4_extent *ex;
1612
1613 ext_debug("truncate since %lu in leaf\n", start);
1614 if (!path[depth].p_hdr)
1615 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1616 eh = path[depth].p_hdr;
1617 BUG_ON(eh == NULL);
1618 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
1619 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
1620
1621 /* find where to start removing */
1622 ex = EXT_LAST_EXTENT(eh);
1623
1624 ex_ee_block = le32_to_cpu(ex->ee_block);
1625 ex_ee_len = le16_to_cpu(ex->ee_len);
1626
1627 while (ex >= EXT_FIRST_EXTENT(eh) &&
1628 ex_ee_block + ex_ee_len > start) {
1629 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
1630 path[depth].p_ext = ex;
1631
1632 a = ex_ee_block > start ? ex_ee_block : start;
1633 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1634 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1635
1636 ext_debug(" border %u:%u\n", a, b);
1637
1638 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1639 block = 0;
1640 num = 0;
1641 BUG();
1642 } else if (a != ex_ee_block) {
1643 /* remove tail of the extent */
1644 block = ex_ee_block;
1645 num = a - block;
1646 } else if (b != ex_ee_block + ex_ee_len - 1) {
1647 /* remove head of the extent */
1648 block = a;
1649 num = b - a;
1650 /* there is no "make a hole" API yet */
1651 BUG();
1652 } else {
1653 /* remove whole extent: excellent! */
1654 block = ex_ee_block;
1655 num = 0;
1656 BUG_ON(a != ex_ee_block);
1657 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1658 }
1659
1660 /* at present, extent can't cross block group: */
1661 /* leaf + bitmap + group desc + sb + inode */
1662 credits = 5;
1663 if (ex == EXT_FIRST_EXTENT(eh)) {
1664 correct_index = 1;
1665 credits += (ext_depth(inode)) + 1;
1666 }
1667#ifdef CONFIG_QUOTA
1668 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1669#endif
1670
1671 handle = ext4_ext_journal_restart(handle, credits);
1672 if (IS_ERR(handle)) {
1673 err = PTR_ERR(handle);
1674 goto out;
1675 }
1676
1677 err = ext4_ext_get_access(handle, inode, path + depth);
1678 if (err)
1679 goto out;
1680
1681 err = ext4_remove_blocks(handle, inode, ex, a, b);
1682 if (err)
1683 goto out;
1684
1685 if (num == 0) {
1686 /* this extent is removed; mark slot entirely unused */
1687 ext4_ext_store_pblock(ex, 0);
1688 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1689 }
1690
1691 ex->ee_block = cpu_to_le32(block);
1692 ex->ee_len = cpu_to_le16(num);
1693
1694 err = ext4_ext_dirty(handle, inode, path + depth);
1695 if (err)
1696 goto out;
1697
1698 ext_debug("new extent: %u:%u:%llu\n", block, num,
1699 ext_pblock(ex));
1700 ex--;
1701 ex_ee_block = le32_to_cpu(ex->ee_block);
1702 ex_ee_len = le16_to_cpu(ex->ee_len);
1703 }
1704
1705 if (correct_index && eh->eh_entries)
1706 err = ext4_ext_correct_indexes(handle, inode, path);
1707
1708 /* if this leaf is free, then we should
1709 * remove it from index block above */
1710 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1711 err = ext4_ext_rm_idx(handle, inode, path + depth);
1712
1713out:
1714 return err;
1715}
1716
1717/*
1718 * ext4_ext_more_to_rm:
1719 * returns 1 if current index has to be freed (even partial)
1720 */
1721static int inline
1722ext4_ext_more_to_rm(struct ext4_ext_path *path)
1723{
1724 BUG_ON(path->p_idx == NULL);
1725
1726 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1727 return 0;
1728
1729 /*
1730 * if truncate on deeper level happened, it wasn't partial,
1731 * so we have to consider current index for truncation
1732 */
1733 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1734 return 0;
1735 return 1;
1736}
1737
1738int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1739{
1740 struct super_block *sb = inode->i_sb;
1741 int depth = ext_depth(inode);
1742 struct ext4_ext_path *path;
1743 handle_t *handle;
1744 int i = 0, err = 0;
1745
1746 ext_debug("truncate since %lu\n", start);
1747
1748 /* probably first extent we're gonna free will be last in block */
1749 handle = ext4_journal_start(inode, depth + 1);
1750 if (IS_ERR(handle))
1751 return PTR_ERR(handle);
1752
1753 ext4_ext_invalidate_cache(inode);
1754
1755 /*
1756 * We start scanning from right side, freeing all the blocks
1757 * after i_size and walking into the tree depth-wise.
1758 */
1759 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
1760 if (path == NULL) {
1761 ext4_journal_stop(handle);
1762 return -ENOMEM;
1763 }
1764 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
1765 path[0].p_hdr = ext_inode_hdr(inode);
1766 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
1767 err = -EIO;
1768 goto out;
1769 }
1770 path[0].p_depth = depth;
1771
1772 while (i >= 0 && err == 0) {
1773 if (i == depth) {
1774 /* this is leaf block */
1775 err = ext4_ext_rm_leaf(handle, inode, path, start);
1776 /* root level has p_bh == NULL, brelse() eats this */
1777 brelse(path[i].p_bh);
1778 path[i].p_bh = NULL;
1779 i--;
1780 continue;
1781 }
1782
1783 /* this is index block */
1784 if (!path[i].p_hdr) {
1785 ext_debug("initialize header\n");
1786 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
1787 if (ext4_ext_check_header(__FUNCTION__, inode,
1788 path[i].p_hdr)) {
1789 err = -EIO;
1790 goto out;
1791 }
1792 }
1793
1794 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
1795 > le16_to_cpu(path[i].p_hdr->eh_max));
1796 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
1797
1798 if (!path[i].p_idx) {
1799 /* this level hasn't been touched yet */
1800 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
1801 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
1802 ext_debug("init index ptr: hdr 0x%p, num %d\n",
1803 path[i].p_hdr,
1804 le16_to_cpu(path[i].p_hdr->eh_entries));
1805 } else {
1806 /* we were already here, see at next index */
1807 path[i].p_idx--;
1808 }
1809
1810 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
1811 i, EXT_FIRST_INDEX(path[i].p_hdr),
1812 path[i].p_idx);
1813 if (ext4_ext_more_to_rm(path + i)) {
1814 /* go to the next level */
1815 ext_debug("move to level %d (block %llu)\n",
1816 i + 1, idx_pblock(path[i].p_idx));
1817 memset(path + i + 1, 0, sizeof(*path));
1818 path[i+1].p_bh =
1819 sb_bread(sb, idx_pblock(path[i].p_idx));
1820 if (!path[i+1].p_bh) {
1821 /* should we reset i_size? */
1822 err = -EIO;
1823 break;
1824 }
1825
1826 /* save actual number of indexes since this
1827 * number is changed at the next iteration */
1828 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
1829 i++;
1830 } else {
1831 /* we finished processing this index, go up */
1832 if (path[i].p_hdr->eh_entries == 0 && i > 0) {
1833 /* index is empty, remove it;
1834 * the handle has already been prepared by
1835 * ext4_ext_rm_leaf() */
1836 err = ext4_ext_rm_idx(handle, inode, path + i);
1837 }
1838 /* root level has p_bh == NULL, brelse() eats this */
1839 brelse(path[i].p_bh);
1840 path[i].p_bh = NULL;
1841 i--;
1842 ext_debug("return to level %d\n", i);
1843 }
1844 }
1845
1846 /* TODO: flexible tree reduction should be here */
1847 if (path->p_hdr->eh_entries == 0) {
1848 /*
1849 * truncate to zero freed all the tree,
1850 * so we need to correct eh_depth
1851 */
1852 err = ext4_ext_get_access(handle, inode, path);
1853 if (err == 0) {
1854 ext_inode_hdr(inode)->eh_depth = 0;
1855 ext_inode_hdr(inode)->eh_max =
1856 cpu_to_le16(ext4_ext_space_root(inode));
1857 err = ext4_ext_dirty(handle, inode, path);
1858 }
1859 }
1860out:
1861 ext4_ext_tree_changed(inode);
1862 ext4_ext_drop_refs(path);
1863 kfree(path);
1864 ext4_journal_stop(handle);
1865
1866 return err;
1867}
1868
1869/*
1870 * called at mount time
1871 */
1872void ext4_ext_init(struct super_block *sb)
1873{
1874 /*
1875 * possible initialization would be here
1876 */
1877
1878 if (test_opt(sb, EXTENTS)) {
1879 printk("EXT4-fs: file extents enabled");
1880#ifdef AGRESSIVE_TEST
1881 printk(", agressive tests");
1882#endif
1883#ifdef CHECK_BINSEARCH
1884 printk(", check binsearch");
1885#endif
1886#ifdef EXTENTS_STATS
1887 printk(", stats");
1888#endif
1889 printk("\n");
1890#ifdef EXTENTS_STATS
1891 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
1892 EXT4_SB(sb)->s_ext_min = 1 << 30;
1893 EXT4_SB(sb)->s_ext_max = 0;
1894#endif
1895 }
1896}
1897
1898/*
1899 * called at umount time
1900 */
1901void ext4_ext_release(struct super_block *sb)
1902{
1903 if (!test_opt(sb, EXTENTS))
1904 return;
1905
1906#ifdef EXTENTS_STATS
1907 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
1908 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
1910 sbi->s_ext_blocks, sbi->s_ext_extents,
1911 sbi->s_ext_blocks / sbi->s_ext_extents);
1912 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
1913 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
1914 }
1915#endif
1916}
1917
1918int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1919 ext4_fsblk_t iblock,
1920 unsigned long max_blocks, struct buffer_head *bh_result,
1921 int create, int extend_disksize)
1922{
1923 struct ext4_ext_path *path = NULL;
1924 struct ext4_extent newex, *ex;
1925 ext4_fsblk_t goal, newblock;
1926 int err = 0, depth;
1927 unsigned long allocated = 0;
1928
1929 __clear_bit(BH_New, &bh_result->b_state);
1930 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
1931 max_blocks, (unsigned) inode->i_ino);
1932 mutex_lock(&EXT4_I(inode)->truncate_mutex);
1933
1934 /* check in cache */
1935 if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
1936 if (goal == EXT4_EXT_CACHE_GAP) {
1937 if (!create) {
1938 /* block isn't allocated yet and
1939 * user doesn't want to allocate it */
1940 goto out2;
1941 }
1942 /* we should allocate requested block */
1943 } else if (goal == EXT4_EXT_CACHE_EXTENT) {
1944 /* block is already allocated */
1945 newblock = iblock
1946 - le32_to_cpu(newex.ee_block)
1947 + ext_pblock(&newex);
1948 /* number of remaining blocks in the extent */
1949 allocated = le16_to_cpu(newex.ee_len) -
1950 (iblock - le32_to_cpu(newex.ee_block));
1951 goto out;
1952 } else {
1953 BUG();
1954 }
1955 }
1956
1957 /* find extent for this block */
1958 path = ext4_ext_find_extent(inode, iblock, NULL);
1959 if (IS_ERR(path)) {
1960 err = PTR_ERR(path);
1961 path = NULL;
1962 goto out2;
1963 }
1964
1965 depth = ext_depth(inode);
1966
1967 /*
1968 * consistent leaf must not be empty;
1969 * this situation is possible, though, _during_ tree modification;
1970 * this is why assert can't be put in ext4_ext_find_extent()
1971 */
1972 BUG_ON(path[depth].p_ext == NULL && depth != 0);
1973
1974 if ((ex = path[depth].p_ext)) {
1975 unsigned long ee_block = le32_to_cpu(ex->ee_block);
1976 ext4_fsblk_t ee_start = ext_pblock(ex);
1977 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1978
1979 /*
1980 * Allow future support for preallocated extents to be added
1981 * as an RO_COMPAT feature:
1982 * Uninitialized extents are treated as holes, except that
1983 * we avoid (fail) allocating new blocks during a write.
1984 */
1985 if (ee_len > EXT_MAX_LEN)
1986 goto out2;
1987 /* if found extent covers block, simply return it */
1988 if (iblock >= ee_block && iblock < ee_block + ee_len) {
1989 newblock = iblock - ee_block + ee_start;
1990 /* number of remaining blocks in the extent */
1991 allocated = ee_len - (iblock - ee_block);
1992 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
1993 ee_block, ee_len, newblock);
1994 ext4_ext_put_in_cache(inode, ee_block, ee_len,
1995 ee_start, EXT4_EXT_CACHE_EXTENT);
1996 goto out;
1997 }
1998 }
1999
2000 /*
2001 * requested block isn't allocated yet;
2002 * we must not try to create one if the create flag is zero
2003 */
2004 if (!create) {
2005 /* put just found gap into cache to speed up
2006 * subsequent requests */
2007 ext4_ext_put_gap_in_cache(inode, path, iblock);
2008 goto out2;
2009 }
2010 /*
2011 * Okay, we need to do block allocation. Lazily initialize the block
2012 * allocation info here if necessary.
2013 */
2014 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2015 ext4_init_block_alloc_info(inode);
2016
2017 /* allocate new block */
2018 goal = ext4_ext_find_goal(inode, path, iblock);
2019 allocated = max_blocks;
2020 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
2021 if (!newblock)
2022 goto out2;
2023 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2024 goal, newblock, allocated);
2025
2026 /* try to insert new extent into found leaf and return */
2027 newex.ee_block = cpu_to_le32(iblock);
2028 ext4_ext_store_pblock(&newex, newblock);
2029 newex.ee_len = cpu_to_le16(allocated);
2030 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2031 if (err)
2032 goto out2;
2033
2034 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2035 EXT4_I(inode)->i_disksize = inode->i_size;
2036
2037 /* previous routine could use block we allocated */
2038 newblock = ext_pblock(&newex);
2039 __set_bit(BH_New, &bh_result->b_state);
2040
2041 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2042 EXT4_EXT_CACHE_EXTENT);
2043out:
2044 if (allocated > max_blocks)
2045 allocated = max_blocks;
2046 ext4_ext_show_leaf(inode, path);
2047 __set_bit(BH_Mapped, &bh_result->b_state);
2048 bh_result->b_bdev = inode->i_sb->s_bdev;
2049 bh_result->b_blocknr = newblock;
2050out2:
2051 if (path) {
2052 ext4_ext_drop_refs(path);
2053 kfree(path);
2054 }
2055 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2056
2057 return err ? err : allocated;
2058}
2059
2060void ext4_ext_truncate(struct inode * inode, struct page *page)
2061{
2062 struct address_space *mapping = inode->i_mapping;
2063 struct super_block *sb = inode->i_sb;
2064 unsigned long last_block;
2065 handle_t *handle;
2066 int err = 0;
2067
2068 /*
2069 * probably first extent we're gonna free will be last in block
2070 */
2071 err = ext4_writepage_trans_blocks(inode) + 3;
2072 handle = ext4_journal_start(inode, err);
2073 if (IS_ERR(handle)) {
2074 if (page) {
2075 clear_highpage(page);
2076 flush_dcache_page(page);
2077 unlock_page(page);
2078 page_cache_release(page);
2079 }
2080 return;
2081 }
2082
2083 if (page)
2084 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2085
2086 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2087 ext4_ext_invalidate_cache(inode);
2088
2089 /*
2090 * TODO: optimization is possible here.
2091 * Probably we need not scan at all,
2092 * because page truncation is enough.
2093 */
2094 if (ext4_orphan_add(handle, inode))
2095 goto out_stop;
2096
2097 /* we have to know where to truncate from in crash case */
2098 EXT4_I(inode)->i_disksize = inode->i_size;
2099 ext4_mark_inode_dirty(handle, inode);
2100
2101 last_block = (inode->i_size + sb->s_blocksize - 1)
2102 >> EXT4_BLOCK_SIZE_BITS(sb);
2103 err = ext4_ext_remove_space(inode, last_block);
2104
2105 /* In a multi-transaction truncate, we only make the final
2106 * transaction synchronous. */
2107 if (IS_SYNC(inode))
2108 handle->h_sync = 1;
2109
2110out_stop:
2111 /*
2112 * If this was a simple ftruncate() and the file will remain alive,
2113 * then we need to clear up the orphan record which we created above.
2114 * However, if this was a real unlink then we were called by
2115 * ext4_delete_inode(), and we allow that function to clean up the
2116 * orphan info for us.
2117 */
2118 if (inode->i_nlink)
2119 ext4_orphan_del(handle, inode);
2120
2121 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2122 ext4_journal_stop(handle);
2123}
2124
2125/*
2126 * ext4_ext_writepage_trans_blocks:
2127 * calculate max number of blocks we could modify
2128 * in order to allocate new block for an inode
2129 */
2130int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2131{
2132 int needed;
2133
2134 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2135
2136 /* caller wants to allocate num blocks, but note it includes sb */
2137 needed = needed * num - (num - 1);
2138
2139#ifdef CONFIG_QUOTA
2140 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2141#endif
2142
2143 return needed;
2144}
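
/*
 * A short worked example of the formula above (hypothetical numbers):
 * if one allocation needs c credits including the superblock, then num
 * allocations share that single superblock credit, giving
 * c * num - (num - 1).  E.g. c = 11, num = 4: 11 * 4 - 3 = 41 instead
 * of the naive 44, because the superblock is only counted once.
 */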
2145
2146EXPORT_SYMBOL(ext4_mark_inode_dirty);
2147EXPORT_SYMBOL(ext4_ext_invalidate_cache);
2148EXPORT_SYMBOL(ext4_ext_insert_extent);
2149EXPORT_SYMBOL(ext4_ext_walk_space);
2150EXPORT_SYMBOL(ext4_ext_find_goal);
2151EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
2152
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
1/*
2 * linux/fs/ext4/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext4_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex);
41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
43 }
44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data);
46
47 return 0;
48}
49
50static ssize_t
51ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos)
53{
54 struct file *file = iocb->ki_filp;
55 struct inode *inode = file->f_dentry->d_inode;
56 ssize_t ret;
57 int err;
58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
60
61 /*
62 * Skip flushing if there was an error, or if nothing was written.
63 */
64 if (ret <= 0)
65 return ret;
66
67 /*
68 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
69 * journalling then we need to make sure that we force the transaction
70 * to disk to keep all metadata uptodate synchronously.
71 */
72 if (file->f_flags & O_SYNC) {
73 /*
74 * If we are non-data-journaled, then the dirty data has
75 * already been flushed to backing store by generic_osync_inode,
76 * and the inode has been flushed too if there have been any
77 * modifications other than mere timestamp updates.
78 *
79 * Open question --- do we care about flushing timestamps too
80 * if the inode is IS_SYNC?
81 */
82 if (!ext4_should_journal_data(inode))
83 return ret;
84
85 goto force_commit;
86 }
87
88 /*
89 * So we know that there has been no forced data flush. If the inode
90 * is marked IS_SYNC, we need to force one ourselves.
91 */
92 if (!IS_SYNC(inode))
93 return ret;
94
95 /*
96 * Open question #2 --- should we force data to disk here too? If we
97 * don't, the only impact is that data=writeback filesystems won't
98 * flush data to disk automatically on IS_SYNC, only metadata (but
99 * historically, that is what ext2 has done.)
100 */
101
102force_commit:
103 err = ext4_force_commit(inode->i_sb);
104 if (err)
105 return err;
106 return ret;
107}
108
109const struct file_operations ext4_file_operations = {
110 .llseek = generic_file_llseek,
111 .read = do_sync_read,
112 .write = do_sync_write,
113 .aio_read = generic_file_aio_read,
114 .aio_write = ext4_file_write,
115 .ioctl = ext4_ioctl,
116#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext4_compat_ioctl,
118#endif
119 .mmap = generic_file_mmap,
120 .open = generic_file_open,
121 .release = ext4_release_file,
122 .fsync = ext4_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write,
126};
127
128struct inode_operations ext4_file_inode_operations = {
129 .truncate = ext4_truncate,
130 .setattr = ext4_setattr,
131#ifdef CONFIG_EXT4DEV_FS_XATTR
132 .setxattr = generic_setxattr,
133 .getxattr = generic_getxattr,
134 .listxattr = ext4_listxattr,
135 .removexattr = generic_removexattr,
136#endif
137 .permission = ext4_permission,
138};
139
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext4/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext4fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/ext4_fs.h>
31#include <linux/ext4_jbd2.h>
32
33/*
34 * akpm: A new design for ext4_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext4_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext4_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext4_should_journal_data(inode)) {
71 ret = ext4_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext4/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/sched.h>
15#include <linux/ext4_fs.h>
16#include <linux/cryptohash.h>
17
18#define DELTA 0x9E3779B9
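/* 0x9E3779B9 is 2^32 divided by the golden ratio -- the standard TEA constant */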
19
20static void TEA_transform(__u32 buf[4], __u32 const in[])
21{
22 __u32 sum = 0;
23 __u32 b0 = buf[0], b1 = buf[1];
24 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
25 int n = 16;
26
27 do {
28 sum += DELTA;
29 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
30 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
31 } while(--n);
32
33 buf[0] += b0;
34 buf[1] += b1;
35}
36
37
38/* The old legacy hash */
39static __u32 dx_hack_hash (const char *name, int len)
40{
41 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
42 while (len--) {
43 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
44
45 if (hash & 0x80000000) hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return (hash0 << 1);
50}
51
52static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
53{
54 __u32 pad, val;
55 int i;
56
57 pad = (__u32)len | ((__u32)len << 8);
58 pad |= pad << 16;
59
60 val = pad;
61 if (len > num*4)
62 len = num * 4;
63 for (i=0; i < len; i++) {
64 if ((i % 4) == 0)
65 val = pad;
66 val = msg[i] + (val << 8);
67 if ((i % 4) == 3) {
68 *buf++ = val;
69 val = pad;
70 num--;
71 }
72 }
73 if (--num >= 0)
74 *buf++ = val;
75 while (--num >= 0)
76 *buf++ = pad;
77}
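
/*
 * A worked example of the padding above: for msg = "abc" with len = 3
 * and num = 4, pad becomes 0x03030303 (the length replicated into every
 * byte).  The loop folds 'a', 'b', 'c' into val without completing a
 * 4-byte word, so the trailing "if (--num >= 0)" store emits the
 * partial word 0x03616263, and the remaining three words of buf are
 * filled with the pad value 0x03030303.
 */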
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is a 4-longword (32-bit) "secret" which can be used to
85 * uniquify a hash. If the seed is all zeros, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106 /* Check to see if the seed is all zeros */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT4_HTREE_EOF << 1))
148 hash = (EXT4_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
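
/*
 * A minimal usage sketch (hedged: example_name_hash() is hypothetical,
 * not part of this patch): callers fill a dx_hash_info, pick a hash
 * version, and read the major/minor hash back out of the struct.
 */
static inline __u32 example_name_hash(const char *name, int len)
{
	struct dx_hash_info hinfo;

	hinfo.hash_version = DX_HASH_TEA;
	hinfo.seed = NULL;	/* all-zero/absent seed: use the default */
	if (ext4fs_dirhash(name, len, &hinfo) < 0)
		return 0;	/* unsupported hash version */
	return hinfo.hash;	/* minor hash is in hinfo.minor_hash */
}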
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
1/*
2 * linux/fs/ext4/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26#include <linux/blkdev.h>
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext4_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
64 if (!bh)
65 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %llu",
68 block_group, ext4_inode_bitmap(sb, desc));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext4_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext4_group_desc * gdp;
99 struct ext4_super_block * es;
100 struct ext4_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext4_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext4_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext4_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT4_SB(sb);
118
119 ino = inode->i_ino;
120 ext4_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext4_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT4_SB(sb)->s_es;
137 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext4_error (sb, "ext4_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext4_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext4_error (sb, "ext4_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext4_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext4_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
179 err = ext4_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
183 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext4_std_error(sb, fatal);
190}
191
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, the one with the fewest
197 * directories is chosen.
198 *
199 * For other inodes, search forward from the parent directory's block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT4_SB(sb)->s_groups_count;
205 unsigned int freei, avefreei;
206 struct ext4_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext4_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are blockgroups with both free inode and free block counts
235 * not worse than average, we return the one with the smallest directory
236 * count. Otherwise we simply return a random group.
237 *
238 * The remaining rules look like this:
239 *
240 * It's OK to put a directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already carrying too large a debt (max_debt).
245 * The parent's group is preferred; if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
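/*
 * A worked example of the max_debt clamp in find_group_orlov() below,
 * with hypothetical numbers: assume 4KB blocks, so
 * EXT4_BLOCKS_PER_GROUP(sb) == 32768 and inodes_per_group == 16384,
 * and an observed blocks_per_dir of about 100. Then:
 *
 * max_debt = 32768 / max(100, BLOCK_COST) = 32768 / 256 = 128
 * 128 * INODE_COST == 8192 <= 16384, so no clamp by inode cost,
 * and 128 is within 1..255, so max_debt ends up as 128.
 */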
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT4_I(parent)->i_block_group;
260 struct ext4_sb_info *sbi = EXT4_SB(sb);
261 struct ext4_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
264 unsigned int freei, avefreei;
265 ext4_fsblk_t freeb, avefreeb;
266 ext4_fsblk_t blocks_per_dir;
267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes;
269 ext4_grpblk_t min_blocks;
270 int group = -1, i;
271 struct ext4_group_desc *desc;
272 struct buffer_head *bh;
273
274 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
275 avefreei = freei / ngroups;
276 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
277 avefreeb = freeb;
278 do_div(avefreeb, ngroups);
279 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
280
281 if ((parent == sb->s_root->d_inode) ||
282 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
283 int best_ndir = inodes_per_group;
284 int best_group = -1;
285
286 get_random_bytes(&group, sizeof(group));
287 parent_group = (unsigned)group % ngroups;
288 for (i = 0; i < ngroups; i++) {
289 group = (parent_group + i) % ngroups;
290 desc = ext4_get_group_desc (sb, group, &bh);
291 if (!desc || !desc->bg_free_inodes_count)
292 continue;
293 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
294 continue;
295 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
296 continue;
297 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
298 continue;
299 best_group = group;
300 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
301 }
302 if (best_group >= 0)
303 return best_group;
304 goto fallback;
305 }
306
307 blocks_per_dir = ext4_blocks_count(es) - freeb;
308 do_div(blocks_per_dir, ndirs);
309
310 max_dirs = ndirs / ngroups + inodes_per_group / 16;
311 min_inodes = avefreei - inodes_per_group / 4;
312 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
313
314 max_debt = EXT4_BLOCKS_PER_GROUP(sb);
315 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
316 if (max_debt * INODE_COST > inodes_per_group)
317 max_debt = inodes_per_group / INODE_COST;
318 if (max_debt > 255)
319 max_debt = 255;
320 if (max_debt == 0)
321 max_debt = 1;
322
323 for (i = 0; i < ngroups; i++) {
324 group = (parent_group + i) % ngroups;
325 desc = ext4_get_group_desc (sb, group, &bh);
326 if (!desc || !desc->bg_free_inodes_count)
327 continue;
328 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
329 continue;
330 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
331 continue;
332 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
333 continue;
334 return group;
335 }
336
337fallback:
338 for (i = 0; i < ngroups; i++) {
339 group = (parent_group + i) % ngroups;
340 desc = ext4_get_group_desc (sb, group, &bh);
341 if (!desc || !desc->bg_free_inodes_count)
342 continue;
343 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
344 return group;
345 }
346
347 if (avefreei) {
348 /*
349 * The free-inodes counter is approximate, and for really small
350 * filesystems the above test can fail to find any blockgroups
351 */
352 avefreei = 0;
353 goto fallback;
354 }
355
356 return -1;
357}
358
359static int find_group_other(struct super_block *sb, struct inode *parent)
360{
361 int parent_group = EXT4_I(parent)->i_block_group;
362 int ngroups = EXT4_SB(sb)->s_groups_count;
363 struct ext4_group_desc *desc;
364 struct buffer_head *bh;
365 int group, i;
366
367 /*
368 * Try to place the inode in its parent directory
369 */
370 group = parent_group;
371 desc = ext4_get_group_desc (sb, group, &bh);
372 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
373 le16_to_cpu(desc->bg_free_blocks_count))
374 return group;
375
376 /*
377 * We're going to place this inode in a different blockgroup from its
378 * parent. We want to cause files in a common directory to all land in
379 * the same blockgroup. But we want files which are in a different
380 * directory which shares a blockgroup with our parent to land in a
381 * different blockgroup.
382 *
383 * So add our directory's i_ino into the starting point for the hash.
384 */
385 group = (group + parent->i_ino) % ngroups;
386
387 /*
388 * Use a quadratic hash to find a group with a free inode and some free
389 * blocks.
390 */
391 for (i = 1; i < ngroups; i <<= 1) {
392 group += i;
393 if (group >= ngroups)
394 group -= ngroups;
395 desc = ext4_get_group_desc (sb, group, &bh);
396 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
397 le16_to_cpu(desc->bg_free_blocks_count))
398 return group;
399 }
400
401 /*
402 * That failed: try linear search for a free inode, even if that group
403 * has no free blocks.
404 */
405 group = parent_group;
406 for (i = 0; i < ngroups; i++) {
407 if (++group >= ngroups)
408 group = 0;
409 desc = ext4_get_group_desc (sb, group, &bh);
410 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
411 return group;
412 }
413
414 return -1;
415}
416
417/*
418 * There are two policies for allocating an inode. If the new inode is
419 * a directory, then a forward search is made for a block group with both
420 * free space and a low directory-to-inode ratio; if that fails, then of
421 * the groups with above-average free space, the one with the fewest
422 * directories is chosen.
423 *
424 * For other inodes, search forward from the parent directory's block
425 * group to find a free inode.
426 */
427struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
428{
429 struct super_block *sb;
430 struct buffer_head *bitmap_bh = NULL;
431 struct buffer_head *bh2;
432 int group;
433 unsigned long ino = 0;
434 struct inode * inode;
435 struct ext4_group_desc * gdp = NULL;
436 struct ext4_super_block * es;
437 struct ext4_inode_info *ei;
438 struct ext4_sb_info *sbi;
439 int err = 0;
440 struct inode *ret;
441 int i;
442
443 /* Cannot create files in a deleted directory */
444 if (!dir || !dir->i_nlink)
445 return ERR_PTR(-EPERM);
446
447 sb = dir->i_sb;
448 inode = new_inode(sb);
449 if (!inode)
450 return ERR_PTR(-ENOMEM);
451 ei = EXT4_I(inode);
452
453 sbi = EXT4_SB(sb);
454 es = sbi->s_es;
455 if (S_ISDIR(mode)) {
456 if (test_opt (sb, OLDALLOC))
457 group = find_group_dir(sb, dir);
458 else
459 group = find_group_orlov(sb, dir);
460 } else
461 group = find_group_other(sb, dir);
462
463 err = -ENOSPC;
464 if (group == -1)
465 goto out;
466
467 for (i = 0; i < sbi->s_groups_count; i++) {
468 err = -EIO;
469
470 gdp = ext4_get_group_desc(sb, group, &bh2);
471 if (!gdp)
472 goto fail;
473
474 brelse(bitmap_bh);
475 bitmap_bh = read_inode_bitmap(sb, group);
476 if (!bitmap_bh)
477 goto fail;
478
479 ino = 0;
480
481repeat_in_this_group:
482 ino = ext4_find_next_zero_bit((unsigned long *)
483 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
484 if (ino < EXT4_INODES_PER_GROUP(sb)) {
485
486 BUFFER_TRACE(bitmap_bh, "get_write_access");
487 err = ext4_journal_get_write_access(handle, bitmap_bh);
488 if (err)
489 goto fail;
490
491 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
492 ino, bitmap_bh->b_data)) {
493 /* we won it */
494 BUFFER_TRACE(bitmap_bh,
495 "call ext4_journal_dirty_metadata");
496 err = ext4_journal_dirty_metadata(handle,
497 bitmap_bh);
498 if (err)
499 goto fail;
500 goto got;
501 }
502 /* we lost it */
503 jbd2_journal_release_buffer(handle, bitmap_bh);
504
505 if (++ino < EXT4_INODES_PER_GROUP(sb))
506 goto repeat_in_this_group;
507 }
508
509 /*
510 * This case is possible in a concurrent environment. It is very
511 * rare. We cannot repeat the find_group_xxx() call because
512 * that will simply return the same blockgroup, because the
513 * group descriptor metadata has not yet been updated.
514 * So we just go on to the next blockgroup.
515 */
516 if (++group == sbi->s_groups_count)
517 group = 0;
518 }
519 err = -ENOSPC;
520 goto out;
521
522got:
523 ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
524 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
525 ext4_error (sb, "ext4_new_inode",
526 "reserved inode or inode > inodes count - "
527 "block_group = %d, inode=%lu", group, ino);
528 err = -EIO;
529 goto fail;
530 }
531
532 BUFFER_TRACE(bh2, "get_write_access");
533 err = ext4_journal_get_write_access(handle, bh2);
534 if (err) goto fail;
535 spin_lock(sb_bgl_lock(sbi, group));
536 gdp->bg_free_inodes_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
538 if (S_ISDIR(mode)) {
539 gdp->bg_used_dirs_count =
540 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
541 }
542 spin_unlock(sb_bgl_lock(sbi, group));
543 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
544 err = ext4_journal_dirty_metadata(handle, bh2);
545 if (err) goto fail;
546
547 percpu_counter_dec(&sbi->s_freeinodes_counter);
548 if (S_ISDIR(mode))
549 percpu_counter_inc(&sbi->s_dirs_counter);
550 sb->s_dirt = 1;
551
552 inode->i_uid = current->fsuid;
553 if (test_opt (sb, GRPID))
554 inode->i_gid = dir->i_gid;
555 else if (dir->i_mode & S_ISGID) {
556 inode->i_gid = dir->i_gid;
557 if (S_ISDIR(mode))
558 mode |= S_ISGID;
559 } else
560 inode->i_gid = current->fsgid;
561 inode->i_mode = mode;
562
563 inode->i_ino = ino;
564 /* This is the optimal IO size (for stat), not the fs block size */
565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567
568 memset(ei->i_data, 0, sizeof(ei->i_data));
569 ei->i_dir_start_lookup = 0;
570 ei->i_disksize = 0;
571
572 ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
573 if (S_ISLNK(mode))
574 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
575 /* dirsync only applies to directories */
576 if (!S_ISDIR(mode))
577 ei->i_flags &= ~EXT4_DIRSYNC_FL;
578#ifdef EXT4_FRAGMENTS
579 ei->i_faddr = 0;
580 ei->i_frag_no = 0;
581 ei->i_frag_size = 0;
582#endif
583 ei->i_file_acl = 0;
584 ei->i_dir_acl = 0;
585 ei->i_dtime = 0;
586 ei->i_block_alloc_info = NULL;
587 ei->i_block_group = group;
588
589 ext4_set_inode_flags(inode);
590 if (IS_DIRSYNC(inode))
591 handle->h_sync = 1;
592 insert_inode_hash(inode);
593 spin_lock(&sbi->s_next_gen_lock);
594 inode->i_generation = sbi->s_next_generation++;
595 spin_unlock(&sbi->s_next_gen_lock);
596
597 ei->i_state = EXT4_STATE_NEW;
598 ei->i_extra_isize =
599 (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
600 sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
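/*
 * For example, on a filesystem built with 256-byte on-disk inodes this
 * leaves i_extra_isize == sizeof(struct ext4_inode) - 128: the space
 * beyond the classic 128-byte inode that newer fields can occupy.
 */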
601
602 ret = inode;
603 if (DQUOT_ALLOC_INODE(inode)) {
604 err = -EDQUOT;
605 goto fail_drop;
606 }
607
608 err = ext4_init_acl(handle, inode, dir);
609 if (err)
610 goto fail_free_drop;
611
612 err = ext4_init_security(handle, inode, dir);
613 if (err)
614 goto fail_free_drop;
615
616 err = ext4_mark_inode_dirty(handle, inode);
617 if (err) {
618 ext4_std_error(sb, err);
619 goto fail_free_drop;
620 }
621 if (test_opt(sb, EXTENTS)) {
622 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
623 ext4_ext_tree_init(handle, inode);
624 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
625 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
626 if (err) goto fail;
627 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
628 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
629 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
630 }
631 }
632
633 ext4_debug("allocating inode %lu\n", inode->i_ino);
634 goto really_out;
635fail:
636 ext4_std_error(sb, err);
637out:
638 iput(inode);
639 ret = ERR_PTR(err);
640really_out:
641 brelse(bitmap_bh);
642 return ret;
643
644fail_free_drop:
645 DQUOT_FREE_INODE(inode);
646
647fail_drop:
648 DQUOT_DROP(inode);
649 inode->i_flags |= S_NOQUOTA;
650 inode->i_nlink = 0;
651 iput(inode);
652 brelse(bitmap_bh);
653 return ERR_PTR(err);
654}
655
656/* Verify that we are loading a valid orphan from disk */
657struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
658{
659 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
660 unsigned long block_group;
661 int bit;
662 struct buffer_head *bitmap_bh = NULL;
663 struct inode *inode = NULL;
664
665 /* Error cases - e2fsck has already cleaned up for us */
666 if (ino > max_ino) {
667 ext4_warning(sb, __FUNCTION__,
668 "bad orphan ino %lu! e2fsck was run?", ino);
669 goto out;
670 }
671
672 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
673 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
674 bitmap_bh = read_inode_bitmap(sb, block_group);
675 if (!bitmap_bh) {
676 ext4_warning(sb, __FUNCTION__,
677 "inode bitmap error for orphan %lu", ino);
678 goto out;
679 }
680
681 /* Having the inode bit set should be a 100% indicator that this
682 * is a valid orphan (no e2fsck run on fs). Orphans also include
683 * inodes that were being truncated, so we can't check i_nlink==0.
684 */
685 if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
686 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
687 NEXT_ORPHAN(inode) > max_ino) {
688 ext4_warning(sb, __FUNCTION__,
689 "bad orphan inode %lu! e2fsck was run?", ino);
690 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
691 bit, (unsigned long long)bitmap_bh->b_blocknr,
692 ext4_test_bit(bit, bitmap_bh->b_data));
693 printk(KERN_NOTICE "inode=%p\n", inode);
694 if (inode) {
695 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
696 is_bad_inode(inode));
697 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
698 NEXT_ORPHAN(inode));
699 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
700 }
701 /* Avoid freeing blocks if we got a bad deleted inode */
702 if (inode && inode->i_nlink == 0)
703 inode->i_blocks = 0;
704 iput(inode);
705 inode = NULL;
706 }
707out:
708 brelse(bitmap_bh);
709 return inode;
710}
711
712unsigned long ext4_count_free_inodes (struct super_block * sb)
713{
714 unsigned long desc_count;
715 struct ext4_group_desc *gdp;
716 int i;
717#ifdef EXT4FS_DEBUG
718 struct ext4_super_block *es;
719 unsigned long bitmap_count, x;
720 struct buffer_head *bitmap_bh = NULL;
721
722 es = EXT4_SB(sb)->s_es;
723 desc_count = 0;
724 bitmap_count = 0;
725 gdp = NULL;
726 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
727 gdp = ext4_get_group_desc (sb, i, NULL);
728 if (!gdp)
729 continue;
730 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
731 brelse(bitmap_bh);
732 bitmap_bh = read_inode_bitmap(sb, i);
733 if (!bitmap_bh)
734 continue;
735
736 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
737 printk("group %d: stored = %d, counted = %lu\n",
738 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
739 bitmap_count += x;
740 }
741 brelse(bitmap_bh);
742 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
743 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
744 return desc_count;
745#else
746 desc_count = 0;
747 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
748 gdp = ext4_get_group_desc (sb, i, NULL);
749 if (!gdp)
750 continue;
751 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
752 cond_resched();
753 }
754 return desc_count;
755#endif
756}
757
758/* Called at mount-time, super-block is locked */
759unsigned long ext4_count_dirs (struct super_block * sb)
760{
761 unsigned long count = 0;
762 int i;
763
764 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
765 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
766 if (!gdp)
767 continue;
768 count += le16_to_cpu(gdp->bg_used_dirs_count);
769 }
770 return count;
771}
772
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
1/*
2 * linux/fs/ext4/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext4_jbd2.h>
29#include <linux/jbd2.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include <linux/bio.h>
40#include "xattr.h"
41#include "acl.h"
42
43/*
44 * Test whether an inode is a fast symlink.
45 */
46static int ext4_inode_is_fast_symlink(struct inode *inode)
47{
48 int ea_blocks = EXT4_I(inode)->i_file_acl ?
49 (inode->i_sb->s_blocksize >> 9) : 0;
50
51 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
52}
53
54/*
55 * The ext4 forget function must perform a revoke if we are freeing data
56 * which has been journaled. Metadata (e.g. indirect blocks) must be
57 * revoked in all cases.
58 *
59 * "bh" may be NULL: a metadata block may have been freed from memory
60 * but there may still be a record of it in the journal, and that record
61 * still needs to be revoked.
62 */
63int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
64 struct buffer_head *bh, ext4_fsblk_t blocknr)
65{
66 int err;
67
68 might_sleep();
69
70 BUFFER_TRACE(bh, "enter");
71
72 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
73 "data mode %lx\n",
74 bh, is_metadata, inode->i_mode,
75 test_opt(inode->i_sb, DATA_FLAGS));
76
77 /* Never use the revoke function if we are doing full data
78 * journaling: there is no need to, and a V1 superblock won't
79 * support it. Otherwise, only skip the revoke on un-journaled
80 * data blocks. */
81
82 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
83 (!is_metadata && !ext4_should_journal_data(inode))) {
84 if (bh) {
85 BUFFER_TRACE(bh, "call jbd2_journal_forget");
86 return ext4_journal_forget(handle, bh);
87 }
88 return 0;
89 }
90
91 /*
92 * data!=journal && (is_metadata || should_journal_data(inode))
93 */
94 BUFFER_TRACE(bh, "call ext4_journal_revoke");
95 err = ext4_journal_revoke(handle, blocknr, bh);
96 if (err)
97 ext4_abort(inode->i_sb, __FUNCTION__,
98 "error %d when attempting revoke", err);
99 BUFFER_TRACE(bh, "exit");
100 return err;
101}
102
103/*
104 * Work out how many blocks we need to proceed with the next chunk of a
105 * truncate transaction.
106 */
107static unsigned long blocks_for_truncate(struct inode *inode)
108{
109 unsigned long needed;
110
111 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
112
113 /* Give ourselves just enough room to cope with inodes in which
114 * i_blocks is corrupt: we've seen disk corruptions in the past
115 * which resulted in random data in an inode which looked enough
116 * like a regular file for ext4 to try to delete it. Things
117 * will go a bit crazy if that happens, but at least we should
118 * try not to panic the whole kernel. */
119 if (needed < 2)
120 needed = 2;
121
122 /* But we need to bound the transaction so we don't overflow the
123 * journal. */
124 if (needed > EXT4_MAX_TRANS_DATA)
125 needed = EXT4_MAX_TRANS_DATA;
126
127 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
128}
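/*
 * A quick sketch of the arithmetic above, with hypothetical numbers: on
 * a 4KB-block filesystem (s_blocksize_bits == 12) an inode with
 * i_blocks == 4096 (512-byte units, i.e. 2MB of data) gives
 *
 * needed = 4096 >> (12 - 9) = 512
 *
 * which the EXT4_MAX_TRANS_DATA test then bounds, so the handle is
 * sized at EXT4_DATA_TRANS_BLOCKS(inode->i_sb) plus that capped value.
 */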
129
130/*
131 * Truncate transactions can be complex and absolutely huge. So we need to
132 * be able to restart the transaction at a convenient checkpoint to make
133 * sure we don't overflow the journal.
134 *
135 * start_transaction gets us a new handle for a truncate transaction,
136 * and extend_transaction tries to extend the existing one a bit. If
137 * extend fails, we need to propagate the failure up and restart the
138 * transaction in the top-level truncate loop. --sct
139 */
140static handle_t *start_transaction(struct inode *inode)
141{
142 handle_t *result;
143
144 result = ext4_journal_start(inode, blocks_for_truncate(inode));
145 if (!IS_ERR(result))
146 return result;
147
148 ext4_std_error(inode->i_sb, PTR_ERR(result));
149 return result;
150}
151
152/*
153 * Try to extend this transaction for the purposes of truncation.
154 *
155 * Returns 0 if we managed to create more room. If we can't create more
156 * room and the transaction must be restarted, we return 1.
157 */
158static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
159{
160 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
161 return 0;
162 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
163 return 0;
164 return 1;
165}
166
167/*
168 * Restart the transaction associated with *handle. This does a commit,
169 * so before we call here everything must be consistently dirtied against
170 * this transaction.
171 */
172static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
173{
174 jbd_debug(2, "restarting handle %p\n", handle);
175 return ext4_journal_restart(handle, blocks_for_truncate(inode));
176}
177
178/*
179 * Called at the last iput() if i_nlink is zero.
180 */
181void ext4_delete_inode (struct inode * inode)
182{
183 handle_t *handle;
184
185 truncate_inode_pages(&inode->i_data, 0);
186
187 if (is_bad_inode(inode))
188 goto no_delete;
189
190 handle = start_transaction(inode);
191 if (IS_ERR(handle)) {
192 /*
193 * If we're going to skip the normal cleanup, we still need to
194 * make sure that the in-core orphan linked list is properly
195 * cleaned up.
196 */
197 ext4_orphan_del(NULL, inode);
198 goto no_delete;
199 }
200
201 if (IS_SYNC(inode))
202 handle->h_sync = 1;
203 inode->i_size = 0;
204 if (inode->i_blocks)
205 ext4_truncate(inode);
206 /*
207 * Kill off the orphan record which ext4_truncate created.
208 * AKPM: I think this can be inside the above `if'.
209 * Note that ext4_orphan_del() has to be able to cope with the
210 * deletion of a non-existent orphan - this is because we don't
211 * know if ext4_truncate() actually created an orphan record.
212 * (Well, we could do this if we need to, but heck - it works)
213 */
214 ext4_orphan_del(handle, inode);
215 EXT4_I(inode)->i_dtime = get_seconds();
216
217 /*
218 * One subtle ordering requirement: if anything has gone wrong
219 * (transaction abort, IO errors, whatever), then we can still
220 * do these next steps (the fs will already have been marked as
221 * having errors), but we can't free the inode if the mark_dirty
222 * fails.
223 */
224 if (ext4_mark_inode_dirty(handle, inode))
225 /* If that failed, just do the required in-core inode clear. */
226 clear_inode(inode);
227 else
228 ext4_free_inode(handle, inode);
229 ext4_journal_stop(handle);
230 return;
231no_delete:
232 clear_inode(inode); /* We must guarantee clearing of inode... */
233}
234
235typedef struct {
236 __le32 *p;
237 __le32 key;
238 struct buffer_head *bh;
239} Indirect;
240
241static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
242{
243 p->key = *(p->p = v);
244 p->bh = bh;
245}
246
247static int verify_chain(Indirect *from, Indirect *to)
248{
249 while (from <= to && from->key == *from->p)
250 from++;
251 return (from > to);
252}
253
254/**
255 * ext4_block_to_path - parse the block number into array of offsets
256 * @inode: inode in question (we are only interested in its superblock)
257 * @i_block: block number to be parsed
258 * @offsets: array to store the offsets in
259 * @boundary: set this non-zero if the referred-to block is likely to be
260 * followed (on disk) by an indirect block.
261 *
262 * To store the locations of a file's data, ext4 uses a data structure
263 * common to UNIX filesystems - a tree of pointers anchored in the inode,
264 * with data blocks at the leaves and indirect blocks in intermediate nodes.
265 * This function translates the block number into a path in that tree -
266 * the return value is the path length and @offsets[n] is the offset of the
267 * pointer to the (n+1)th node in the nth one. If @block is out of range
268 * (negative or too large), a warning is printed and zero is returned.
269 *
270 * Note: function doesn't find node addresses, so no IO is needed. All
271 * we need to know is the capacity of indirect blocks (taken from the
272 * inode->i_sb).
273 */
274
275/*
276 * Portability note: the last comparison (check that we fit into triple
277 * indirect block) is spelled differently, because otherwise on an
278 * architecture with 32-bit longs and 8Kb pages we might get into trouble
279 * if our filesystem had 8Kb blocks. We might use long long, but that would
280 * kill us on x86. Oh, well, at least the sign propagation does not matter -
281 * i_block would have to be negative in the very beginning, so we would not
282 * get there at all.
283 */
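/*
 * A worked example (hypothetical numbers): with 4KB blocks we have
 * ptrs == 1024 and ptrs_bits == 10, so for i_block == 5000:
 *
 * 5000 - 12 (direct) = 4988; 4988 - 1024 (indirect) = 3964;
 * 3964 < 1024 * 1024, so the block is doubly-indirect and
 * offsets[] = { EXT4_DIND_BLOCK, 3964 >> 10 == 3, 3964 & 1023 == 892 }
 *
 * giving n == 3 and *boundary == 1023 - 892 == 131 blocks left before
 * the next indirect-block boundary.
 */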
284
285static int ext4_block_to_path(struct inode *inode,
286 long i_block, int offsets[4], int *boundary)
287{
288 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
289 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
290 const long direct_blocks = EXT4_NDIR_BLOCKS,
291 indirect_blocks = ptrs,
292 double_blocks = (1 << (ptrs_bits * 2));
293 int n = 0;
294 int final = 0;
295
296 if (i_block < 0) {
297 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
298 } else if (i_block < direct_blocks) {
299 offsets[n++] = i_block;
300 final = direct_blocks;
301 } else if ((i_block -= direct_blocks) < indirect_blocks) {
302 offsets[n++] = EXT4_IND_BLOCK;
303 offsets[n++] = i_block;
304 final = ptrs;
305 } else if ((i_block -= indirect_blocks) < double_blocks) {
306 offsets[n++] = EXT4_DIND_BLOCK;
307 offsets[n++] = i_block >> ptrs_bits;
308 offsets[n++] = i_block & (ptrs - 1);
309 final = ptrs;
310 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
311 offsets[n++] = EXT4_TIND_BLOCK;
312 offsets[n++] = i_block >> (ptrs_bits * 2);
313 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
314 offsets[n++] = i_block & (ptrs - 1);
315 final = ptrs;
316 } else {
317 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
318 }
319 if (boundary)
320 *boundary = final - 1 - (i_block & (ptrs - 1));
321 return n;
322}
323
324/**
325 * ext4_get_branch - read the chain of indirect blocks leading to data
326 * @inode: inode in question
327 * @depth: depth of the chain (1 - direct pointer, etc.)
328 * @offsets: offsets of pointers in inode/indirect blocks
329 * @chain: place to store the result
330 * @err: here we store the error value
331 *
332 * Function fills the array of triples <key, p, bh> and returns %NULL
333 * if everything went OK or the pointer to the last filled triple
334 * (incomplete one) otherwise. Upon the return chain[i].key contains
335 * the number of (i+1)-th block in the chain (as it is stored in memory,
336 * i.e. little-endian 32-bit), chain[i].p contains the address of that
337 * number (it points into struct inode for i==0 and into the bh->b_data
338 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
339 * block for i>0 and NULL for i==0. In other words, it holds the block
340 * numbers of the chain, addresses they were taken from (and where we can
341 * verify that chain did not change) and buffer_heads hosting these
342 * numbers.
343 *
344 * Function stops when it stumbles upon zero pointer (absent block)
345 * (pointer to last triple returned, *@err == 0)
346 * or when it gets an IO error reading an indirect block
347 * (ditto, *@err == -EIO)
348 * or when it notices that chain had been changed while it was reading
349 * (ditto, *@err == -EAGAIN)
350 * or when it reads all @depth-1 indirect blocks successfully and finds
351 * the whole chain, all the way to the data (returns %NULL, *err == 0).
352 */
353static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
354 Indirect chain[4], int *err)
355{
356 struct super_block *sb = inode->i_sb;
357 Indirect *p = chain;
358 struct buffer_head *bh;
359
360 *err = 0;
361 /* i_data is not going away, no lock needed */
362 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
363 if (!p->key)
364 goto no_block;
365 while (--depth) {
366 bh = sb_bread(sb, le32_to_cpu(p->key));
367 if (!bh)
368 goto failure;
369 /* Reader: pointers */
370 if (!verify_chain(chain, p))
371 goto changed;
372 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
373 /* Reader: end */
374 if (!p->key)
375 goto no_block;
376 }
377 return NULL;
378
379changed:
380 brelse(bh);
381 *err = -EAGAIN;
382 goto no_block;
383failure:
384 *err = -EIO;
385no_block:
386 return p;
387}
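/*
 * To illustrate the chain layout (a sketch, reusing the hypothetical
 * doubly-indirect example with offsets == { EXT4_DIND_BLOCK, 3, 892 }):
 *
 * chain[0].p -> &EXT4_I(inode)->i_data[EXT4_DIND_BLOCK], bh == NULL
 * chain[1].p -> b_data[3] of the double-indirect block's bh
 * chain[2].p -> b_data[892] of the indirect block's bh
 *
 * and chain[2].key is then the little-endian number of the data block
 * itself, or zero if the file has a hole there.
 */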
388
389/**
390 * ext4_find_near - find a place for allocation with sufficient locality
391 * @inode: owner
392 * @ind: descriptor of indirect block.
393 *
394 * This function returns the preferred place for block allocation.
395 * It is used when heuristic for sequential allocation fails.
396 * Rules are:
397 * + if there is a block to the left of our position - allocate near it.
398 * + if pointer will live in indirect block - allocate near that block.
399 * + if pointer will live in inode - allocate in the same
400 * cylinder group.
401 *
402 * In the latter case we colour the starting block by the caller's PID to
403 * prevent it from clashing with concurrent allocations for a different inode
404 * in the same block group. The PID is used here so that functionally related
405 * files will be close-by on-disk.
406 *
407 * Caller must make sure that @ind is valid and will stay that way.
408 */
409static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
410{
411 struct ext4_inode_info *ei = EXT4_I(inode);
412 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
413 __le32 *p;
414 ext4_fsblk_t bg_start;
415 ext4_grpblk_t colour;
416
417 /* Try to find previous block */
418 for (p = ind->p - 1; p >= start; p--) {
419 if (*p)
420 return le32_to_cpu(*p);
421 }
422
423 /* No such thing, so let's try location of indirect block */
424 if (ind->bh)
425 return ind->bh->b_blocknr;
426
427 /*
428 * Is it going to be referred to from the inode itself? OK, just put it
429 * into the same cylinder group then.
430 */
431 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
432 colour = (current->pid % 16) *
433 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
434 return bg_start + colour;
435}
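/*
 * A sketch of the colouring above, with hypothetical numbers: with
 * 32768 blocks per group, a caller with pid 4242 gets
 *
 * colour = (4242 % 16) * (32768 / 16) = 2 * 2048 = 4096
 *
 * so its allocations start 4096 blocks into the group, one of 16 evenly
 * spaced lanes that keep concurrent allocators out of each other's way.
 */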
436
437/**
438 * ext4_find_goal - find a preferred place for allocation.
439 * @inode: owner
440 * @block: block we want
441 * @chain: chain of indirect blocks
442 * @partial: pointer to the last triple within a chain
443 * @goal: place to store the result.
444 *
445 * Normally this function finds the preferred place for block allocation
446 * and returns it.
447 */
448
449static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
450 Indirect chain[4], Indirect *partial)
451{
452 struct ext4_block_alloc_info *block_i;
453
454 block_i = EXT4_I(inode)->i_block_alloc_info;
455
456 /*
457 * try the heuristic for sequential allocation,
458 * failing that at least try to get decent locality.
459 */
460 if (block_i && (block == block_i->last_alloc_logical_block + 1)
461 && (block_i->last_alloc_physical_block != 0)) {
462 return block_i->last_alloc_physical_block + 1;
463 }
464
465 return ext4_find_near(inode, partial);
466}
467
468/**
469 * ext4_blks_to_allocate: Look up the block map and count the number
470 * of direct blocks that need to be allocated for the given branch.
471 *
472 * @branch: chain of indirect blocks
473 * @k: number of blocks needed for indirect blocks
474 * @blks: number of data blocks to be mapped.
475 * @blocks_to_boundary: the offset in the indirect block
476 *
477 * Returns the number of direct blocks to allocate; the @k indirect
478 * blocks are accounted for separately by the caller.
479 */
480static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
481 int blocks_to_boundary)
482{
483 unsigned long count = 0;
484
485 /*
486 * Simple case: the [t,d]indirect block(s) have not been allocated
487 * yet, so clearly the blocks on that path have not been allocated either
488 */
489 if (k > 0) {
490 /* right now we don't handle cross boundary allocation */
491 if (blks < blocks_to_boundary + 1)
492 count += blks;
493 else
494 count += blocks_to_boundary + 1;
495 return count;
496 }
497
498 count++;
499 while (count < blks && count <= blocks_to_boundary &&
500 le32_to_cpu(*(branch[0].p + count)) == 0) {
501 count++;
502 }
503 return count;
504}
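/*
 * A sketch with hypothetical numbers: with k == 2 indirect blocks still
 * missing, blks == 8 data blocks wanted and blocks_to_boundary == 5,
 * the k > 0 branch above returns min(8, 5 + 1) == 6 direct blocks; the
 * two indirect blocks are accounted for separately by the caller.
 */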
505
506/**
507 * ext4_alloc_blocks: allocate multiple blocks needed for a branch
508 * @indirect_blks: the number of blocks needed for the indirect
509 * blocks
510 *
511 * @new_blocks: on return it will store the new block numbers for
512 * the indirect blocks(if needed) and the first direct block,
513 * @blks: on return it will store the total number of allocated
514 * direct blocks
515 */
516static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
517 ext4_fsblk_t goal, int indirect_blks, int blks,
518 ext4_fsblk_t new_blocks[4], int *err)
519{
520 int target, i;
521 unsigned long count = 0;
522 int index = 0;
523 ext4_fsblk_t current_block = 0;
524 int ret = 0;
525
526 /*
527 * Here we try to allocate the requested multiple blocks at once,
528 * on a best-effort basis.
529 * To build a branch, we should allocate blocks for
530 * the indirect blocks (if not allocated yet), and at least
531 * the first direct block of this branch. That's the
532 * minimum number of blocks we need to allocate (required).
533 */
534 target = blks + indirect_blks;
535
536 while (1) {
537 count = target;
538 /* allocating blocks for indirect blocks and direct blocks */
539 current_block = ext4_new_blocks(handle, inode, goal, &count, err);
540 if (*err)
541 goto failed_out;
542
543 target -= count;
544 /* allocate blocks for indirect blocks */
545 while (index < indirect_blks && count) {
546 new_blocks[index++] = current_block++;
547 count--;
548 }
549
550 if (count > 0)
551 break;
552 }
553
554 /* save the new block number for the first direct block */
555 new_blocks[index] = current_block;
556
557 /* total number of blocks allocated for direct blocks */
558 ret = count;
559 *err = 0;
560 return ret;
561failed_out:
562 for (i = 0; i < index; i++)
563 ext4_free_blocks(handle, inode, new_blocks[i], 1);
564 return ret;
565}
566
567/**
568 * ext4_alloc_branch - allocate and set up a chain of blocks.
569 * @inode: owner
570 * @indirect_blks: number of allocated indirect blocks
571 * @blks: number of allocated direct blocks
572 * @offsets: offsets (in the blocks) to store the pointers to next.
573 * @branch: place to store the chain in.
574 *
575 * This function allocates blocks, zeroes out all but the last one,
576 * links them into chain and (if we are synchronous) writes them to disk.
577 * In other words, it prepares a branch that can be spliced onto the
578 * inode. It stores the information about that chain in the branch[], in
579 * the same format as ext4_get_branch() would do. We are calling it after
580 * we had read the existing part of chain and partial points to the last
581 * triple of that (one with zero ->key). Upon the exit we have the same
582 * picture as after the successful ext4_get_block(), except that in one
583 * place chain is disconnected - *branch->p is still zero (we did not
584 * set the last link), but branch->key contains the number that should
585 * be placed into *branch->p to fill that gap.
586 *
587 * If allocation fails we free all blocks we've allocated (and forget
588 * their buffer_heads) and return the error value the from failed
589 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
590 * as described above and return 0.
591 */
592static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
593 int indirect_blks, int *blks, ext4_fsblk_t goal,
594 int *offsets, Indirect *branch)
595{
596 int blocksize = inode->i_sb->s_blocksize;
597 int i, n = 0;
598 int err = 0;
599 struct buffer_head *bh;
600 int num;
601 ext4_fsblk_t new_blocks[4];
602 ext4_fsblk_t current_block;
603
604 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
605 *blks, new_blocks, &err);
606 if (err)
607 return err;
608
609 branch[0].key = cpu_to_le32(new_blocks[0]);
610 /*
611 * metadata blocks and data blocks are allocated.
612 */
613 for (n = 1; n <= indirect_blks; n++) {
614 /*
615 * Get buffer_head for parent block, zero it out
616 * and set the pointer to new one, then send
617 * parent to disk.
618 */
619 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
620 branch[n].bh = bh;
621 lock_buffer(bh);
622 BUFFER_TRACE(bh, "call get_create_access");
623 err = ext4_journal_get_create_access(handle, bh);
624 if (err) {
625 unlock_buffer(bh);
626 brelse(bh);
627 goto failed;
628 }
629
630 memset(bh->b_data, 0, blocksize);
631 branch[n].p = (__le32 *) bh->b_data + offsets[n];
632 branch[n].key = cpu_to_le32(new_blocks[n]);
633 *branch[n].p = branch[n].key;
634 if (n == indirect_blks) {
635 current_block = new_blocks[n];
636 /*
637 * End of chain: update the last new metablock of
638 * the chain to point to the newly allocated
639 * data block numbers
640 */
641 for (i = 1; i < num; i++)
642 *(branch[n].p + i) = cpu_to_le32(++current_block);
643 }
644 BUFFER_TRACE(bh, "marking uptodate");
645 set_buffer_uptodate(bh);
646 unlock_buffer(bh);
647
648 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
649 err = ext4_journal_dirty_metadata(handle, bh);
650 if (err)
651 goto failed;
652 }
653 *blks = num;
654 return err;
655failed:
656 /* Allocation failed, free what we already allocated */
657 for (i = 1; i <= n ; i++) {
658 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
659 ext4_journal_forget(handle, branch[i].bh);
660 }
661 for (i = 0; i < indirect_blks; i++)
662 ext4_free_blocks(handle, inode, new_blocks[i], 1);
663
664 ext4_free_blocks(handle, inode, new_blocks[i], num);
665
666 return err;
667}
668
669/**
670 * ext4_splice_branch - splice the allocated branch onto inode.
671 * @inode: owner
672 * @block: (logical) number of block we are adding
673 * @chain: chain of indirect blocks (with a missing link - see
674 * ext4_alloc_branch)
675 * @where: location of missing link
676 * @num: number of indirect blocks we are adding
677 * @blks: number of direct blocks we are adding
678 *
679 * This function fills the missing link and does all housekeeping needed in
680 * inode (->i_blocks, etc.). In case of success we end up with the full
681 * chain to new block and return 0.
682 */
683static int ext4_splice_branch(handle_t *handle, struct inode *inode,
684 long block, Indirect *where, int num, int blks)
685{
686 int i;
687 int err = 0;
688 struct ext4_block_alloc_info *block_i;
689 ext4_fsblk_t current_block;
690
691 block_i = EXT4_I(inode)->i_block_alloc_info;
692 /*
693 * If we're splicing into a [td]indirect block (as opposed to the
694 * inode) then we need to get write access to the [td]indirect block
695 * before the splice.
696 */
697 if (where->bh) {
698 BUFFER_TRACE(where->bh, "get_write_access");
699 err = ext4_journal_get_write_access(handle, where->bh);
700 if (err)
701 goto err_out;
702 }
703 /* That's it */
704
705 *where->p = where->key;
706
707 /*
708 * Update the host buffer_head or inode to point to the just-allocated
709 * direct blocks
710 */
711 if (num == 0 && blks > 1) {
712 current_block = le32_to_cpu(where->key) + 1;
713 for (i = 1; i < blks; i++)
714 *(where->p + i) = cpu_to_le32(current_block++);
715 }
716
717 /*
718 * update the most recently allocated logical & physical block
719 * in i_block_alloc_info, to assist in finding the proper goal block for the next
720 * allocation
721 */
722 if (block_i) {
723 block_i->last_alloc_logical_block = block + blks - 1;
724 block_i->last_alloc_physical_block =
725 le32_to_cpu(where[num].key) + blks - 1;
726 }
727
728 /* We are done with atomic stuff, now do the rest of housekeeping */
729
730 inode->i_ctime = CURRENT_TIME_SEC;
731 ext4_mark_inode_dirty(handle, inode);
732
733 /* had we spliced it onto indirect block? */
734 if (where->bh) {
735 /*
736 * If we spliced it onto an indirect block, we haven't
737 * altered the inode. Note however that if it is being spliced
738 * onto an indirect block at the very end of the file (the
739 * file is growing) then we *will* alter the inode to reflect
740 * the new i_size. But that is not done here - it is done in
741 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
742 */
743 jbd_debug(5, "splicing indirect only\n");
744 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
745 err = ext4_journal_dirty_metadata(handle, where->bh);
746 if (err)
747 goto err_out;
748 } else {
749 /*
750 * OK, we spliced it into the inode itself on a direct block.
751 * Inode was dirtied above.
752 */
753 jbd_debug(5, "splicing direct\n");
754 }
755 return err;
756
757err_out:
758 for (i = 1; i <= num; i++) {
759 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
760 ext4_journal_forget(handle, where[i].bh);
761 ext4_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
762 }
763 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
764
765 return err;
766}
767
768/*
769 * Allocation strategy is simple: if we have to allocate something, we will
770 * have to go the whole way to the leaf. So let's do it before attaching anything
771 * to tree, set linkage between the newborn blocks, write them if sync is
772 * required, recheck the path, free and repeat if check fails, otherwise
773 * set the last missing link (that will protect us from any truncate-generated
774 * removals - all blocks on the path are immune now) and possibly force the
775 * write on the parent block.
776 * That has a nice additional property: no special recovery from the failed
777 * allocations is needed - we simply release blocks and do not touch anything
778 * reachable from inode.
779 *
780 * `handle' can be NULL if create == 0.
781 *
782 * The BKL may not be held on entry here. Be sure to take it early.
783 * return > 0, # of blocks mapped or allocated.
784 * return = 0, if plain lookup failed.
785 * return < 0, error case.
786 */
787int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
788 sector_t iblock, unsigned long maxblocks,
789 struct buffer_head *bh_result,
790 int create, int extend_disksize)
791{
792 int err = -EIO;
793 int offsets[4];
794 Indirect chain[4];
795 Indirect *partial;
796 ext4_fsblk_t goal;
797 int indirect_blks;
798 int blocks_to_boundary = 0;
799 int depth;
800 struct ext4_inode_info *ei = EXT4_I(inode);
801 int count = 0;
802 ext4_fsblk_t first_block = 0;
803
804
805 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
806 J_ASSERT(handle != NULL || create == 0);
807 depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
808
809 if (depth == 0)
810 goto out;
811
812 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
813
814 /* Simplest case - block found, no allocation needed */
815 if (!partial) {
816 first_block = le32_to_cpu(chain[depth - 1].key);
817 clear_buffer_new(bh_result);
818 count++;
819 /* map more blocks */
820 while (count < maxblocks && count <= blocks_to_boundary) {
821 ext4_fsblk_t blk;
822
823 if (!verify_chain(chain, partial)) {
824 /*
825 * Indirect block might be removed by
826 * truncate while we were reading it.
827 * Handling of that case: forget what we've
828 * got now. Flag the err as EAGAIN, so it
829 * will reread.
830 */
831 err = -EAGAIN;
832 count = 0;
833 break;
834 }
835 blk = le32_to_cpu(*(chain[depth-1].p + count));
836
837 if (blk == first_block + count)
838 count++;
839 else
840 break;
841 }
842 if (err != -EAGAIN)
843 goto got_it;
844 }
845
846 /* Next simple case - plain lookup or failed read of indirect block */
847 if (!create || err == -EIO)
848 goto cleanup;
849
850 mutex_lock(&ei->truncate_mutex);
851
852 /*
853 * If the indirect block is missing while we are reading
854 * the chain (ext4_get_branch() returns the -EAGAIN err), or
855 * if the chain has been changed after we grab the semaphore,
856 * (either because another process truncated this branch, or
857 * another get_block allocated this branch) re-grab the chain to see if
858 * the requested block has been allocated or not.
859 *
860 * Since we already block the truncate/other get_block
861 * at this point, we will have the current copy of the chain when we
862 * splice the branch into the tree.
863 */
864 if (err == -EAGAIN || !verify_chain(chain, partial)) {
865 while (partial > chain) {
866 brelse(partial->bh);
867 partial--;
868 }
869 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
870 if (!partial) {
871 count++;
872 mutex_unlock(&ei->truncate_mutex);
873 if (err)
874 goto cleanup;
875 clear_buffer_new(bh_result);
876 goto got_it;
877 }
878 }
879
880 /*
881 * Okay, we need to do block allocation. Lazily initialize the block
882 * allocation info here if necessary
883 */
884 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
885 ext4_init_block_alloc_info(inode);
886
887 goal = ext4_find_goal(inode, iblock, chain, partial);
888
889 /* the number of blocks need to allocate for [d,t]indirect blocks */
890 indirect_blks = (chain + depth) - partial - 1;
891
892 /*
893 * Next look up the indirect map to count the total number of
894 * direct blocks to allocate for this branch.
895 */
896 count = ext4_blks_to_allocate(partial, indirect_blks,
897 maxblocks, blocks_to_boundary);
898 /*
899 * Block out ext4_truncate while we alter the tree
900 */
901 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
902 offsets + (partial - chain), partial);
903
904 /*
905 * The ext4_splice_branch call will free and forget any buffers
906 * on the new chain if there is a failure, but that risks using
907 * up transaction credits, especially for bitmaps where the
908 * credits cannot be returned. Can we handle this somehow? We
909 * may need to return -EAGAIN upwards in the worst case. --sct
910 */
911 if (!err)
912 err = ext4_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext4_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex);
922 if (err)
923 goto cleanup;
924
925 set_buffer_new(bh_result);
926got_it:
927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
928 if (count > blocks_to_boundary)
929 set_buffer_boundary(bh_result);
930 err = count;
931 /* Clean up and exit */
932 partial = chain + depth - 1; /* the whole chain */
933cleanup:
934 while (partial > chain) {
935 BUFFER_TRACE(partial->bh, "call brelse");
936 brelse(partial->bh);
937 partial--;
938 }
939 BUFFER_TRACE(bh_result, "returned");
940out:
941 return err;
942}
943
944#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
945
946static int ext4_get_block(struct inode *inode, sector_t iblock,
947 struct buffer_head *bh_result, int create)
948{
949 handle_t *handle = journal_current_handle();
950 int ret = 0;
951 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
952
953 if (!create)
954 goto get_block; /* A read */
955
956 if (max_blocks == 1)
957 goto get_block; /* A single block get */
958
959 if (handle->h_transaction->t_state == T_LOCKED) {
960 /*
961 * Huge direct-io writes can hold off commits for long
962 * periods of time. Let this commit run.
963 */
964 ext4_journal_stop(handle);
965 handle = ext4_journal_start(inode, DIO_CREDITS);
966 if (IS_ERR(handle))
967 ret = PTR_ERR(handle);
968 goto get_block;
969 }
970
971 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
972 /*
973 * Getting low on buffer credits...
974 */
975 ret = ext4_journal_extend(handle, DIO_CREDITS);
976 if (ret > 0) {
977 /*
978 * Couldn't extend the transaction. Start a new one.
979 */
980 ret = ext4_journal_restart(handle, DIO_CREDITS);
981 }
982 }
983
984get_block:
985 if (ret == 0) {
986 ret = ext4_get_blocks_wrap(handle, inode, iblock,
987 max_blocks, bh_result, create, 0);
988 if (ret > 0) {
989 bh_result->b_size = (ret << inode->i_blkbits);
990 ret = 0;
991 }
992 }
993 return ret;
994}
995
996/*
997 * `handle' can be NULL if create is zero
998 */
999struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1000 long block, int create, int *errp)
1001{
1002 struct buffer_head dummy;
1003 int fatal = 0, err;
1004
1005 J_ASSERT(handle != NULL || create == 0);
1006
1007 dummy.b_state = 0;
1008 dummy.b_blocknr = -1000;
1009 buffer_trace_init(&dummy.b_history);
1010 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1011 &dummy, create, 1);
1012 /*
1013 * ext4_get_blocks_handle() returns the number of blocks
1014 * mapped - 0 in the case of a hole.
1015 */
1016 if (err > 0) {
1017 if (err > 1)
1018 WARN_ON(1);
1019 err = 0;
1020 }
1021 *errp = err;
1022 if (!err && buffer_mapped(&dummy)) {
1023 struct buffer_head *bh;
1024 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1025 if (!bh) {
1026 *errp = -EIO;
1027 goto err;
1028 }
1029 if (buffer_new(&dummy)) {
1030 J_ASSERT(create != 0);
1031 J_ASSERT(handle != 0);
1032
1033 /*
1034 * Now that we do not always journal data, we should
1035 * keep in mind whether this should always journal the
1036 * new buffer as metadata. For now, regular file
1037 * writes use ext4_get_block instead, so it's not a
1038 * problem.
1039 */
1040 lock_buffer(bh);
1041 BUFFER_TRACE(bh, "call get_create_access");
1042 fatal = ext4_journal_get_create_access(handle, bh);
1043 if (!fatal && !buffer_uptodate(bh)) {
1044 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1045 set_buffer_uptodate(bh);
1046 }
1047 unlock_buffer(bh);
1048 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1049 err = ext4_journal_dirty_metadata(handle, bh);
1050 if (!fatal)
1051 fatal = err;
1052 } else {
1053 BUFFER_TRACE(bh, "not a new buffer");
1054 }
1055 if (fatal) {
1056 *errp = fatal;
1057 brelse(bh);
1058 bh = NULL;
1059 }
1060 return bh;
1061 }
1062err:
1063 return NULL;
1064}
1065
1066struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1067 int block, int create, int *err)
1068{
1069 struct buffer_head * bh;
1070
1071 bh = ext4_getblk(handle, inode, block, create, err);
1072 if (!bh)
1073 return bh;
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ_META, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 put_bh(bh);
1081 *err = -EIO;
1082 return NULL;
1083}
1084
1085static int walk_page_buffers( handle_t *handle,
1086 struct buffer_head *head,
1087 unsigned from,
1088 unsigned to,
1089 int *partial,
1090 int (*fn)( handle_t *handle,
1091 struct buffer_head *bh))
1092{
1093 struct buffer_head *bh;
1094 unsigned block_start, block_end;
1095 unsigned blocksize = head->b_size;
1096 int err, ret = 0;
1097 struct buffer_head *next;
1098
1099 for ( bh = head, block_start = 0;
1100 ret == 0 && (bh != head || !block_start);
1101 block_start = block_end, bh = next)
1102 {
1103 next = bh->b_this_page;
1104 block_end = block_start + blocksize;
1105 if (block_end <= from || block_start >= to) {
1106 if (partial && !buffer_uptodate(bh))
1107 *partial = 1;
1108 continue;
1109 }
1110 err = (*fn)(handle, bh);
1111 if (!ret)
1112 ret = err;
1113 }
1114 return ret;
1115}
1116
1117/*
1118 * To preserve ordering, it is essential that the hole instantiation and
1119 * the data write be encapsulated in a single transaction. We cannot
1120 * close off a transaction and start a new one between the ext4_get_block()
1121 * and the commit_write(). So doing the jbd2_journal_start at the start of
1122 * prepare_write() is the right place.
1123 *
1124 * Also, this function can nest inside ext4_writepage() ->
1125 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1126 * has generated enough buffer credits to do the whole page. So we won't
1127 * block on the journal in that case, which is good, because the caller may
1128 * be PF_MEMALLOC.
1129 *
1130 * By accident, ext4 can be reentered when a transaction is open via
1131 * quota file writes. If we were to commit the transaction while thus
1132 * reentered, there can be a deadlock - we would be holding a quota
1133 * lock, and the commit would never complete if another thread had a
1134 * transaction open and was blocking on the quota lock - a ranking
1135 * violation.
1136 *
1137 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1138 * will _not_ run commit under these circumstances because handle->h_ref
1139 * is elevated. We'll still have enough credits for the tiny quotafile
1140 * write.
1141 */
1142static int do_journal_get_write_access(handle_t *handle,
1143 struct buffer_head *bh)
1144{
1145 if (!buffer_mapped(bh) || buffer_freed(bh))
1146 return 0;
1147 return ext4_journal_get_write_access(handle, bh);
1148}
1149
1150static int ext4_prepare_write(struct file *file, struct page *page,
1151 unsigned from, unsigned to)
1152{
1153 struct inode *inode = page->mapping->host;
1154 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1155 handle_t *handle;
1156 int retries = 0;
1157
1158retry:
1159 handle = ext4_journal_start(inode, needed_blocks);
1160 if (IS_ERR(handle)) {
1161 ret = PTR_ERR(handle);
1162 goto out;
1163 }
1164 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1165 ret = nobh_prepare_write(page, from, to, ext4_get_block);
1166 else
1167 ret = block_prepare_write(page, from, to, ext4_get_block);
1168 if (ret)
1169 goto prepare_write_failed;
1170
1171 if (ext4_should_journal_data(inode)) {
1172 ret = walk_page_buffers(handle, page_buffers(page),
1173 from, to, NULL, do_journal_get_write_access);
1174 }
1175prepare_write_failed:
1176 if (ret)
1177 ext4_journal_stop(handle);
1178 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1179 goto retry;
1180out:
1181 return ret;
1182}
1183
1184int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1185{
1186 int err = jbd2_journal_dirty_data(handle, bh);
1187 if (err)
1188 ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1189 bh, handle, err);
1190 return err;
1191}
1192
1193/* For commit_write() in data=journal mode */
1194static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1195{
1196 if (!buffer_mapped(bh) || buffer_freed(bh))
1197 return 0;
1198 set_buffer_uptodate(bh);
1199 return ext4_journal_dirty_metadata(handle, bh);
1200}
1201
1202/*
1203 * We need to pick up the new inode size which generic_commit_write gave us.
1204 * `file' can be NULL - eg, when called from page_symlink().
1205 *
1206 * ext4 never places buffers on inode->i_mapping->private_list. Metadata
1207 * buffers are managed internally.
1208 */
1209static int ext4_ordered_commit_write(struct file *file, struct page *page,
1210 unsigned from, unsigned to)
1211{
1212 handle_t *handle = ext4_journal_current_handle();
1213 struct inode *inode = page->mapping->host;
1214 int ret = 0, ret2;
1215
1216 ret = walk_page_buffers(handle, page_buffers(page),
1217 from, to, NULL, ext4_journal_dirty_data);
1218
1219 if (ret == 0) {
1220 /*
1221 * generic_commit_write() will run mark_inode_dirty() if i_size
1222 * changes. So let's piggyback the i_disksize mark_inode_dirty
1223 * into that.
1224 */
1225 loff_t new_i_size;
1226
1227 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1228 if (new_i_size > EXT4_I(inode)->i_disksize)
1229 EXT4_I(inode)->i_disksize = new_i_size;
1230 ret = generic_commit_write(file, page, from, to);
1231 }
1232 ret2 = ext4_journal_stop(handle);
1233 if (!ret)
1234 ret = ret2;
1235 return ret;
1236}
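/*
 * Worked example (illustrative numbers): for page->index == 3 with 4k
 * pages (PAGE_CACHE_SHIFT == 12) and to == 100, new_i_size above is
 * (3 << 12) + 100 == 12388, i.e. the write ends 100 bytes into the
 * file's fourth page.
 */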
1237
1238static int ext4_writeback_commit_write(struct file *file, struct page *page,
1239 unsigned from, unsigned to)
1240{
1241 handle_t *handle = ext4_journal_current_handle();
1242 struct inode *inode = page->mapping->host;
1243 int ret = 0, ret2;
1244 loff_t new_i_size;
1245
1246 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1247 if (new_i_size > EXT4_I(inode)->i_disksize)
1248 EXT4_I(inode)->i_disksize = new_i_size;
1249
1250 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1251 ret = nobh_commit_write(file, page, from, to);
1252 else
1253 ret = generic_commit_write(file, page, from, to);
1254
1255 ret2 = ext4_journal_stop(handle);
1256 if (!ret)
1257 ret = ret2;
1258 return ret;
1259}
1260
1261static int ext4_journalled_commit_write(struct file *file,
1262 struct page *page, unsigned from, unsigned to)
1263{
1264 handle_t *handle = ext4_journal_current_handle();
1265 struct inode *inode = page->mapping->host;
1266 int ret = 0, ret2;
1267 int partial = 0;
1268 loff_t pos;
1269
1270 /*
1271 * Here we duplicate the generic_commit_write() functionality
1272 */
1273 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1274
1275 ret = walk_page_buffers(handle, page_buffers(page), from,
1276 to, &partial, commit_write_fn);
1277 if (!partial)
1278 SetPageUptodate(page);
1279 if (pos > inode->i_size)
1280 i_size_write(inode, pos);
1281 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1282 if (inode->i_size > EXT4_I(inode)->i_disksize) {
1283 EXT4_I(inode)->i_disksize = inode->i_size;
1284 ret2 = ext4_mark_inode_dirty(handle, inode);
1285 if (!ret)
1286 ret = ret2;
1287 }
1288 ret2 = ext4_journal_stop(handle);
1289 if (!ret)
1290 ret = ret2;
1291 return ret;
1292}
1293
1294/*
1295 * bmap() is special. It gets used by applications such as lilo and by
1296 * the swapper to find the on-disk block of a specific piece of data.
1297 *
1298 * Naturally, this is dangerous if the block concerned is still in the
1299 * journal. If somebody makes a swapfile on an ext4 data-journaling
1300 * filesystem and enables swap, then they may get a nasty shock when the
1301 * data getting swapped to that swapfile suddenly gets overwritten by
1302 * the original zeros written out previously to the journal and
1303 * awaiting writeback in the kernel's buffer cache.
1304 *
1305 * So, if we see any bmap calls here on a modified, data-journaled file,
1306 * take extra steps to flush any blocks which might be in the cache.
1307 */
1308static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1309{
1310 struct inode *inode = mapping->host;
1311 journal_t *journal;
1312 int err;
1313
1314 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1315 /*
1316 * This is a REALLY heavyweight approach, but the use of
1317 * bmap on dirty files is expected to be extremely rare:
1318 * only if we run lilo or swapon on a freshly made file
1319 * do we expect this to happen.
1320 *
1321 * (bmap requires CAP_SYS_RAWIO so this does not
1322 * represent an unprivileged user DOS attack --- we'd be
1323 * in trouble if mortal users could trigger this path at
1324 * will.)
1325 *
1326 * NB. EXT4_STATE_JDATA is not set on files other than
1327 * regular files. If somebody wants to bmap a directory
1328 * or symlink and gets confused because the buffer
1329 * hasn't yet been flushed to disk, they deserve
1330 * everything they get.
1331 */
1332
1333 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
1334 journal = EXT4_JOURNAL(inode);
1335 jbd2_journal_lock_updates(journal);
1336 err = jbd2_journal_flush(journal);
1337 jbd2_journal_unlock_updates(journal);
1338
1339 if (err)
1340 return 0;
1341 }
1342
1343	return generic_block_bmap(mapping, block, ext4_get_block);
1344}
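/*
 * Illustrative userspace sketch (assumption: a tool built against
 * <linux/fs.h>): this is how lilo-style programs end up in ext4_bmap(),
 * via the FIBMAP ioctl, which needs CAP_SYS_RAWIO as noted above.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/fs.h>

static long fibmap_block(int fd, int logical_block)
{
	int blk = logical_block;	/* in: logical block, out: physical */

	if (ioctl(fd, FIBMAP, &blk) < 0)
		return -1;
	return blk;
}
#endif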
1345
1346static int bget_one(handle_t *handle, struct buffer_head *bh)
1347{
1348 get_bh(bh);
1349 return 0;
1350}
1351
1352static int bput_one(handle_t *handle, struct buffer_head *bh)
1353{
1354 put_bh(bh);
1355 return 0;
1356}
1357
1358static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1359{
1360 if (buffer_mapped(bh))
1361 return ext4_journal_dirty_data(handle, bh);
1362 return 0;
1363}
1364
1365/*
1366 * Note that we always start a transaction even if we're not journalling
1367 * data. This is to preserve ordering: any hole instantiation within
1368 * __block_write_full_page -> ext4_get_block() should be journalled
1369 * along with the data so we don't crash and then get metadata which
1370 * refers to old data.
1371 *
1372 * In all journalling modes block_write_full_page() will start the I/O.
1373 *
1374 * Problem:
1375 *
1376 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1377 * ext4_writepage()
1378 *
1379 * Similar for:
1380 *
1381 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1382 *
1383 * Same applies to ext4_get_block(). We will deadlock on various things like
1384 * lock_journal and i_truncate_mutex.
1385 *
1386 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1387 * allocations fail.
1388 *
1389 * 16May01: If we're reentered then journal_current_handle() will be
1390 * non-zero. We simply *return*.
1391 *
1392 * 1 July 2001: @@@ FIXME:
1393 * In journalled data mode, a data buffer may be metadata against the
1394 * current transaction. But the same file is part of a shared mapping
1395 * and someone does a writepage() on it.
1396 *
1397 * We will move the buffer onto the async_data list, but *after* it has
1398 * been dirtied. So there's a small window where we have dirty data on
1399 * BJ_Metadata.
1400 *
1401 * Note that this only applies to the last partial page in the file: the
1402 * bit which block_write_full_page() uses prepare/commit for.  (That's
1403 * broken code anyway: it's wrong for msync().)
1404 *
1405 * It's a rare case: it affects the final partial page, for journalled data
1406 * where the file is subject to both write() and writepage() in the same
1407 * transaction.  To fix it we'll need a custom block_write_full_page().
1408 * We'll probably need that anyway for journalling writepage() output.
1409 *
1410 * We don't honour synchronous mounts for writepage(). That would be
1411 * disastrous. Any write() or metadata operation will sync the fs for
1412 * us.
1413 *
1414 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1415 * we don't need to open a transaction here.
1416 */
1417static int ext4_ordered_writepage(struct page *page,
1418 struct writeback_control *wbc)
1419{
1420 struct inode *inode = page->mapping->host;
1421 struct buffer_head *page_bufs;
1422 handle_t *handle = NULL;
1423 int ret = 0;
1424 int err;
1425
1426 J_ASSERT(PageLocked(page));
1427
1428 /*
1429 * We give up here if we're reentered, because it might be for a
1430 * different filesystem.
1431 */
1432 if (ext4_journal_current_handle())
1433 goto out_fail;
1434
1435 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1436
1437 if (IS_ERR(handle)) {
1438 ret = PTR_ERR(handle);
1439 goto out_fail;
1440 }
1441
1442 if (!page_has_buffers(page)) {
1443 create_empty_buffers(page, inode->i_sb->s_blocksize,
1444 (1 << BH_Dirty)|(1 << BH_Uptodate));
1445 }
1446 page_bufs = page_buffers(page);
1447 walk_page_buffers(handle, page_bufs, 0,
1448 PAGE_CACHE_SIZE, NULL, bget_one);
1449
1450 ret = block_write_full_page(page, ext4_get_block, wbc);
1451
1452 /*
1453 * The page can become unlocked at any point now, and
1454 * truncate can then come in and change things. So we
1455 * can't touch *page from now on. But *page_bufs is
1456 * safe due to elevated refcount.
1457 */
1458
1459 /*
1460 * And attach them to the current transaction. But only if
1461 * block_write_full_page() succeeded. Otherwise they are unmapped,
1462 * and generally junk.
1463 */
1464 if (ret == 0) {
1465 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1466 NULL, jbd2_journal_dirty_data_fn);
1467 if (!ret)
1468 ret = err;
1469 }
1470 walk_page_buffers(handle, page_bufs, 0,
1471 PAGE_CACHE_SIZE, NULL, bput_one);
1472 err = ext4_journal_stop(handle);
1473 if (!ret)
1474 ret = err;
1475 return ret;
1476
1477out_fail:
1478 redirty_page_for_writepage(wbc, page);
1479 unlock_page(page);
1480 return ret;
1481}
1482
1483static int ext4_writeback_writepage(struct page *page,
1484 struct writeback_control *wbc)
1485{
1486 struct inode *inode = page->mapping->host;
1487 handle_t *handle = NULL;
1488 int ret = 0;
1489 int err;
1490
1491 if (ext4_journal_current_handle())
1492 goto out_fail;
1493
1494 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1495 if (IS_ERR(handle)) {
1496 ret = PTR_ERR(handle);
1497 goto out_fail;
1498 }
1499
1500 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1501 ret = nobh_writepage(page, ext4_get_block, wbc);
1502 else
1503 ret = block_write_full_page(page, ext4_get_block, wbc);
1504
1505 err = ext4_journal_stop(handle);
1506 if (!ret)
1507 ret = err;
1508 return ret;
1509
1510out_fail:
1511 redirty_page_for_writepage(wbc, page);
1512 unlock_page(page);
1513 return ret;
1514}
1515
1516static int ext4_journalled_writepage(struct page *page,
1517 struct writeback_control *wbc)
1518{
1519 struct inode *inode = page->mapping->host;
1520 handle_t *handle = NULL;
1521 int ret = 0;
1522 int err;
1523
1524 if (ext4_journal_current_handle())
1525 goto no_write;
1526
1527 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1528 if (IS_ERR(handle)) {
1529 ret = PTR_ERR(handle);
1530 goto no_write;
1531 }
1532
1533 if (!page_has_buffers(page) || PageChecked(page)) {
1534 /*
1535 * It's mmapped pagecache. Add buffers and journal it. There
1536 * doesn't seem much point in redirtying the page here.
1537 */
1538 ClearPageChecked(page);
1539 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1540 ext4_get_block);
1541 if (ret != 0) {
1542 ext4_journal_stop(handle);
1543 goto out_unlock;
1544 }
1545 ret = walk_page_buffers(handle, page_buffers(page), 0,
1546 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1547
1548 err = walk_page_buffers(handle, page_buffers(page), 0,
1549 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1550 if (ret == 0)
1551 ret = err;
1552 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1553 unlock_page(page);
1554 } else {
1555 /*
1556 * It may be a page full of checkpoint-mode buffers. We don't
1557 * really know unless we go poke around in the buffer_heads.
1558 * But block_write_full_page will do the right thing.
1559 */
1560 ret = block_write_full_page(page, ext4_get_block, wbc);
1561 }
1562 err = ext4_journal_stop(handle);
1563 if (!ret)
1564 ret = err;
1565out:
1566 return ret;
1567
1568no_write:
1569 redirty_page_for_writepage(wbc, page);
1570out_unlock:
1571 unlock_page(page);
1572 goto out;
1573}
1574
1575static int ext4_readpage(struct file *file, struct page *page)
1576{
1577 return mpage_readpage(page, ext4_get_block);
1578}
1579
1580static int
1581ext4_readpages(struct file *file, struct address_space *mapping,
1582 struct list_head *pages, unsigned nr_pages)
1583{
1584 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
1585}
1586
1587static void ext4_invalidatepage(struct page *page, unsigned long offset)
1588{
1589 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1590
1591 /*
1592 * If it's a full truncate we just forget about the pending dirtying
1593 */
1594 if (offset == 0)
1595 ClearPageChecked(page);
1596
1597 jbd2_journal_invalidatepage(journal, page, offset);
1598}
1599
1600static int ext4_releasepage(struct page *page, gfp_t wait)
1601{
1602 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1603
1604 WARN_ON(PageChecked(page));
1605 if (!page_has_buffers(page))
1606 return 0;
1607 return jbd2_journal_try_to_free_buffers(journal, page, wait);
1608}
1609
1610/*
1611 * If the O_DIRECT write will extend the file then add this inode to the
1612 * orphan list. So recovery will truncate it back to the original size
1613 * if the machine crashes during the write.
1614 *
1615 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1616 * crashes then stale disk data _may_ be exposed inside the file.
1617 */
1618static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1619 const struct iovec *iov, loff_t offset,
1620 unsigned long nr_segs)
1621{
1622 struct file *file = iocb->ki_filp;
1623 struct inode *inode = file->f_mapping->host;
1624 struct ext4_inode_info *ei = EXT4_I(inode);
1625 handle_t *handle = NULL;
1626 ssize_t ret;
1627 int orphan = 0;
1628 size_t count = iov_length(iov, nr_segs);
1629
1630 if (rw == WRITE) {
1631 loff_t final_size = offset + count;
1632
1633 handle = ext4_journal_start(inode, DIO_CREDITS);
1634 if (IS_ERR(handle)) {
1635 ret = PTR_ERR(handle);
1636 goto out;
1637 }
1638 if (final_size > inode->i_size) {
1639 ret = ext4_orphan_add(handle, inode);
1640 if (ret)
1641 goto out_stop;
1642 orphan = 1;
1643 ei->i_disksize = inode->i_size;
1644 }
1645 }
1646
1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1648 offset, nr_segs,
1649 ext4_get_block, NULL);
1650
1651 /*
1652 * Reacquire the handle: ext4_get_block() can restart the transaction
1653 */
1654 handle = journal_current_handle();
1655
1656out_stop:
1657 if (handle) {
1658 int err;
1659
1660 if (orphan && inode->i_nlink)
1661 ext4_orphan_del(handle, inode);
1662 if (orphan && ret > 0) {
1663 loff_t end = offset + ret;
1664 if (end > inode->i_size) {
1665 ei->i_disksize = end;
1666 i_size_write(inode, end);
1667 /*
1668 * We're going to return a positive `ret'
1669 * here due to non-zero-length I/O, so there's
1670 * no way of reporting error returns from
1671 * ext4_mark_inode_dirty() to userspace. So
1672 * ignore it.
1673 */
1674 ext4_mark_inode_dirty(handle, inode);
1675 }
1676 }
1677 err = ext4_journal_stop(handle);
1678 if (ret == 0)
1679 ret = err;
1680 }
1681out:
1682 return ret;
1683}
1684
1685/*
1686 * Pages can be marked dirty completely asynchronously from ext4's journalling
1687 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1688 * much here because ->set_page_dirty is called under VFS locks. The page is
1689 * not necessarily locked.
1690 *
1691 * We cannot just dirty the page and leave attached buffers clean, because the
1692 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1693 * or jbddirty because all the journalling code will explode.
1694 *
1695 * So what we do is to mark the page "pending dirty" and next time writepage
1696 * is called, propagate that into the buffers appropriately.
1697 */
1698static int ext4_journalled_set_page_dirty(struct page *page)
1699{
1700 SetPageChecked(page);
1701 return __set_page_dirty_nobuffers(page);
1702}
1703
1704static const struct address_space_operations ext4_ordered_aops = {
1705 .readpage = ext4_readpage,
1706 .readpages = ext4_readpages,
1707 .writepage = ext4_ordered_writepage,
1708 .sync_page = block_sync_page,
1709 .prepare_write = ext4_prepare_write,
1710 .commit_write = ext4_ordered_commit_write,
1711 .bmap = ext4_bmap,
1712 .invalidatepage = ext4_invalidatepage,
1713 .releasepage = ext4_releasepage,
1714 .direct_IO = ext4_direct_IO,
1715 .migratepage = buffer_migrate_page,
1716};
1717
1718static const struct address_space_operations ext4_writeback_aops = {
1719 .readpage = ext4_readpage,
1720 .readpages = ext4_readpages,
1721 .writepage = ext4_writeback_writepage,
1722 .sync_page = block_sync_page,
1723 .prepare_write = ext4_prepare_write,
1724 .commit_write = ext4_writeback_commit_write,
1725 .bmap = ext4_bmap,
1726 .invalidatepage = ext4_invalidatepage,
1727 .releasepage = ext4_releasepage,
1728 .direct_IO = ext4_direct_IO,
1729 .migratepage = buffer_migrate_page,
1730};
1731
1732static const struct address_space_operations ext4_journalled_aops = {
1733 .readpage = ext4_readpage,
1734 .readpages = ext4_readpages,
1735 .writepage = ext4_journalled_writepage,
1736 .sync_page = block_sync_page,
1737 .prepare_write = ext4_prepare_write,
1738 .commit_write = ext4_journalled_commit_write,
1739 .set_page_dirty = ext4_journalled_set_page_dirty,
1740 .bmap = ext4_bmap,
1741 .invalidatepage = ext4_invalidatepage,
1742 .releasepage = ext4_releasepage,
1743};
1744
1745void ext4_set_aops(struct inode *inode)
1746{
1747 if (ext4_should_order_data(inode))
1748 inode->i_mapping->a_ops = &ext4_ordered_aops;
1749 else if (ext4_should_writeback_data(inode))
1750 inode->i_mapping->a_ops = &ext4_writeback_aops;
1751 else
1752 inode->i_mapping->a_ops = &ext4_journalled_aops;
1753}
1754
1755/*
1756 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
1757 * up to the end of the block which corresponds to `from'.
1758 * This is required during truncate.  We need to physically zero the tail end
1759 * of that block so it doesn't yield old data if the file is later grown.
1760 */
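/*
 * Worked example (illustrative numbers): with a 4096-byte page and
 * 1024-byte blocks, from = 6500 gives offset = 6500 & 4095 = 2404, so the
 * buffer search below lands on the third block in the page (pos steps
 * through 1024, 2048, 3072) and length = 1024 - (2404 & 1023) = 668 bytes
 * of tail get zeroed.
 */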
1761int ext4_block_truncate_page(handle_t *handle, struct page *page,
1762 struct address_space *mapping, loff_t from)
1763{
1764 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1765 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1766 unsigned blocksize, iblock, length, pos;
1767 struct inode *inode = mapping->host;
1768 struct buffer_head *bh;
1769 int err = 0;
1770 void *kaddr;
1771
1772 blocksize = inode->i_sb->s_blocksize;
1773 length = blocksize - (offset & (blocksize - 1));
1774 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1775
1776 /*
1777 * For "nobh" option, we can only work if we don't need to
1778 * read-in the page - otherwise we create buffers to do the IO.
1779 */
1780 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1781 ext4_should_writeback_data(inode) && PageUptodate(page)) {
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 memset(kaddr + offset, 0, length);
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 set_page_dirty(page);
1787 goto unlock;
1788 }
1789
1790 if (!page_has_buffers(page))
1791 create_empty_buffers(page, blocksize, 0);
1792
1793 /* Find the buffer that contains "offset" */
1794 bh = page_buffers(page);
1795 pos = blocksize;
1796 while (offset >= pos) {
1797 bh = bh->b_this_page;
1798 iblock++;
1799 pos += blocksize;
1800 }
1801
1802 err = 0;
1803 if (buffer_freed(bh)) {
1804 BUFFER_TRACE(bh, "freed: skip");
1805 goto unlock;
1806 }
1807
1808 if (!buffer_mapped(bh)) {
1809 BUFFER_TRACE(bh, "unmapped");
1810 ext4_get_block(inode, iblock, bh, 0);
1811 /* unmapped? It's a hole - nothing to do */
1812 if (!buffer_mapped(bh)) {
1813 BUFFER_TRACE(bh, "still unmapped");
1814 goto unlock;
1815 }
1816 }
1817
1818 /* Ok, it's mapped. Make sure it's up-to-date */
1819 if (PageUptodate(page))
1820 set_buffer_uptodate(bh);
1821
1822 if (!buffer_uptodate(bh)) {
1823 err = -EIO;
1824 ll_rw_block(READ, 1, &bh);
1825 wait_on_buffer(bh);
1826 /* Uhhuh. Read error. Complain and punt. */
1827 if (!buffer_uptodate(bh))
1828 goto unlock;
1829 }
1830
1831 if (ext4_should_journal_data(inode)) {
1832 BUFFER_TRACE(bh, "get write access");
1833 err = ext4_journal_get_write_access(handle, bh);
1834 if (err)
1835 goto unlock;
1836 }
1837
1838 kaddr = kmap_atomic(page, KM_USER0);
1839 memset(kaddr + offset, 0, length);
1840 flush_dcache_page(page);
1841 kunmap_atomic(kaddr, KM_USER0);
1842
1843 BUFFER_TRACE(bh, "zeroed end of block");
1844
1845 err = 0;
1846 if (ext4_should_journal_data(inode)) {
1847 err = ext4_journal_dirty_metadata(handle, bh);
1848 } else {
1849 if (ext4_should_order_data(inode))
1850 err = ext4_journal_dirty_data(handle, bh);
1851 mark_buffer_dirty(bh);
1852 }
1853
1854unlock:
1855 unlock_page(page);
1856 page_cache_release(page);
1857 return err;
1858}
1859
1860/*
1861 * Probably it should be a library function... search for first non-zero word
1862 * or memcmp with zero_page, whatever is better for particular architecture.
1863 * Linus?
1864 */
1865static inline int all_zeroes(__le32 *p, __le32 *q)
1866{
1867 while (p < q)
1868 if (*p++)
1869 return 0;
1870 return 1;
1871}
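/*
 * A memcmp-based variant, as the comment above muses (sketch only; assumes
 * the span fits in one page and that the architecture exports
 * empty_zero_page):
 */
#if 0
static inline int all_zeroes_memcmp(__le32 *p, __le32 *q)
{
	return !memcmp(p, empty_zero_page, (char *)q - (char *)p);
}
#endif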
1872
1873/**
1874 * ext4_find_shared - find the indirect blocks for partial truncation.
1875 * @inode: inode in question
1876 * @depth: depth of the affected branch
1877 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
1878 * @chain: place to store the pointers to partial indirect blocks
1879 * @top: place to the (detached) top of branch
1880 *
1881 * This is a helper function used by ext4_truncate().
1882 *
1883 * When we do truncate() we may have to clean the ends of several
1884 * indirect blocks but leave the blocks themselves alive. A block is
1885 * partially truncated if some data below the new i_size is referenced
1886 * from it (and it is on the path to the first completely truncated
1887 * data block, indeed). We have to free the top of that path along
1888 * with everything to the right of the path. Since no allocation
1889 * past the truncation point is possible until ext4_truncate()
1890 * finishes, we may safely do the latter, but top of branch may
1891 * require special attention - pageout below the truncation point
1892 * might try to populate it.
1893 *
1894 * We atomically detach the top of branch from the tree, store the
1895 * block number of its root in *@top, pointers to buffer_heads of
1896 * partially truncated blocks - in @chain[].bh and pointers to
1897 * their last elements that should not be removed - in
1898 * @chain[].p. Return value is the pointer to last filled element
1899 * of @chain.
1900 *
1901 * The work left to the caller is the actual freeing of subtrees:
1902 * a) free the subtree starting from *@top
1903 * b) free the subtrees whose roots are stored in
1904 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1905 * c) free the subtrees growing from the inode past the @chain[0].
1906 * (no partially truncated stuff there). */
1907
1908static Indirect *ext4_find_shared(struct inode *inode, int depth,
1909 int offsets[4], Indirect chain[4], __le32 *top)
1910{
1911 Indirect *partial, *p;
1912 int k, err;
1913
1914 *top = 0;
1915	/* Make k index the deepest non-null offset + 1 */
1916 for (k = depth; k > 1 && !offsets[k-1]; k--)
1917 ;
1918 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1919 /* Writer: pointers */
1920 if (!partial)
1921 partial = chain + k-1;
1922 /*
1923	 * If the branch acquired a continuation since we last looked at it,
1924	 * fine: it should all survive and the (new) top doesn't belong to us.
1925 */
1926 if (!partial->key && *partial->p)
1927 /* Writer: end */
1928 goto no_top;
1929 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1930 ;
1931 /*
1932 * OK, we've found the last block that must survive. The rest of our
1933 * branch should be detached before unlocking. However, if that rest
1934 * of branch is all ours and does not grow immediately from the inode
1935 * it's easier to cheat and just decrement partial->p.
1936 */
1937 if (p == chain + k - 1 && p > chain) {
1938 p->p--;
1939 } else {
1940 *top = *p->p;
1941 /* Nope, don't do this in ext4. Must leave the tree intact */
1942#if 0
1943 *p->p = 0;
1944#endif
1945 }
1946 /* Writer: end */
1947
1948	while (partial > p) {
1949 brelse(partial->bh);
1950 partial--;
1951 }
1952no_top:
1953 return partial;
1954}
1955
1956/*
1957 * Zero a number of block pointers in either an inode or an indirect block.
1958 * If we restart the transaction we must again get write access to the
1959 * indirect block for further modification.
1960 *
1961 * We release `count' blocks on disk, but (last - first) may be greater
1962 * than `count' because there can be holes in there.
1963 */
1964static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
1965 struct buffer_head *bh, ext4_fsblk_t block_to_free,
1966 unsigned long count, __le32 *first, __le32 *last)
1967{
1968 __le32 *p;
1969 if (try_to_extend_transaction(handle, inode)) {
1970 if (bh) {
1971 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1972 ext4_journal_dirty_metadata(handle, bh);
1973 }
1974 ext4_mark_inode_dirty(handle, inode);
1975 ext4_journal_test_restart(handle, inode);
1976 if (bh) {
1977 BUFFER_TRACE(bh, "retaking write access");
1978 ext4_journal_get_write_access(handle, bh);
1979 }
1980 }
1981
1982 /*
1983 * Any buffers which are on the journal will be in memory. We find
1984 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
1985 * on them. We've already detached each block from the file, so
1986 * bforget() in jbd2_journal_forget() should be safe.
1987 *
1988 * AKPM: turn on bforget in jbd2_journal_forget()!!!
1989 */
1990 for (p = first; p < last; p++) {
1991 u32 nr = le32_to_cpu(*p);
1992 if (nr) {
1993 struct buffer_head *bh;
1994
1995 *p = 0;
1996 bh = sb_find_get_block(inode->i_sb, nr);
1997 ext4_forget(handle, 0, inode, bh, nr);
1998 }
1999 }
2000
2001 ext4_free_blocks(handle, inode, block_to_free, count);
2002}
2003
2004/**
2005 * ext4_free_data - free a list of data blocks
2006 * @handle: handle for this transaction
2007 * @inode: inode we are dealing with
2008 * @this_bh: indirect buffer_head which contains *@first and *@last
2009 * @first: array of block numbers
2010 * @last: points immediately past the end of array
2011 *
2012 * We are freeing all blocks referenced from that array (numbers are stored as
2013 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2014 *
2015 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2016 * blocks are contiguous then releasing them at one time will only affect one
2017 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2018 * actually use a lot of journal space.
2019 *
2020 * @this_bh will be %NULL if @first and @last point into the inode's direct
2021 * block pointers.
2022 */
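/*
 * Example (illustrative): for pointer contents { 100, 101, 102, 0, 250 }
 * the loop below makes two ext4_clear_blocks() calls - one for the run
 * 100..102 (count == 3) and one for the lone block 250.  Zero entries
 * (holes) are skipped and do not by themselves end a run.
 */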
2023static void ext4_free_data(handle_t *handle, struct inode *inode,
2024 struct buffer_head *this_bh,
2025 __le32 *first, __le32 *last)
2026{
2027 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
2028 unsigned long count = 0; /* Number of blocks in the run */
2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2030 corresponding to
2031 block_to_free */
2032 ext4_fsblk_t nr; /* Current block # */
2033 __le32 *p; /* Pointer into inode/ind
2034 for current block */
2035 int err;
2036
2037 if (this_bh) { /* For indirect block */
2038 BUFFER_TRACE(this_bh, "get_write_access");
2039 err = ext4_journal_get_write_access(handle, this_bh);
2040 /* Important: if we can't update the indirect pointers
2041 * to the blocks, we can't free them. */
2042 if (err)
2043 return;
2044 }
2045
2046 for (p = first; p < last; p++) {
2047 nr = le32_to_cpu(*p);
2048 if (nr) {
2049 /* accumulate blocks to free if they're contiguous */
2050 if (count == 0) {
2051 block_to_free = nr;
2052 block_to_free_p = p;
2053 count = 1;
2054 } else if (nr == block_to_free + count) {
2055 count++;
2056 } else {
2057 ext4_clear_blocks(handle, inode, this_bh,
2058 block_to_free,
2059 count, block_to_free_p, p);
2060 block_to_free = nr;
2061 block_to_free_p = p;
2062 count = 1;
2063 }
2064 }
2065 }
2066
2067 if (count > 0)
2068 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
2069 count, block_to_free_p, p);
2070
2071 if (this_bh) {
2072 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2073 ext4_journal_dirty_metadata(handle, this_bh);
2074 }
2075}
2076
2077/**
2078 * ext4_free_branches - free an array of branches
2079 * @handle: JBD handle for this transaction
2080 * @inode: inode we are dealing with
2081 * @parent_bh: the buffer_head which contains *@first and *@last
2082 * @first: array of block numbers
2083 * @last: pointer immediately past the end of array
2084 * @depth: depth of the branches to free
2085 *
2086 * We are freeing all blocks referenced from these branches (numbers are
2087 * stored as little-endian 32-bit) and updating @inode->i_blocks
2088 * appropriately.
2089 */
2090static void ext4_free_branches(handle_t *handle, struct inode *inode,
2091 struct buffer_head *parent_bh,
2092 __le32 *first, __le32 *last, int depth)
2093{
2094 ext4_fsblk_t nr;
2095 __le32 *p;
2096
2097 if (is_handle_aborted(handle))
2098 return;
2099
2100 if (depth--) {
2101 struct buffer_head *bh;
2102 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2103 p = last;
2104 while (--p >= first) {
2105 nr = le32_to_cpu(*p);
2106 if (!nr)
2107 continue; /* A hole */
2108
2109 /* Go read the buffer for the next level down */
2110 bh = sb_bread(inode->i_sb, nr);
2111
2112 /*
2113 * A read failure? Report error and clear slot
2114 * (should be rare).
2115 */
2116 if (!bh) {
2117 ext4_error(inode->i_sb, "ext4_free_branches",
2118 "Read failure, inode=%lu, block=%llu",
2119 inode->i_ino, nr);
2120 continue;
2121 }
2122
2123 /* This zaps the entire block. Bottom up. */
2124 BUFFER_TRACE(bh, "free child branches");
2125 ext4_free_branches(handle, inode, bh,
2126 (__le32*)bh->b_data,
2127 (__le32*)bh->b_data + addr_per_block,
2128 depth);
2129
2130 /*
2131 * We've probably journalled the indirect block several
2132 * times during the truncate. But it's no longer
2133 * needed and we now drop it from the transaction via
2134 * jbd2_journal_revoke().
2135 *
2136 * That's easy if it's exclusively part of this
2137 * transaction. But if it's part of the committing
2138 * transaction then jbd2_journal_forget() will simply
2139 * brelse() it. That means that if the underlying
2140 * block is reallocated in ext4_get_block(),
2141 * unmap_underlying_metadata() will find this block
2142 * and will try to get rid of it. damn, damn.
2143 *
2144 * If this block has already been committed to the
2145 * journal, a revoke record will be written. And
2146 * revoke records must be emitted *before* clearing
2147 * this block's bit in the bitmaps.
2148 */
2149 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
2150
2151 /*
2152			 * Everything below this pointer has been
2153 * released. Now let this top-of-subtree go.
2154 *
2155 * We want the freeing of this indirect block to be
2156 * atomic in the journal with the updating of the
2157 * bitmap block which owns it. So make some room in
2158 * the journal.
2159 *
2160 * We zero the parent pointer *after* freeing its
2161 * pointee in the bitmaps, so if extend_transaction()
2162 * for some reason fails to put the bitmap changes and
2163 * the release into the same transaction, recovery
2164 * will merely complain about releasing a free block,
2165 * rather than leaking blocks.
2166 */
2167 if (is_handle_aborted(handle))
2168 return;
2169 if (try_to_extend_transaction(handle, inode)) {
2170 ext4_mark_inode_dirty(handle, inode);
2171 ext4_journal_test_restart(handle, inode);
2172 }
2173
2174 ext4_free_blocks(handle, inode, nr, 1);
2175
2176 if (parent_bh) {
2177 /*
2178 * The block which we have just freed is
2179 * pointed to by an indirect block: journal it
2180 */
2181 BUFFER_TRACE(parent_bh, "get_write_access");
2182 if (!ext4_journal_get_write_access(handle,
2183 parent_bh)){
2184 *p = 0;
2185 BUFFER_TRACE(parent_bh,
2186 "call ext4_journal_dirty_metadata");
2187 ext4_journal_dirty_metadata(handle,
2188 parent_bh);
2189 }
2190 }
2191 }
2192 } else {
2193 /* We have reached the bottom of the tree. */
2194 BUFFER_TRACE(parent_bh, "free data blocks");
2195 ext4_free_data(handle, inode, parent_bh, first, last);
2196 }
2197}
2198
2199/*
2200 * ext4_truncate()
2201 *
2202 * We block out ext4_get_block() block instantiations across the entire
2203 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
2204 * simultaneously on behalf of the same inode.
2205 *
2206 * As we work through the truncate and commit bits of it to the journal there
2207 * is one core, guiding principle: the file's tree must always be consistent on
2208 * disk. We must be able to restart the truncate after a crash.
2209 *
2210 * The file's tree may be transiently inconsistent in memory (although it
2211 * probably isn't), but whenever we close off and commit a journal transaction,
2212 * the contents of (the filesystem + the journal) must be consistent and
2213 * restartable. It's pretty simple, really: bottom up, right to left (although
2214 * left-to-right works OK too).
2215 *
2216 * Note that at recovery time, journal replay occurs *before* the restart of
2217 * truncate against the orphan inode list.
2218 *
2219 * The committed inode has the new, desired i_size (which is the same as
2220 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
2221 * that this inode's truncate did not complete and it will again call
2222 * ext4_truncate() to have another go. So there will be instantiated blocks
2223 * to the right of the truncation point in a crashed ext4 filesystem. But
2224 * that's fine - as long as they are linked from the inode, the post-crash
2225 * ext4_truncate() run will find them and release them.
2226 */
2227void ext4_truncate(struct inode *inode)
2228{
2229 handle_t *handle;
2230 struct ext4_inode_info *ei = EXT4_I(inode);
2231 __le32 *i_data = ei->i_data;
2232 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2233 struct address_space *mapping = inode->i_mapping;
2234 int offsets[4];
2235 Indirect chain[4];
2236 Indirect *partial;
2237 __le32 nr = 0;
2238 int n;
2239 long last_block;
2240 unsigned blocksize = inode->i_sb->s_blocksize;
2241 struct page *page;
2242
2243 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2244 S_ISLNK(inode->i_mode)))
2245 return;
2246 if (ext4_inode_is_fast_symlink(inode))
2247 return;
2248 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2249 return;
2250
2251 /*
2252 * We have to lock the EOF page here, because lock_page() nests
2253 * outside jbd2_journal_start().
2254 */
2255 if ((inode->i_size & (blocksize - 1)) == 0) {
2256 /* Block boundary? Nothing to do */
2257 page = NULL;
2258 } else {
2259 page = grab_cache_page(mapping,
2260 inode->i_size >> PAGE_CACHE_SHIFT);
2261 if (!page)
2262 return;
2263 }
2264
2265 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2266 return ext4_ext_truncate(inode, page);
2267
2268 handle = start_transaction(inode);
2269 if (IS_ERR(handle)) {
2270 if (page) {
2271 clear_highpage(page);
2272 flush_dcache_page(page);
2273 unlock_page(page);
2274 page_cache_release(page);
2275 }
2276 return; /* AKPM: return what? */
2277 }
2278
2279 last_block = (inode->i_size + blocksize-1)
2280 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2281
2282 if (page)
2283 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2284
2285 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2286 if (n == 0)
2287 goto out_stop; /* error */
2288
2289 /*
2290 * OK. This truncate is going to happen. We add the inode to the
2291 * orphan list, so that if this truncate spans multiple transactions,
2292 * and we crash, we will resume the truncate when the filesystem
2293 * recovers. It also marks the inode dirty, to catch the new size.
2294 *
2295 * Implication: the file must always be in a sane, consistent
2296 * truncatable state while each transaction commits.
2297 */
2298 if (ext4_orphan_add(handle, inode))
2299 goto out_stop;
2300
2301 /*
2302 * The orphan list entry will now protect us from any crash which
2303 * occurs before the truncate completes, so it is now safe to propagate
2304 * the new, shorter inode size (held for now in i_size) into the
2305 * on-disk inode. We do this via i_disksize, which is the value which
2306 * ext4 *really* writes onto the disk inode.
2307 */
2308 ei->i_disksize = inode->i_size;
2309
2310 /*
2311 * From here we block out all ext4_get_block() callers who want to
2312 * modify the block allocation tree.
2313 */
2314 mutex_lock(&ei->truncate_mutex);
2315
2316 if (n == 1) { /* direct blocks */
2317 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2318 i_data + EXT4_NDIR_BLOCKS);
2319 goto do_indirects;
2320 }
2321
2322 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
2323 /* Kill the top of shared branch (not detached) */
2324 if (nr) {
2325 if (partial == chain) {
2326 /* Shared branch grows from the inode */
2327 ext4_free_branches(handle, inode, NULL,
2328 &nr, &nr+1, (chain+n-1) - partial);
2329 *partial->p = 0;
2330 /*
2331 * We mark the inode dirty prior to restart,
2332 * and prior to stop. No need for it here.
2333 */
2334 } else {
2335 /* Shared branch grows from an indirect block */
2336 BUFFER_TRACE(partial->bh, "get_write_access");
2337 ext4_free_branches(handle, inode, partial->bh,
2338 partial->p,
2339 partial->p+1, (chain+n-1) - partial);
2340 }
2341 }
2342 /* Clear the ends of indirect blocks on the shared branch */
2343 while (partial > chain) {
2344 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
2345 (__le32*)partial->bh->b_data+addr_per_block,
2346 (chain+n-1) - partial);
2347 BUFFER_TRACE(partial->bh, "call brelse");
2348 brelse (partial->bh);
2349 partial--;
2350 }
2351do_indirects:
2352 /* Kill the remaining (whole) subtrees */
2353 switch (offsets[0]) {
2354 default:
2355 nr = i_data[EXT4_IND_BLOCK];
2356 if (nr) {
2357 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2358 i_data[EXT4_IND_BLOCK] = 0;
2359 }
2360 case EXT4_IND_BLOCK:
2361 nr = i_data[EXT4_DIND_BLOCK];
2362 if (nr) {
2363 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2364 i_data[EXT4_DIND_BLOCK] = 0;
2365 }
2366 case EXT4_DIND_BLOCK:
2367 nr = i_data[EXT4_TIND_BLOCK];
2368 if (nr) {
2369 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2370 i_data[EXT4_TIND_BLOCK] = 0;
2371 }
2372 case EXT4_TIND_BLOCK:
2373 ;
2374 }
2375
2376 ext4_discard_reservation(inode);
2377
2378 mutex_unlock(&ei->truncate_mutex);
2379 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2380 ext4_mark_inode_dirty(handle, inode);
2381
2382 /*
2383 * In a multi-transaction truncate, we only make the final transaction
2384 * synchronous
2385 */
2386 if (IS_SYNC(inode))
2387 handle->h_sync = 1;
2388out_stop:
2389 /*
2390 * If this was a simple ftruncate(), and the file will remain alive
2391 * then we need to clear up the orphan record which we created above.
2392 * However, if this was a real unlink then we were called by
2393 * ext4_delete_inode(), and we allow that function to clean up the
2394 * orphan info for us.
2395 */
2396 if (inode->i_nlink)
2397 ext4_orphan_del(handle, inode);
2398
2399 ext4_journal_stop(handle);
2400}
2401
2402static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2403 unsigned long ino, struct ext4_iloc *iloc)
2404{
2405 unsigned long desc, group_desc, block_group;
2406 unsigned long offset;
2407 ext4_fsblk_t block;
2408 struct buffer_head *bh;
2409 struct ext4_group_desc * gdp;
2410
2411 if (!ext4_valid_inum(sb, ino)) {
2412 /*
2413 * This error is already checked for in namei.c unless we are
2414 * looking at an NFS filehandle, in which case no error
2415 * report is needed
2416 */
2417 return 0;
2418 }
2419
2420 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2421 if (block_group >= EXT4_SB(sb)->s_groups_count) {
2422		ext4_error(sb, "ext4_get_inode_block", "group >= groups count");
2423 return 0;
2424 }
2425 smp_rmb();
2426 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2427 desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2428 bh = EXT4_SB(sb)->s_group_desc[group_desc];
2429 if (!bh) {
2430		ext4_error(sb, "ext4_get_inode_block",
2431 "Descriptor not loaded");
2432 return 0;
2433 }
2434
2435 gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
2436 desc * EXT4_DESC_SIZE(sb));
2437 /*
2438 * Figure out the offset within the block group inode table
2439 */
2440 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
2441 EXT4_INODE_SIZE(sb);
2442 block = ext4_inode_table(sb, gdp) +
2443 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
2444
2445 iloc->block_group = block_group;
2446 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
2447 return block;
2448}
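/*
 * Worked example (illustrative numbers): with 8192 inodes per group and
 * 128-byte inodes, ino 16390 gives block_group = 16389 / 8192 = 2 and
 * offset = (16389 % 8192) * 128 = 5 * 128 = 640, so the inode sits 640
 * bytes into group 2's inode table (its first block, for 4k blocks).
 */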
2449
2450/*
2451 * ext4_get_inode_loc returns with an extra refcount against the inode's
2452 * underlying buffer_head on success. If 'in_mem' is true, we have all
2453 * data in memory that is needed to recreate the on-disk version of this
2454 * inode.
2455 */
2456static int __ext4_get_inode_loc(struct inode *inode,
2457 struct ext4_iloc *iloc, int in_mem)
2458{
2459 ext4_fsblk_t block;
2460 struct buffer_head *bh;
2461
2462 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2463 if (!block)
2464 return -EIO;
2465
2466 bh = sb_getblk(inode->i_sb, block);
2467 if (!bh) {
2468		ext4_error(inode->i_sb, "ext4_get_inode_loc",
2469 "unable to read inode block - "
2470 "inode=%lu, block=%llu",
2471 inode->i_ino, block);
2472 return -EIO;
2473 }
2474 if (!buffer_uptodate(bh)) {
2475 lock_buffer(bh);
2476 if (buffer_uptodate(bh)) {
2477 /* someone brought it uptodate while we waited */
2478 unlock_buffer(bh);
2479 goto has_buffer;
2480 }
2481
2482 /*
2483 * If we have all information of the inode in memory and this
2484 * is the only valid inode in the block, we need not read the
2485 * block.
2486 */
2487 if (in_mem) {
2488 struct buffer_head *bitmap_bh;
2489 struct ext4_group_desc *desc;
2490 int inodes_per_buffer;
2491 int inode_offset, i;
2492 int block_group;
2493 int start;
2494
2495 block_group = (inode->i_ino - 1) /
2496 EXT4_INODES_PER_GROUP(inode->i_sb);
2497 inodes_per_buffer = bh->b_size /
2498 EXT4_INODE_SIZE(inode->i_sb);
2499 inode_offset = ((inode->i_ino - 1) %
2500 EXT4_INODES_PER_GROUP(inode->i_sb));
2501 start = inode_offset & ~(inodes_per_buffer - 1);
2502
2503 /* Is the inode bitmap in cache? */
2504 desc = ext4_get_group_desc(inode->i_sb,
2505 block_group, NULL);
2506 if (!desc)
2507 goto make_io;
2508
2509 bitmap_bh = sb_getblk(inode->i_sb,
2510 ext4_inode_bitmap(inode->i_sb, desc));
2511 if (!bitmap_bh)
2512 goto make_io;
2513
2514 /*
2515 * If the inode bitmap isn't in cache then the
2516 * optimisation may end up performing two reads instead
2517 * of one, so skip it.
2518 */
2519 if (!buffer_uptodate(bitmap_bh)) {
2520 brelse(bitmap_bh);
2521 goto make_io;
2522 }
2523 for (i = start; i < start + inodes_per_buffer; i++) {
2524 if (i == inode_offset)
2525 continue;
2526 if (ext4_test_bit(i, bitmap_bh->b_data))
2527 break;
2528 }
2529 brelse(bitmap_bh);
2530 if (i == start + inodes_per_buffer) {
2531 /* all other inodes are free, so skip I/O */
2532 memset(bh->b_data, 0, bh->b_size);
2533 set_buffer_uptodate(bh);
2534 unlock_buffer(bh);
2535 goto has_buffer;
2536 }
2537 }
2538
2539make_io:
2540 /*
2541 * There are other valid inodes in the buffer, this inode
2542 * has in-inode xattrs, or we don't have this inode in memory.
2543 * Read the block from disk.
2544 */
2545 get_bh(bh);
2546 bh->b_end_io = end_buffer_read_sync;
2547 submit_bh(READ_META, bh);
2548 wait_on_buffer(bh);
2549 if (!buffer_uptodate(bh)) {
2550 ext4_error(inode->i_sb, "ext4_get_inode_loc",
2551 "unable to read inode block - "
2552 "inode=%lu, block=%llu",
2553 inode->i_ino, block);
2554 brelse(bh);
2555 return -EIO;
2556 }
2557 }
2558has_buffer:
2559 iloc->bh = bh;
2560 return 0;
2561}
2562
2563int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
2564{
2565 /* We have all inode data except xattrs in memory here. */
2566 return __ext4_get_inode_loc(inode, iloc,
2567 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
2568}
2569
2570void ext4_set_inode_flags(struct inode *inode)
2571{
2572 unsigned int flags = EXT4_I(inode)->i_flags;
2573
2574 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2575 if (flags & EXT4_SYNC_FL)
2576 inode->i_flags |= S_SYNC;
2577 if (flags & EXT4_APPEND_FL)
2578 inode->i_flags |= S_APPEND;
2579 if (flags & EXT4_IMMUTABLE_FL)
2580 inode->i_flags |= S_IMMUTABLE;
2581 if (flags & EXT4_NOATIME_FL)
2582 inode->i_flags |= S_NOATIME;
2583 if (flags & EXT4_DIRSYNC_FL)
2584 inode->i_flags |= S_DIRSYNC;
2585}
2586
2587void ext4_read_inode(struct inode * inode)
2588{
2589 struct ext4_iloc iloc;
2590 struct ext4_inode *raw_inode;
2591 struct ext4_inode_info *ei = EXT4_I(inode);
2592 struct buffer_head *bh;
2593 int block;
2594
2595#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
2596 ei->i_acl = EXT4_ACL_NOT_CACHED;
2597 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
2598#endif
2599 ei->i_block_alloc_info = NULL;
2600
2601 if (__ext4_get_inode_loc(inode, &iloc, 0))
2602 goto bad_inode;
2603 bh = iloc.bh;
2604 raw_inode = ext4_raw_inode(&iloc);
2605 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2606 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2607 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2608	if (!(test_opt(inode->i_sb, NO_UID32))) {
2609 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2610 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2611 }
2612 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2613 inode->i_size = le32_to_cpu(raw_inode->i_size);
2614 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2615 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2616 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2617 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2618
2619 ei->i_state = 0;
2620 ei->i_dir_start_lookup = 0;
2621 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2622	/* We now have enough fields to check if the inode was active or not.
2623	 * This is needed because nfsd might try to access dead inodes;
2624	 * the test is the same one that e2fsck uses.
2625	 * NeilBrown 1999oct15
2626	 */
2627 if (inode->i_nlink == 0) {
2628 if (inode->i_mode == 0 ||
2629 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2630 /* this inode is deleted */
2631 brelse (bh);
2632 goto bad_inode;
2633 }
2634 /* The only unlinked inodes we let through here have
2635 * valid i_mode and are being read by the orphan
2636 * recovery code: that's fine, we're about to complete
2637 * the process of deleting those. */
2638 }
2639 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2640 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2641#ifdef EXT4_FRAGMENTS
2642 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2643 ei->i_frag_no = raw_inode->i_frag;
2644 ei->i_frag_size = raw_inode->i_fsize;
2645#endif
2646 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2647 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2648 cpu_to_le32(EXT4_OS_HURD))
2649 ei->i_file_acl |=
2650 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2651 if (!S_ISREG(inode->i_mode)) {
2652 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2653 } else {
2654 inode->i_size |=
2655 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2656 }
2657 ei->i_disksize = inode->i_size;
2658 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2659 ei->i_block_group = iloc.block_group;
2660 /*
2661 * NOTE! The in-memory inode i_data array is in little-endian order
2662 * even on big-endian machines: we do NOT byteswap the block numbers!
2663 */
2664 for (block = 0; block < EXT4_N_BLOCKS; block++)
2665 ei->i_data[block] = raw_inode->i_block[block];
2666 INIT_LIST_HEAD(&ei->i_orphan);
2667
2668 if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
2669 EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2670 /*
2671 * When mke2fs creates big inodes it does not zero out
2672 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
2673 * so ignore those first few inodes.
2674 */
2675 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2676 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2677 EXT4_INODE_SIZE(inode->i_sb))
2678 goto bad_inode;
2679 if (ei->i_extra_isize == 0) {
2680 /* The extra space is currently unused. Use it. */
2681 ei->i_extra_isize = sizeof(struct ext4_inode) -
2682 EXT4_GOOD_OLD_INODE_SIZE;
2683 } else {
2684 __le32 *magic = (void *)raw_inode +
2685 EXT4_GOOD_OLD_INODE_SIZE +
2686 ei->i_extra_isize;
2687 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
2688 ei->i_state |= EXT4_STATE_XATTR;
2689 }
2690 } else
2691 ei->i_extra_isize = 0;
2692
2693 if (S_ISREG(inode->i_mode)) {
2694 inode->i_op = &ext4_file_inode_operations;
2695 inode->i_fop = &ext4_file_operations;
2696 ext4_set_aops(inode);
2697 } else if (S_ISDIR(inode->i_mode)) {
2698 inode->i_op = &ext4_dir_inode_operations;
2699 inode->i_fop = &ext4_dir_operations;
2700 } else if (S_ISLNK(inode->i_mode)) {
2701 if (ext4_inode_is_fast_symlink(inode))
2702 inode->i_op = &ext4_fast_symlink_inode_operations;
2703 else {
2704 inode->i_op = &ext4_symlink_inode_operations;
2705 ext4_set_aops(inode);
2706 }
2707 } else {
2708 inode->i_op = &ext4_special_inode_operations;
2709 if (raw_inode->i_block[0])
2710 init_special_inode(inode, inode->i_mode,
2711 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2712 else
2713 init_special_inode(inode, inode->i_mode,
2714 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2715 }
2716 brelse (iloc.bh);
2717 ext4_set_inode_flags(inode);
2718 return;
2719
2720bad_inode:
2721 make_bad_inode(inode);
2722 return;
2723}
2724
2725/*
2726 * Post the struct inode info into an on-disk inode location in the
2727 * buffer-cache. This gobbles the caller's reference to the
2728 * buffer_head in the inode location struct.
2729 *
2730 * The caller must have write access to iloc->bh.
2731 */
2732static int ext4_do_update_inode(handle_t *handle,
2733 struct inode *inode,
2734 struct ext4_iloc *iloc)
2735{
2736 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
2737 struct ext4_inode_info *ei = EXT4_I(inode);
2738 struct buffer_head *bh = iloc->bh;
2739 int err = 0, rc, block;
2740
2741	/* For fields not tracked in the in-memory inode,
2742 * initialise them to zero for new inodes. */
2743 if (ei->i_state & EXT4_STATE_NEW)
2744 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
2745
2746 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2747	if (!(test_opt(inode->i_sb, NO_UID32))) {
2748 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2749 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2750/*
2751 * Fix up interoperability with old kernels. Otherwise, old inodes get
2752 * re-used with the upper 16 bits of the uid/gid intact
2753 */
2754		if (!ei->i_dtime) {
2755 raw_inode->i_uid_high =
2756 cpu_to_le16(high_16_bits(inode->i_uid));
2757 raw_inode->i_gid_high =
2758 cpu_to_le16(high_16_bits(inode->i_gid));
2759 } else {
2760 raw_inode->i_uid_high = 0;
2761 raw_inode->i_gid_high = 0;
2762 }
2763 } else {
2764 raw_inode->i_uid_low =
2765 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2766 raw_inode->i_gid_low =
2767 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2768 raw_inode->i_uid_high = 0;
2769 raw_inode->i_gid_high = 0;
2770 }
2771 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2772 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2773 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2774 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2775 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2776 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2777 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2778 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2779#ifdef EXT4_FRAGMENTS
2780 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2781 raw_inode->i_frag = ei->i_frag_no;
2782 raw_inode->i_fsize = ei->i_frag_size;
2783#endif
2784 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2785 cpu_to_le32(EXT4_OS_HURD))
2786 raw_inode->i_file_acl_high =
2787 cpu_to_le16(ei->i_file_acl >> 32);
2788 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2789 if (!S_ISREG(inode->i_mode)) {
2790 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2791 } else {
2792 raw_inode->i_size_high =
2793 cpu_to_le32(ei->i_disksize >> 32);
2794 if (ei->i_disksize > 0x7fffffffULL) {
2795 struct super_block *sb = inode->i_sb;
2796 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2797 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2798 EXT4_SB(sb)->s_es->s_rev_level ==
2799 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2800 /* If this is the first large file
2801 * created, add a flag to the superblock.
2802 */
2803 err = ext4_journal_get_write_access(handle,
2804 EXT4_SB(sb)->s_sbh);
2805 if (err)
2806 goto out_brelse;
2807 ext4_update_dynamic_rev(sb);
2808 EXT4_SET_RO_COMPAT_FEATURE(sb,
2809 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2810 sb->s_dirt = 1;
2811 handle->h_sync = 1;
2812 err = ext4_journal_dirty_metadata(handle,
2813 EXT4_SB(sb)->s_sbh);
2814 }
2815 }
2816 }
2817 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2818 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2819 if (old_valid_dev(inode->i_rdev)) {
2820 raw_inode->i_block[0] =
2821 cpu_to_le32(old_encode_dev(inode->i_rdev));
2822 raw_inode->i_block[1] = 0;
2823 } else {
2824 raw_inode->i_block[0] = 0;
2825 raw_inode->i_block[1] =
2826 cpu_to_le32(new_encode_dev(inode->i_rdev));
2827 raw_inode->i_block[2] = 0;
2828 }
2829 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2830 raw_inode->i_block[block] = ei->i_data[block];
2831
2832 if (ei->i_extra_isize)
2833 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2834
2835 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2836 rc = ext4_journal_dirty_metadata(handle, bh);
2837 if (!err)
2838 err = rc;
2839 ei->i_state &= ~EXT4_STATE_NEW;
2840
2841out_brelse:
2842 brelse (bh);
2843 ext4_std_error(inode->i_sb, err);
2844 return err;
2845}
2846
2847/*
2848 * ext4_write_inode()
2849 *
2850 * We are called from a few places:
2851 *
2852 * - Within generic_file_write() for O_SYNC files.
2853 * Here, there will be no transaction running. We wait for any running
2854 *   transaction to commit.
2855 *
2856 * - Within sys_sync(), kupdate and such.
2857 *   We wait on commit, if told to.
2858 *
2859 * - Within prune_icache() (PF_MEMALLOC == true)
2860 * Here we simply return. We can't afford to block kswapd on the
2861 * journal commit.
2862 *
2863 * In all cases it is actually safe for us to return without doing anything,
2864 * because the inode has been copied into a raw inode buffer in
2865 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2866 * knfsd.
2867 *
2868 * Note that we are absolutely dependent upon all inode dirtiers doing the
2869 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2870 * which we are interested.
2871 *
2872 * It would be a bug for them to not do this. The code:
2873 *
2874 * mark_inode_dirty(inode)
2875 * stuff();
2876 * inode->i_size = expr;
2877 *
2878 * is in error because a kswapd-driven write_inode() could occur while
2879 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2880 * will no longer be on the superblock's dirty inode list.
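 *
 * The correct order is, of course:
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);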
2881 */
2882int ext4_write_inode(struct inode *inode, int wait)
2883{
2884 if (current->flags & PF_MEMALLOC)
2885 return 0;
2886
2887 if (ext4_journal_current_handle()) {
2888 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2889 dump_stack();
2890 return -EIO;
2891 }
2892
2893 if (!wait)
2894 return 0;
2895
2896 return ext4_force_commit(inode->i_sb);
2897}
2898
2899/*
2900 * ext4_setattr()
2901 *
2902 * Called from notify_change.
2903 *
2904 * We want to trap VFS attempts to truncate the file as soon as
2905 * possible. In particular, we want to make sure that when the VFS
2906 * shrinks i_size, we put the inode on the orphan list and modify
2907 * i_disksize immediately, so that during the subsequent flushing of
2908 * dirty pages and freeing of disk blocks, we can guarantee that any
2909 * commit will leave the blocks being flushed in an unused state on
2910 * disk. (On recovery, the inode will get truncated and the blocks will
2911 * be freed, so we have a strong guarantee that no future commit will
2912 * leave these blocks visible to the user.)
2913 *
2914 * Called with inode->sem down.
2915 */
2916int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2917{
2918 struct inode *inode = dentry->d_inode;
2919 int error, rc = 0;
2920 const unsigned int ia_valid = attr->ia_valid;
2921
2922 error = inode_change_ok(inode, attr);
2923 if (error)
2924 return error;
2925
2926 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2927 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2928 handle_t *handle;
2929
2930 /* (user+group)*(old+new) structure, inode write (sb,
2931 * inode block, ? - but truncate inode update has it) */
2932 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2933 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2934 if (IS_ERR(handle)) {
2935 error = PTR_ERR(handle);
2936 goto err_out;
2937 }
2938 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2939 if (error) {
2940 ext4_journal_stop(handle);
2941 return error;
2942 }
2943 /* Update corresponding info in inode so that everything is in
2944 * one transaction */
2945 if (attr->ia_valid & ATTR_UID)
2946 inode->i_uid = attr->ia_uid;
2947 if (attr->ia_valid & ATTR_GID)
2948 inode->i_gid = attr->ia_gid;
2949 error = ext4_mark_inode_dirty(handle, inode);
2950 ext4_journal_stop(handle);
2951 }
2952
2953 if (S_ISREG(inode->i_mode) &&
2954 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2955 handle_t *handle;
2956
2957 handle = ext4_journal_start(inode, 3);
2958 if (IS_ERR(handle)) {
2959 error = PTR_ERR(handle);
2960 goto err_out;
2961 }
2962
2963 error = ext4_orphan_add(handle, inode);
2964 EXT4_I(inode)->i_disksize = attr->ia_size;
2965 rc = ext4_mark_inode_dirty(handle, inode);
2966 if (!error)
2967 error = rc;
2968 ext4_journal_stop(handle);
2969 }
2970
2971 rc = inode_setattr(inode, attr);
2972
2973 /* If inode_setattr's call to ext4_truncate failed to get a
2974 * transaction handle at all, we need to clean up the in-core
2975 * orphan list manually. */
2976 if (inode->i_nlink)
2977 ext4_orphan_del(NULL, inode);
2978
2979 if (!rc && (ia_valid & ATTR_MODE))
2980 rc = ext4_acl_chmod(inode);
2981
2982err_out:
2983 ext4_std_error(inode->i_sb, error);
2984 if (!error)
2985 error = rc;
2986 return error;
2987}
2988
2989
2990/*
2991 * How many blocks doth make a writepage()?
2992 *
2993 * With N blocks per page, it may be:
2994 * N data blocks
2995 * 2 indirect blocks
2996 * 2 dindirect
2997 * 1 tindirect
2998 * N+5 bitmap blocks (from the above)
2999 * N+5 group descriptor summary blocks
3000 * 1 inode block
3001 * 1 superblock.
3002 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
3003 *
3004 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
3005 *
3006 * With ordered or writeback data it's the same, less the N data blocks.
3007 *
3008 * If the inode's direct blocks can hold an integral number of pages then a
3009 * page cannot straddle two indirect blocks, and we can only touch one indirect
3010 * and dindirect block, and the "5" above becomes "3".
3011 *
3012 * This still overestimates under most circumstances. If we were to pass the
3013 * start and end offsets in here as well we could do block_to_path() on each
3014 * block and work out the exact number of indirects which are touched. Pah.
3015 */
3016
3017int ext4_writepage_trans_blocks(struct inode *inode)
3018{
3019 int bpp = ext4_journal_blocks_per_page(inode);
3020 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
3021 int ret;
3022
3023 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3024 return ext4_ext_writepage_trans_blocks(inode, bpp);
3025
3026 if (ext4_should_journal_data(inode))
3027 ret = 3 * (bpp + indirects) + 2;
3028 else
3029 ret = 2 * (bpp + indirects) + 2;
3030
3031#ifdef CONFIG_QUOTA
3032 /* We know that structure was already allocated during DQUOT_INIT so
3033 * we will be updating only the data blocks + inodes */
3034 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
3035#endif
3036
3037 return ret;
3038}
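/*
 * Worked example of the estimate above, assuming 4k blocks on 4k pages
 * (so bpp == 1): EXT4_NDIR_BLOCKS (12) % 1 == 0, hence indirects == 3,
 * and data=journal mode reserves 3 * (1 + 3) + 2 = 14 buffer credits,
 * plus 2 * EXT4_QUOTA_TRANS_BLOCKS when quota is compiled in.
 */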
3039
3040/*
3041 * The caller must have previously called ext4_reserve_inode_write().
3042 * Given this, we know that the caller already has write access to iloc->bh.
3043 */
3044int ext4_mark_iloc_dirty(handle_t *handle,
3045 struct inode *inode, struct ext4_iloc *iloc)
3046{
3047 int err = 0;
3048
3049 /* the do_update_inode consumes one bh->b_count */
3050 get_bh(iloc->bh);
3051
3052 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
3053 err = ext4_do_update_inode(handle, inode, iloc);
3054 put_bh(iloc->bh);
3055 return err;
3056}
3057
3058/*
3059 * On success, we end up with an outstanding reference count against
3060 * iloc->bh. This _must_ be cleaned up later.
3061 */
3062
3063int
3064ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3065 struct ext4_iloc *iloc)
3066{
3067 int err = 0;
3068 if (handle) {
3069 err = ext4_get_inode_loc(inode, iloc);
3070 if (!err) {
3071 BUFFER_TRACE(iloc->bh, "get_write_access");
3072 err = ext4_journal_get_write_access(handle, iloc->bh);
3073 if (err) {
3074 brelse(iloc->bh);
3075 iloc->bh = NULL;
3076 }
3077 }
3078 }
3079 ext4_std_error(inode->i_sb, err);
3080 return err;
3081}
3082
3083/*
3084 * What we do here is to mark the in-core inode as clean with respect to inode
3085 * dirtiness (it may still be data-dirty).
3086 * This means that the in-core inode may be reaped by prune_icache
3087 * without having to perform any I/O. This is a very good thing,
3088 * because *any* task may call prune_icache - even ones which
3089 * have a transaction open against a different journal.
3090 *
3091 * Is this cheating? Not really. Sure, we haven't written the
3092 * inode out, but prune_icache isn't a user-visible syncing function.
3093 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3094 * we start and wait on commits.
3095 *
3096 * Is this efficient/effective? Well, we're being nice to the system
3097 * by cleaning up our inodes proactively so they can be reaped
3098 * without I/O. But we are potentially leaving up to five seconds'
3099 * worth of inodes floating about which prune_icache wants us to
3100 * write out. One way to fix that would be to get prune_icache()
3101 * to do a write_super() to free up some memory. It has the desired
3102 * effect.
3103 */
3104int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
3105{
3106 struct ext4_iloc iloc;
3107 int err;
3108
3109 might_sleep();
3110 err = ext4_reserve_inode_write(handle, inode, &iloc);
3111 if (!err)
3112 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
3113 return err;
3114}
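/*
 * Hypothetical caller sketch (not code from this file): the usual
 * pattern is to start a handle, update the in-core inode, then mark it
 * dirty so the raw inode buffer joins the running transaction:
 *
 *	handle = ext4_journal_start(inode, 1);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	inode->i_ctime = CURRENT_TIME_SEC;
 *	err = ext4_mark_inode_dirty(handle, inode);
 *	ext4_journal_stop(handle);
 */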
3115
3116/*
3117 * ext4_dirty_inode() is called from __mark_inode_dirty()
3118 *
3119 * We're really interested in the case where a file is being extended.
3120 * i_size has been changed by generic_commit_write() and we thus need
3121 * to include the updated inode in the current transaction.
3122 *
3123 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3124 * are allocated to the file.
3125 *
3126 * If the inode is marked synchronous, we don't honour that here - doing
3127 * so would cause a commit on atime updates, which we don't bother doing.
3128 * We handle synchronous inodes at the highest possible level.
3129 */
3130void ext4_dirty_inode(struct inode *inode)
3131{
3132 handle_t *current_handle = ext4_journal_current_handle();
3133 handle_t *handle;
3134
3135 handle = ext4_journal_start(inode, 2);
3136 if (IS_ERR(handle))
3137 goto out;
3138 if (current_handle &&
3139 current_handle->h_transaction != handle->h_transaction) {
3140 /* This task has a transaction open against a different fs */
3141 printk(KERN_EMERG "%s: transactions do not match!\n",
3142 __FUNCTION__);
3143 } else {
3144 jbd_debug(5, "marking dirty. outer handle=%p\n",
3145 current_handle);
3146 ext4_mark_inode_dirty(handle, inode);
3147 }
3148 ext4_journal_stop(handle);
3149out:
3150 return;
3151}
3152
3153#if 0
3154/*
3155 * Bind an inode's backing buffer_head into this transaction, to prevent
3156 * it from being flushed to disk early. Unlike
3157 * ext4_reserve_inode_write, this leaves behind no bh reference and
3158 * returns no iloc structure, so the caller needs to repeat the iloc
3159 * lookup to mark the inode dirty later.
3160 */
3161static int ext4_pin_inode(handle_t *handle, struct inode *inode)
3162{
3163 struct ext4_iloc iloc;
3164
3165 int err = 0;
3166 if (handle) {
3167 err = ext4_get_inode_loc(inode, &iloc);
3168 if (!err) {
3169 BUFFER_TRACE(iloc.bh, "get_write_access");
3170 err = jbd2_journal_get_write_access(handle, iloc.bh);
3171 if (!err)
3172 err = ext4_journal_dirty_metadata(handle,
3173 iloc.bh);
3174 brelse(iloc.bh);
3175 }
3176 }
3177 ext4_std_error(inode->i_sb, err);
3178 return err;
3179}
3180#endif
3181
3182int ext4_change_inode_journal_flag(struct inode *inode, int val)
3183{
3184 journal_t *journal;
3185 handle_t *handle;
3186 int err;
3187
3188 /*
3189 * We have to be very careful here: changing a data block's
3190 * journaling status dynamically is dangerous. If we write a
3191 * data block to the journal, change the status and then delete
3192 * that block, we risk forgetting to revoke the old log record
3193 * from the journal and so a subsequent replay can corrupt data.
3194 * So, first we make sure that the journal is empty and that
3195 * nobody is changing anything.
3196 */
3197
3198 journal = EXT4_JOURNAL(inode);
3199 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3200 return -EROFS;
3201
3202 jbd2_journal_lock_updates(journal);
3203 jbd2_journal_flush(journal);
3204
3205 /*
3206 * OK, there are no updates running now, and all cached data is
3207 * synced to disk. We are now in a completely consistent state
3208 * which doesn't have anything in the journal, and we know that
3209 * no filesystem updates are running, so it is safe to modify
3210 * the inode's in-core data-journaling state flag now.
3211 */
3212
3213 if (val)
3214 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
3215 else
3216 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
3217 ext4_set_aops(inode);
3218
3219 jbd2_journal_unlock_updates(journal);
3220
3221 /* Finally we can mark the inode as dirty. */
3222
3223 handle = ext4_journal_start(inode, 1);
3224 if (IS_ERR(handle))
3225 return PTR_ERR(handle);
3226
3227 err = ext4_mark_inode_dirty(handle, inode);
3228 handle->h_sync = 1;
3229 ext4_journal_stop(handle);
3230 ext4_std_error(inode->i_sb, err);
3231
3232 return err;
3233}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
1/*
2 * linux/fs/ext4/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h>
13#include <linux/ext4_fs.h>
14#include <linux/ext4_jbd2.h>
15#include <linux/time.h>
16#include <linux/compat.h>
17#include <linux/smp_lock.h>
18#include <asm/uaccess.h>
19
20int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
21 unsigned long arg)
22{
23 struct ext4_inode_info *ei = EXT4_I(inode);
24 unsigned int flags;
25 unsigned short rsv_window_size;
26
27 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28
29 switch (cmd) {
30 case EXT4_IOC_GETFLAGS:
31 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg);
33 case EXT4_IOC_SETFLAGS: {
34 handle_t *handle = NULL;
35 int err;
36 struct ext4_iloc iloc;
37 unsigned int oldflags;
38 unsigned int jflag;
39
40 if (IS_RDONLY(inode))
41 return -EROFS;
42
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
44 return -EACCES;
45
46 if (get_user(flags, (int __user *) arg))
47 return -EFAULT;
48
49 if (!S_ISDIR(inode->i_mode))
50 flags &= ~EXT4_DIRSYNC_FL;
51
52 mutex_lock(&inode->i_mutex);
53 oldflags = ei->i_flags;
54
55 /* The JOURNAL_DATA flag is modifiable only by root */
56 jflag = flags & EXT4_JOURNAL_DATA_FL;
57
58 /*
59 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
60 * the relevant capability.
61 *
62 * This test looks nicer. Thanks to Pauline Middelink
63 */
64 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
65 if (!capable(CAP_LINUX_IMMUTABLE)) {
66 mutex_unlock(&inode->i_mutex);
67 return -EPERM;
68 }
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE)) {
77 mutex_unlock(&inode->i_mutex);
78 return -EPERM;
79 }
80 }
81
82
83 handle = ext4_journal_start(inode, 1);
84 if (IS_ERR(handle)) {
85 mutex_unlock(&inode->i_mutex);
86 return PTR_ERR(handle);
87 }
88 if (IS_SYNC(inode))
89 handle->h_sync = 1;
90 err = ext4_reserve_inode_write(handle, inode, &iloc);
91 if (err)
92 goto flags_err;
93
94 flags = flags & EXT4_FL_USER_MODIFIABLE;
95 flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
96 ei->i_flags = flags;
97
98 ext4_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC;
100
101 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
102flags_err:
103 ext4_journal_stop(handle);
104 if (err) {
105 mutex_unlock(&inode->i_mutex);
106 return err;
107 }
108
109 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
110 err = ext4_change_inode_journal_flag(inode, jflag);
111 mutex_unlock(&inode->i_mutex);
112 return err;
113 }
114 case EXT4_IOC_GETVERSION:
115 case EXT4_IOC_GETVERSION_OLD:
116 return put_user(inode->i_generation, (int __user *) arg);
117 case EXT4_IOC_SETVERSION:
118 case EXT4_IOC_SETVERSION_OLD: {
119 handle_t *handle;
120 struct ext4_iloc iloc;
121 __u32 generation;
122 int err;
123
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
125 return -EPERM;
126 if (IS_RDONLY(inode))
127 return -EROFS;
128 if (get_user(generation, (int __user *) arg))
129 return -EFAULT;
130
131 handle = ext4_journal_start(inode, 1);
132 if (IS_ERR(handle))
133 return PTR_ERR(handle);
134 err = ext4_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC;
137 inode->i_generation = generation;
138 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
139 }
140 ext4_journal_stop(handle);
141 return err;
142 }
143#ifdef CONFIG_JBD_DEBUG
144 case EXT4_IOC_WAIT_FOR_READONLY:
145 /*
146 * This is racy - by the time we're woken up and running,
147 * the superblock could be released. And the module could
148 * have been unloaded. So sue me.
149 *
150 * Returns 1 if it slept, else zero.
151 */
152 {
153 struct super_block *sb = inode->i_sb;
154 DECLARE_WAITQUEUE(wait, current);
155 int ret = 0;
156
157 set_current_state(TASK_INTERRUPTIBLE);
158 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
159 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
160 schedule();
161 ret = 1;
162 }
163 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
164 return ret;
165 }
166#endif
167 case EXT4_IOC_GETRSVSZ:
168 if (test_opt(inode->i_sb, RESERVATION)
169 && S_ISREG(inode->i_mode)
170 && ei->i_block_alloc_info) {
171 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
172 return put_user(rsv_window_size, (int __user *)arg);
173 }
174 return -ENOTTY;
175 case EXT4_IOC_SETRSVSZ: {
176
177		if (!test_opt(inode->i_sb, RESERVATION) || !S_ISREG(inode->i_mode))
178 return -ENOTTY;
179
180 if (IS_RDONLY(inode))
181 return -EROFS;
182
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
184 return -EACCES;
185
186 if (get_user(rsv_window_size, (int __user *)arg))
187 return -EFAULT;
188
189 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
190 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
191
192 /*
193		 * need to allocate the reservation structure for this inode
194		 * before setting the window size
195 */
196 mutex_lock(&ei->truncate_mutex);
197 if (!ei->i_block_alloc_info)
198 ext4_init_block_alloc_info(inode);
199
200		if (ei->i_block_alloc_info) {
201 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
202 rsv->rsv_goal_size = rsv_window_size;
203 }
204 mutex_unlock(&ei->truncate_mutex);
205 return 0;
206 }
207 case EXT4_IOC_GROUP_EXTEND: {
208 ext4_fsblk_t n_blocks_count;
209 struct super_block *sb = inode->i_sb;
210 int err;
211
212 if (!capable(CAP_SYS_RESOURCE))
213 return -EPERM;
214
215 if (IS_RDONLY(inode))
216 return -EROFS;
217
218 if (get_user(n_blocks_count, (__u32 __user *)arg))
219 return -EFAULT;
220
221 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
222 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
223 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
224 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
225
226 return err;
227 }
228 case EXT4_IOC_GROUP_ADD: {
229 struct ext4_new_group_data input;
230 struct super_block *sb = inode->i_sb;
231 int err;
232
233 if (!capable(CAP_SYS_RESOURCE))
234 return -EPERM;
235
236 if (IS_RDONLY(inode))
237 return -EROFS;
238
239 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
240 sizeof(input)))
241 return -EFAULT;
242
243 err = ext4_group_add(sb, &input);
244 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
245 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
246 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
247
248 return err;
249 }
250
251 default:
252 return -ENOTTY;
253 }
254}
255
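/*
 * Hypothetical userspace sketch of the flags interface implemented
 * above (ioctl numbers from the ext4 headers; error handling omitted):
 *
 *	int flags;
 *	ioctl(fd, EXT4_IOC_GETFLAGS, &flags);
 *	flags |= EXT4_APPEND_FL;
 *	ioctl(fd, EXT4_IOC_SETFLAGS, &flags);
 */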
256#ifdef CONFIG_COMPAT
257long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258{
259 struct inode *inode = file->f_dentry->d_inode;
260 int ret;
261
262	/* These are just misnamed; they actually get/put an int from/to user space */
263 switch (cmd) {
264 case EXT4_IOC32_GETFLAGS:
265 cmd = EXT4_IOC_GETFLAGS;
266 break;
267 case EXT4_IOC32_SETFLAGS:
268 cmd = EXT4_IOC_SETFLAGS;
269 break;
270 case EXT4_IOC32_GETVERSION:
271 cmd = EXT4_IOC_GETVERSION;
272 break;
273 case EXT4_IOC32_SETVERSION:
274 cmd = EXT4_IOC_SETVERSION;
275 break;
276 case EXT4_IOC32_GROUP_EXTEND:
277 cmd = EXT4_IOC_GROUP_EXTEND;
278 break;
279 case EXT4_IOC32_GETVERSION_OLD:
280 cmd = EXT4_IOC_GETVERSION_OLD;
281 break;
282 case EXT4_IOC32_SETVERSION_OLD:
283 cmd = EXT4_IOC_SETVERSION_OLD;
284 break;
285#ifdef CONFIG_JBD_DEBUG
286 case EXT4_IOC32_WAIT_FOR_READONLY:
287 cmd = EXT4_IOC_WAIT_FOR_READONLY;
288 break;
289#endif
290 case EXT4_IOC32_GETRSVSZ:
291 cmd = EXT4_IOC_GETRSVSZ;
292 break;
293 case EXT4_IOC32_SETRSVSZ:
294 cmd = EXT4_IOC_SETRSVSZ;
295 break;
296 case EXT4_IOC_GROUP_ADD:
297 break;
298 default:
299 return -ENOIOCTLCMD;
300 }
301 lock_kernel();
302 ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
303 unlock_kernel();
304 return ret;
305}
306#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
1/*
2 * linux/fs/ext4/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h>
31#include <linux/ext4_fs.h>
32#include <linux/ext4_jbd2.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39#include <linux/smp_lock.h>
40
41#include "namei.h"
42#include "xattr.h"
43#include "acl.h"
44
45/*
46 * define how far ahead to read directories while searching them.
47 */
48#define NAMEI_RA_CHUNKS 2
49#define NAMEI_RA_BLOCKS 4
50#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
51#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
52
53static struct buffer_head *ext4_append(handle_t *handle,
54 struct inode *inode,
55 u32 *block, int *err)
56{
57 struct buffer_head *bh;
58
59 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
60
61 if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
62 inode->i_size += inode->i_sb->s_blocksize;
63 EXT4_I(inode)->i_disksize = inode->i_size;
64 ext4_journal_get_write_access(handle,bh);
65 }
66 return bh;
67}
68
69#ifndef assert
70#define assert(test) J_ASSERT(test)
71#endif
72
73#ifndef swap
74#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
75#endif
76
77#ifdef DX_DEBUG
78#define dxtrace(command) command
79#else
80#define dxtrace(command)
81#endif
82
83struct fake_dirent
84{
85 __le32 inode;
86 __le16 rec_len;
87 u8 name_len;
88 u8 file_type;
89};
90
91struct dx_countlimit
92{
93 __le16 limit;
94 __le16 count;
95};
96
97struct dx_entry
98{
99 __le32 hash;
100 __le32 block;
101};
102
103/*
104 * dx_root_info is laid out so that if it should somehow get overlaid by a
105 * dirent the two low bits of the hash version will be zero. Therefore, the
106 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
107 */
108
109struct dx_root
110{
111 struct fake_dirent dot;
112 char dot_name[4];
113 struct fake_dirent dotdot;
114 char dotdot_name[4];
115 struct dx_root_info
116 {
117 __le32 reserved_zero;
118 u8 hash_version;
119 u8 info_length; /* 8 */
120 u8 indirect_levels;
121 u8 unused_flags;
122 }
123 info;
124 struct dx_entry entries[0];
125};
126
127struct dx_node
128{
129 struct fake_dirent fake;
130 struct dx_entry entries[0];
131};
132
133
134struct dx_frame
135{
136 struct buffer_head *bh;
137 struct dx_entry *entries;
138 struct dx_entry *at;
139};
140
141struct dx_map_entry
142{
143 u32 hash;
144 u32 offs;
145};
146
147#ifdef CONFIG_EXT4_INDEX
148static inline unsigned dx_get_block (struct dx_entry *entry);
149static void dx_set_block (struct dx_entry *entry, unsigned value);
150static inline unsigned dx_get_hash (struct dx_entry *entry);
151static void dx_set_hash (struct dx_entry *entry, unsigned value);
152static unsigned dx_get_count (struct dx_entry *entries);
153static unsigned dx_get_limit (struct dx_entry *entries);
154static void dx_set_count (struct dx_entry *entries, unsigned value);
155static void dx_set_limit (struct dx_entry *entries, unsigned value);
156static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
157static unsigned dx_node_limit (struct inode *dir);
158static struct dx_frame *dx_probe(struct dentry *dentry,
159 struct inode *dir,
160 struct dx_hash_info *hinfo,
161 struct dx_frame *frame,
162 int *err);
163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count);
169static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame,
173 struct dx_frame *frames,
174 __u32 *start_hash);
175static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
176 struct ext4_dir_entry_2 **res_dir, int *err);
177static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
178 struct inode *inode);
179
180/*
181 * Future: use high four bits of block for coalesce-on-delete flags
182 * Mask them off for now.
183 */
184
185static inline unsigned dx_get_block (struct dx_entry *entry)
186{
187 return le32_to_cpu(entry->block) & 0x00ffffff;
188}
189
190static inline void dx_set_block (struct dx_entry *entry, unsigned value)
191{
192 entry->block = cpu_to_le32(value);
193}
194
195static inline unsigned dx_get_hash (struct dx_entry *entry)
196{
197 return le32_to_cpu(entry->hash);
198}
199
200static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
201{
202 entry->hash = cpu_to_le32(value);
203}
204
205static inline unsigned dx_get_count (struct dx_entry *entries)
206{
207 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
208}
209
210static inline unsigned dx_get_limit (struct dx_entry *entries)
211{
212 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
213}
214
215static inline void dx_set_count (struct dx_entry *entries, unsigned value)
216{
217 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
218}
219
220static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
221{
222 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
223}
224
225static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
226{
227 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
228 EXT4_DIR_REC_LEN(2) - infosize;
229 return 0? 20: entry_space / sizeof(struct dx_entry);
230}
231
232static inline unsigned dx_node_limit (struct inode *dir)
233{
234 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
235 return 0? 22: entry_space / sizeof(struct dx_entry);
236}
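/*
 * Worked example of the two limits above for a 4k block: the root
 * block loses EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) (12 + 12 bytes
 * for the "." and ".." stubs) plus the 8-byte info, leaving
 * (4096 - 32) / 8 = 508 dx_entry slots; an interior node only loses
 * the 8-byte fake dirent, giving (4096 - 8) / 8 = 511 slots.
 */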
237
238/*
239 * Debug
240 */
241#ifdef DX_DEBUG
242static void dx_show_index (char * label, struct dx_entry *entries)
243{
244 int i, n = dx_get_count (entries);
245 printk("%s index ", label);
246 for (i = 0; i < n; i++) {
247 printk("%x->%u ", i? dx_get_hash(entries + i) :
248 0, dx_get_block(entries + i));
249 }
250 printk("\n");
251}
252
253struct stats
254{
255 unsigned names;
256 unsigned space;
257 unsigned bcount;
258};
259
260static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
261 int size, int show_names)
262{
263 unsigned names = 0, space = 0;
264 char *base = (char *) de;
265 struct dx_hash_info h = *hinfo;
266
267 printk("names: ");
268 while ((char *) de < base + size)
269 {
270 if (de->inode)
271 {
272 if (show_names)
273 {
274 int len = de->name_len;
275 char *name = de->name;
276 while (len--) printk("%c", *name++);
277 ext4fs_dirhash(de->name, de->name_len, &h);
278 printk(":%x.%u ", h.hash,
279 ((char *) de - base));
280 }
281 space += EXT4_DIR_REC_LEN(de->name_len);
282 names++;
283 }
284 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
285 }
286 printk("(%i)\n", names);
287 return (struct stats) { names, space, 1 };
288}
289
290struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
291 struct dx_entry *entries, int levels)
292{
293 unsigned blocksize = dir->i_sb->s_blocksize;
294 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
295 unsigned bcount = 0;
296 struct buffer_head *bh;
297 int err;
298 printk("%i indexed blocks...\n", count);
299 for (i = 0; i < count; i++, entries++)
300 {
301 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
302 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
303 struct stats stats;
304 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
305 if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
306 stats = levels?
307 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
308 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
309 names += stats.names;
310 space += stats.space;
311 bcount += stats.bcount;
312 brelse (bh);
313 }
314 if (bcount)
315 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
316 names, space/bcount,(space/bcount)*100/blocksize);
317 return (struct stats) { names, space, bcount};
318}
319#endif /* DX_DEBUG */
320
321/*
322 * Probe for a directory leaf block to search.
323 *
324 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
325 * error in the directory index, and the caller should fall back to
326 * searching the directory normally. The callers of dx_probe **MUST**
327 * check for this error code, and make sure it never gets reflected
328 * back to userspace.
329 */
330static struct dx_frame *
331dx_probe(struct dentry *dentry, struct inode *dir,
332 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
333{
334 unsigned count, indirect;
335 struct dx_entry *at, *entries, *p, *q, *m;
336 struct dx_root *root;
337 struct buffer_head *bh;
338 struct dx_frame *frame = frame_in;
339 u32 hash;
340
341 frame->bh = NULL;
342 if (dentry)
343 dir = dentry->d_parent->d_inode;
344 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
345 goto fail;
346 root = (struct dx_root *) bh->b_data;
347 if (root->info.hash_version != DX_HASH_TEA &&
348 root->info.hash_version != DX_HASH_HALF_MD4 &&
349 root->info.hash_version != DX_HASH_LEGACY) {
350 ext4_warning(dir->i_sb, __FUNCTION__,
351 "Unrecognised inode hash code %d",
352 root->info.hash_version);
353 brelse(bh);
354 *err = ERR_BAD_DX_DIR;
355 goto fail;
356 }
357 hinfo->hash_version = root->info.hash_version;
358 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
359 if (dentry)
360 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext4_warning(dir->i_sb, __FUNCTION__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext4_warning(dir->i_sb, __FUNCTION__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383 assert(dx_get_limit(entries) == dx_root_limit(dir,
384 root->info.info_length));
385 dxtrace (printk("Look up %x", hash));
386 while (1)
387 {
388 count = dx_get_count(entries);
389 assert (count && count <= dx_get_limit(entries));
390 p = entries + 1;
391 q = entries + count - 1;
392 while (p <= q)
393 {
394 m = p + (q - p)/2;
395 dxtrace(printk("."));
396 if (dx_get_hash(m) > hash)
397 q = m - 1;
398 else
399 p = m + 1;
400 }
401
402 if (0) // linear search cross check
403 {
404 unsigned n = count - 1;
405 at = entries;
406 while (n--)
407 {
408 dxtrace(printk(","));
409 if (dx_get_hash(++at) > hash)
410 {
411 at--;
412 break;
413 }
414 }
415 assert (at == p - 1);
416 }
417
418 at = p - 1;
419 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
420 frame->bh = bh;
421 frame->entries = entries;
422 frame->at = at;
423 if (!indirect--) return frame;
424 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
425 goto fail2;
426 at = entries = ((struct dx_node *) bh->b_data)->entries;
427 assert (dx_get_limit(entries) == dx_node_limit (dir));
428 frame++;
429 }
430fail2:
431 while (frame >= frame_in) {
432 brelse(frame->bh);
433 frame--;
434 }
435fail:
436 return NULL;
437}
438
439static void dx_release (struct dx_frame *frames)
440{
441 if (frames[0].bh == NULL)
442 return;
443
444 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
445 brelse(frames[1].bh);
446 brelse(frames[0].bh);
447}
448
449/*
450 * This function increments the frame pointer to search the next leaf
451 * block, and reads in the necessary intervening nodes if the search
452 * should be necessary. Whether or not the search is necessary is
453 * controlled by the hash parameter. If the hash value is even, then
454 * the search is only continued if the next block starts with that
455 * hash value. This is used if we are searching for a specific file.
456 *
457 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
458 *
459 * This function returns 1 if the caller should continue to search,
460 * or 0 if it should not. If there is an error reading one of the
461 * index blocks, it will return a negative error code.
462 *
463 * If start_hash is non-null, it will be filled in with the starting
464 * hash of the next page.
465 */
466static int ext4_htree_next_block(struct inode *dir, __u32 hash,
467 struct dx_frame *frame,
468 struct dx_frame *frames,
469 __u32 *start_hash)
470{
471 struct dx_frame *p;
472 struct buffer_head *bh;
473 int err, num_frames = 0;
474 __u32 bhash;
475
476 p = frame;
477 /*
478 * Find the next leaf page by incrementing the frame pointer.
479 * If we run out of entries in the interior node, loop around and
480 * increment pointer in the parent node. When we break out of
481 * this loop, num_frames indicates the number of interior
482	 * nodes that need to be read.
483 */
484 while (1) {
485 if (++(p->at) < p->entries + dx_get_count(p->entries))
486 break;
487 if (p == frames)
488 return 0;
489 num_frames++;
490 p--;
491 }
492
493 /*
494 * If the hash is 1, then continue only if the next page has a
495 * continuation hash of any value. This is used for readdir
496 * handling. Otherwise, check to see if the hash matches the
497	 * desired continuation hash. If it doesn't, return since
498	 * there's no point in reading the successive index pages.
499 */
500 bhash = dx_get_hash(p->at);
501 if (start_hash)
502 *start_hash = bhash;
503 if ((hash & 1) == 0) {
504 if ((bhash & ~1) != hash)
505 return 0;
506 }
507 /*
508 * If the hash is HASH_NB_ALWAYS, we always go to the next
509 * block so no check is necessary
510 */
511 while (num_frames--) {
512 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
513 0, &err)))
514 return err; /* Failure */
515 p++;
516 brelse (p->bh);
517 p->bh = bh;
518 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
519 }
520 return 1;
521}
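/*
 * Worked example of the even/odd hash convention used above: leaf
 * hashes always have the low bit clear, and a block whose first entry
 * collides with the last entry of the previous block advertises a
 * start hash of (hash | 1).  So when looking up an even hash h, the
 * scan continues into the next block only if that block starts at h or
 * at h | 1 - i.e. (bhash & ~1) == h.
 */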
522
523
524/*
525 * p is at least 6 bytes before the end of page
526 */
527static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
528{
529 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
530}
531
532/*
533 * This function fills a red-black tree with information from a
534 * directory block. It returns the number of directory entries loaded
535 * into the tree. If there is an error it is returned in err.
536 */
537static int htree_dirblock_to_tree(struct file *dir_file,
538 struct inode *dir, int block,
539 struct dx_hash_info *hinfo,
540 __u32 start_hash, __u32 start_minor_hash)
541{
542 struct buffer_head *bh;
543 struct ext4_dir_entry_2 *de, *top;
544 int err, count = 0;
545
546 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
547 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
548 return err;
549
550 de = (struct ext4_dir_entry_2 *) bh->b_data;
551 top = (struct ext4_dir_entry_2 *) ((char *) de +
552 dir->i_sb->s_blocksize -
553 EXT4_DIR_REC_LEN(0));
554 for (; de < top; de = ext4_next_entry(de)) {
555 ext4fs_dirhash(de->name, de->name_len, hinfo);
556 if ((hinfo->hash < start_hash) ||
557 ((hinfo->hash == start_hash) &&
558 (hinfo->minor_hash < start_minor_hash)))
559 continue;
560 if (de->inode == 0)
561 continue;
562 if ((err = ext4_htree_store_dirent(dir_file,
563 hinfo->hash, hinfo->minor_hash, de)) != 0) {
564 brelse(bh);
565 return err;
566 }
567 count++;
568 }
569 brelse(bh);
570 return count;
571}
572
573
574/*
575 * This function fills a red-black tree with information from a
576 * directory. We start scanning the directory in hash order, starting
577 * at start_hash and start_minor_hash.
578 *
579 * This function returns the number of entries inserted into the tree,
580 * or a negative error code.
581 */
582int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
583 __u32 start_minor_hash, __u32 *next_hash)
584{
585 struct dx_hash_info hinfo;
586 struct ext4_dir_entry_2 *de;
587 struct dx_frame frames[2], *frame;
588 struct inode *dir;
589 int block, err;
590 int count = 0;
591 int ret;
592 __u32 hashval;
593
594 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
595 start_minor_hash));
596 dir = dir_file->f_dentry->d_inode;
597 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
598 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
599 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
600 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
601 start_hash, start_minor_hash);
602 *next_hash = ~0;
603 return count;
604 }
605 hinfo.hash = start_hash;
606 hinfo.minor_hash = 0;
607 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
608 if (!frame)
609 return err;
610
611 /* Add '.' and '..' from the htree header */
612 if (!start_hash && !start_minor_hash) {
613 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
614 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
615 goto errout;
616 count++;
617 }
618	if (start_hash < 2 || (start_hash == 2 && start_minor_hash == 0)) {
619 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
620 de = ext4_next_entry(de);
621 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
622 goto errout;
623 count++;
624 }
625
626 while (1) {
627 block = dx_get_block(frame->at);
628 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
629 start_hash, start_minor_hash);
630 if (ret < 0) {
631 err = ret;
632 goto errout;
633 }
634 count += ret;
635 hashval = ~0;
636 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
637 frame, frames, &hashval);
638 *next_hash = hashval;
639 if (ret < 0) {
640 err = ret;
641 goto errout;
642 }
643 /*
644 * Stop if: (a) there are no more entries, or
645 * (b) we have inserted at least one entry and the
646 * next hash value is not a continuation
647 */
648 if ((ret == 0) ||
649 (count && ((hashval & 1) == 0)))
650 break;
651 }
652 dx_release(frames);
653 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
654 count, *next_hash));
655 return count;
656errout:
657 dx_release(frames);
658 return (err);
659}
660
661
662/*
663 * Directory block splitting, compacting
664 */
665
666static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
667 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
668{
669 int count = 0;
670 char *base = (char *) de;
671 struct dx_hash_info h = *hinfo;
672
673 while ((char *) de < base + size)
674 {
675 if (de->name_len && de->inode) {
676 ext4fs_dirhash(de->name, de->name_len, &h);
677 map_tail--;
678 map_tail->hash = h.hash;
679 map_tail->offs = (u32) ((char *) de - base);
680 count++;
681 cond_resched();
682 }
683 /* XXX: do we need to check rec_len == 0 case? -Chris */
684 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
685 }
686 return count;
687}
688
689static void dx_sort_map (struct dx_map_entry *map, unsigned count)
690{
691 struct dx_map_entry *p, *q, *top = map + count - 1;
692 int more;
693 /* Combsort until bubble sort doesn't suck */
694 while (count > 2) {
695 count = count*10/13;
696 if (count - 9 < 2) /* 9, 10 -> 11 */
697 count = 11;
698 for (p = top, q = p - count; q >= map; p--, q--)
699 if (p->hash < q->hash)
700 swap(*p, *q);
701 }
702 /* Garden variety bubble sort */
703 do {
704 more = 0;
705 q = top;
706 while (q-- > map) {
707 if (q[1].hash >= q[0].hash)
708 continue;
709 swap(*(q+1), *q);
710 more = 1;
711 }
712 } while(more);
713}
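/*
 * Worked example of the comb sort above for count == 100: the gap
 * sequence is 76, 58, 44, 33, 25, 19, 14, then 11 (the unsigned
 * "9, 10 -> 11" fixup), then 8, 6, 4, 3, 2, after which the
 * bubble-sort pass handles the nearly-sorted remainder.
 */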
714
715static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
716{
717 struct dx_entry *entries = frame->entries;
718 struct dx_entry *old = frame->at, *new = old + 1;
719 int count = dx_get_count(entries);
720
721 assert(count < dx_get_limit(entries));
722 assert(old < entries + count);
723 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
724 dx_set_hash(new, hash);
725 dx_set_block(new, block);
726 dx_set_count(entries, count + 1);
727}
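/*
 * Worked example of the insert above: with count == 3 entries hashed
 * (0, h1, h3) and frame->at pointing at the h1 slot, inserting h2
 * memmoves the h3 slot up one position and writes (h2, block) into the
 * gap, leaving (0, h1, h2, h3) with count == 4.
 */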
728#endif
729
730
731static void ext4_update_dx_flag(struct inode *inode)
732{
733 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
734 EXT4_FEATURE_COMPAT_DIR_INDEX))
735 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
736}
737
738/*
739 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
740 *
741 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
742 * `de != NULL' is guaranteed by caller.
743 */
744static inline int ext4_match (int len, const char * const name,
745 struct ext4_dir_entry_2 * de)
746{
747 if (len != de->name_len)
748 return 0;
749 if (!de->inode)
750 return 0;
751 return !memcmp(name, de->name, len);
752}
753
754/*
755 * Returns 0 if not found, -1 on failure, and 1 on success
756 */
757static inline int search_dirblock(struct buffer_head * bh,
758 struct inode *dir,
759 struct dentry *dentry,
760 unsigned long offset,
761 struct ext4_dir_entry_2 ** res_dir)
762{
763 struct ext4_dir_entry_2 * de;
764 char * dlimit;
765 int de_len;
766 const char *name = dentry->d_name.name;
767 int namelen = dentry->d_name.len;
768
769 de = (struct ext4_dir_entry_2 *) bh->b_data;
770 dlimit = bh->b_data + dir->i_sb->s_blocksize;
771 while ((char *) de < dlimit) {
772 /* this code is executed quadratically often */
773 /* do minimal checking `by hand' */
774
775 if ((char *) de + namelen <= dlimit &&
776 ext4_match (namelen, name, de)) {
777 /* found a match - just to be sure, do a full check */
778 if (!ext4_check_dir_entry("ext4_find_entry",
779 dir, de, bh, offset))
780 return -1;
781 *res_dir = de;
782 return 1;
783 }
784 /* prevent looping on a bad block */
785 de_len = le16_to_cpu(de->rec_len);
786 if (de_len <= 0)
787 return -1;
788 offset += de_len;
789 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
790 }
791 return 0;
792}
793
794
795/*
796 * ext4_find_entry()
797 *
798 * finds an entry in the specified directory with the wanted name. It
799 * returns the cache buffer in which the entry was found, and the entry
800 * itself (as a parameter - res_dir). It does NOT read the inode of the
801 * entry - you'll have to do that yourself if you want to.
802 *
803 * The returned buffer_head has ->b_count elevated. The caller is expected
804 * to brelse() it when appropriate.
805 */
806static struct buffer_head * ext4_find_entry (struct dentry *dentry,
807 struct ext4_dir_entry_2 ** res_dir)
808{
809 struct super_block * sb;
810 struct buffer_head * bh_use[NAMEI_RA_SIZE];
811 struct buffer_head * bh, *ret = NULL;
812 unsigned long start, block, b;
813 int ra_max = 0; /* Number of bh's in the readahead
814 buffer, bh_use[] */
815 int ra_ptr = 0; /* Current index into readahead
816 buffer */
817 int num = 0;
818 int nblocks, i, err;
819 struct inode *dir = dentry->d_parent->d_inode;
820 int namelen;
821 const u8 *name;
822 unsigned blocksize;
823
824 *res_dir = NULL;
825 sb = dir->i_sb;
826 blocksize = sb->s_blocksize;
827 namelen = dentry->d_name.len;
828 name = dentry->d_name.name;
829 if (namelen > EXT4_NAME_LEN)
830 return NULL;
831#ifdef CONFIG_EXT4_INDEX
832 if (is_dx(dir)) {
833 bh = ext4_dx_find_entry(dentry, res_dir, &err);
834 /*
835 * On success, or if the error was file not found,
836 * return. Otherwise, fall back to doing a search the
837 * old fashioned way.
838 */
839 if (bh || (err != ERR_BAD_DX_DIR))
840 return bh;
841 dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
842 }
843#endif
844 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
845 start = EXT4_I(dir)->i_dir_start_lookup;
846 if (start >= nblocks)
847 start = 0;
848 block = start;
849restart:
850 do {
851 /*
852 * We deal with the read-ahead logic here.
853 */
854 if (ra_ptr >= ra_max) {
855 /* Refill the readahead buffer */
856 ra_ptr = 0;
857 b = block;
858 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
859 /*
860 * Terminate if we reach the end of the
861 * directory and must wrap, or if our
862 * search has finished at this block.
863 */
864 if (b >= nblocks || (num && block == start)) {
865 bh_use[ra_max] = NULL;
866 break;
867 }
868 num++;
869 bh = ext4_getblk(NULL, dir, b++, 0, &err);
870 bh_use[ra_max] = bh;
871 if (bh)
872 ll_rw_block(READ_META, 1, &bh);
873 }
874 }
875 if ((bh = bh_use[ra_ptr++]) == NULL)
876 goto next;
877 wait_on_buffer(bh);
878 if (!buffer_uptodate(bh)) {
879 /* read error, skip block & hope for the best */
880 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
881 "offset %lu", dir->i_ino, block);
882 brelse(bh);
883 goto next;
884 }
885 i = search_dirblock(bh, dir, dentry,
886 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
887 if (i == 1) {
888 EXT4_I(dir)->i_dir_start_lookup = block;
889 ret = bh;
890 goto cleanup_and_exit;
891 } else {
892 brelse(bh);
893 if (i < 0)
894 goto cleanup_and_exit;
895 }
896 next:
897 if (++block >= nblocks)
898 block = 0;
899 } while (block != start);
900
901 /*
902 * If the directory has grown while we were searching, then
903 * search the last part of the directory before giving up.
904 */
905 block = nblocks;
906 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
907 if (block < nblocks) {
908 start = 0;
909 goto restart;
910 }
911
912cleanup_and_exit:
913 /* Clean up the read-ahead blocks */
914 for (; ra_ptr < ra_max; ra_ptr++)
915 brelse (bh_use[ra_ptr]);
916 return ret;
917}
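/*
 * Readahead arithmetic for the scan above: NAMEI_RA_SIZE is
 * NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS == 8, so each refill of bh_use[]
 * issues up to eight single-block READ_META requests before the search
 * waits on the first of them.
 */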
918
919#ifdef CONFIG_EXT4_INDEX
920static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
921 struct ext4_dir_entry_2 **res_dir, int *err)
922{
923 struct super_block * sb;
924 struct dx_hash_info hinfo;
925 u32 hash;
926 struct dx_frame frames[2], *frame;
927 struct ext4_dir_entry_2 *de, *top;
928 struct buffer_head *bh;
929 unsigned long block;
930 int retval;
931 int namelen = dentry->d_name.len;
932 const u8 *name = dentry->d_name.name;
933 struct inode *dir = dentry->d_parent->d_inode;
934
935 sb = dir->i_sb;
936 /* NFS may look up ".." - look at dx_root directory block */
937 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
938 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
939 return NULL;
940 } else {
941 frame = frames;
942 frame->bh = NULL; /* for dx_release() */
943 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
944 dx_set_block(frame->at, 0); /* dx_root block is 0 */
945 }
946 hash = hinfo.hash;
947 do {
948 block = dx_get_block(frame->at);
949 if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
950 goto errout;
951 de = (struct ext4_dir_entry_2 *) bh->b_data;
952 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
953 EXT4_DIR_REC_LEN(0));
954 for (; de < top; de = ext4_next_entry(de))
955 if (ext4_match (namelen, name, de)) {
956 if (!ext4_check_dir_entry("ext4_find_entry",
957 dir, de, bh,
958 (block<<EXT4_BLOCK_SIZE_BITS(sb))
959 +((char *)de - bh->b_data))) {
960 brelse (bh);
961 goto errout;
962 }
963 *res_dir = de;
964 dx_release (frames);
965 return bh;
966 }
967 brelse (bh);
968 /* Check to see if we should continue to search */
969 retval = ext4_htree_next_block(dir, hash, frame,
970 frames, NULL);
971 if (retval < 0) {
972 ext4_warning(sb, __FUNCTION__,
973 "error reading index page in directory #%lu",
974 dir->i_ino);
975 *err = retval;
976 goto errout;
977 }
978 } while (retval == 1);
979
980 *err = -ENOENT;
981errout:
982 dxtrace(printk("%s not found\n", name));
983 dx_release (frames);
984 return NULL;
985}
986#endif
987
988static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
989{
990 struct inode * inode;
991 struct ext4_dir_entry_2 * de;
992 struct buffer_head * bh;
993
994 if (dentry->d_name.len > EXT4_NAME_LEN)
995 return ERR_PTR(-ENAMETOOLONG);
996
997 bh = ext4_find_entry(dentry, &de);
998 inode = NULL;
999 if (bh) {
1000 unsigned long ino = le32_to_cpu(de->inode);
1001 brelse (bh);
1002 if (!ext4_valid_inum(dir->i_sb, ino)) {
1003 ext4_error(dir->i_sb, "ext4_lookup",
1004 "bad inode number: %lu", ino);
1005 inode = NULL;
1006 } else
1007 inode = iget(dir->i_sb, ino);
1008
1009 if (!inode)
1010 return ERR_PTR(-EACCES);
1011 }
1012 return d_splice_alias(inode, dentry);
1013}
1014
1015
1016struct dentry *ext4_get_parent(struct dentry *child)
1017{
1018 unsigned long ino;
1019 struct dentry *parent;
1020 struct inode *inode;
1021 struct dentry dotdot;
1022 struct ext4_dir_entry_2 * de;
1023 struct buffer_head *bh;
1024
1025 dotdot.d_name.name = "..";
1026 dotdot.d_name.len = 2;
1027 dotdot.d_parent = child; /* confusing, isn't it! */
1028
1029 bh = ext4_find_entry(&dotdot, &de);
1030 inode = NULL;
1031 if (!bh)
1032 return ERR_PTR(-ENOENT);
1033 ino = le32_to_cpu(de->inode);
1034 brelse(bh);
1035
1036 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1037 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1038 "bad inode number: %lu", ino);
1039 inode = NULL;
1040 } else
1041 inode = iget(child->d_inode->i_sb, ino);
1042
1043 if (!inode)
1044 return ERR_PTR(-EACCES);
1045
1046 parent = d_alloc_anon(inode);
1047 if (!parent) {
1048 iput(inode);
1049 parent = ERR_PTR(-ENOMEM);
1050 }
1051 return parent;
1052}
1053
1054#define S_SHIFT 12
1055static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1056 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1057 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1058 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1059 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1060 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1061 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1062 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1063};
1064
1065static inline void ext4_set_de_type(struct super_block *sb,
1066 struct ext4_dir_entry_2 *de,
1067 umode_t mode) {
1068 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1069 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1070}
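/*
 * Worked example of the mode-to-file-type mapping above: S_IFDIR is
 * 0040000, so S_IFDIR >> S_SHIFT == 4 and a directory picks up
 * file_type EXT4_FT_DIR; without the INCOMPAT_FILETYPE feature the
 * field is left alone (callers initialize it to EXT4_FT_UNKNOWN).
 */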
1071
1072#ifdef CONFIG_EXT4_INDEX
1073static struct ext4_dir_entry_2 *
1074dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1075{
1076 unsigned rec_len = 0;
1077
1078 while (count--) {
1079 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
1080 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1081 memcpy (to, de, rec_len);
1082 ((struct ext4_dir_entry_2 *) to)->rec_len =
1083 cpu_to_le16(rec_len);
1084 de->inode = 0;
1085 map++;
1086 to += rec_len;
1087 }
1088 return (struct ext4_dir_entry_2 *) (to - rec_len);
1089}
1090
1091static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1092{
1093 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1094 unsigned rec_len = 0;
1095
1096 prev = to = de;
1097 while ((char*)de < base + size) {
1098 next = (struct ext4_dir_entry_2 *) ((char *) de +
1099 le16_to_cpu(de->rec_len));
1100 if (de->inode && de->name_len) {
1101 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1102 if (de > to)
1103 memmove(to, de, rec_len);
1104 to->rec_len = cpu_to_le16(rec_len);
1105 prev = to;
1106 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1107 }
1108 de = next;
1109 }
1110 return prev;
1111}
1112
1113static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1114 struct buffer_head **bh,struct dx_frame *frame,
1115 struct dx_hash_info *hinfo, int *error)
1116{
1117 unsigned blocksize = dir->i_sb->s_blocksize;
1118 unsigned count, continued;
1119 struct buffer_head *bh2;
1120 u32 newblock;
1121 u32 hash2;
1122 struct dx_map_entry *map;
1123 char *data1 = (*bh)->b_data, *data2;
1124 unsigned split;
1125 struct ext4_dir_entry_2 *de = NULL, *de2;
1126 int err;
1127
1128 bh2 = ext4_append (handle, dir, &newblock, error);
1129 if (!(bh2)) {
1130 brelse(*bh);
1131 *bh = NULL;
1132 goto errout;
1133 }
1134
1135 BUFFER_TRACE(*bh, "get_write_access");
1136 err = ext4_journal_get_write_access(handle, *bh);
1137 if (err) {
1138 journal_error:
1139 brelse(*bh);
1140 brelse(bh2);
1141 *bh = NULL;
1142 ext4_std_error(dir->i_sb, err);
1143 goto errout;
1144 }
1145 BUFFER_TRACE(frame->bh, "get_write_access");
1146 err = ext4_journal_get_write_access(handle, frame->bh);
1147 if (err)
1148 goto journal_error;
1149
1150 data2 = bh2->b_data;
1151
1152 /* create map in the end of data2 block */
1153 map = (struct dx_map_entry *) (data2 + blocksize);
1154 count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
1155 blocksize, hinfo, map);
1156 map -= count;
1157 split = count/2; // need to adjust to actual middle
1158 dx_sort_map (map, count);
1159 hash2 = map[split].hash;
1160 continued = hash2 == map[split - 1].hash;
1161 dxtrace(printk("Split block %i at %x, %i/%i\n",
1162 dx_get_block(frame->at), hash2, split, count-split));
1163
1164 /* Fancy dance to stay within two buffers */
1165 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1166 de = dx_pack_dirents(data1,blocksize);
1167 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1168 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1169 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1170 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1171
1172 /* Which block gets the new entry? */
1173 if (hinfo->hash >= hash2)
1174 {
1175 swap(*bh, bh2);
1176 de = de2;
1177 }
1178 dx_insert_block (frame, hash2 + continued, newblock);
1179 err = ext4_journal_dirty_metadata (handle, bh2);
1180 if (err)
1181 goto journal_error;
1182 err = ext4_journal_dirty_metadata (handle, frame->bh);
1183 if (err)
1184 goto journal_error;
1185 brelse (bh2);
1186 dxtrace(dx_show_index ("frame", frame->entries));
1187errout:
1188 return de;
1189}
1190#endif
1191
1192
1193/*
1194 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1195 * it points to a directory entry which is guaranteed to be large
1196 * enough for the new directory entry. If de is NULL, then
1197 * add_dirent_to_buf will attempt to search the directory block for
1198 * space. It will return -ENOSPC if no space is available, -EIO if
1199 * the block is corrupted, and -EEXIST if the directory entry already exists.
1200 *
1201 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1202 * all other cases bh is released.
1203 */
1204static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1205 struct inode *inode, struct ext4_dir_entry_2 *de,
1206 struct buffer_head * bh)
1207{
1208 struct inode *dir = dentry->d_parent->d_inode;
1209 const char *name = dentry->d_name.name;
1210 int namelen = dentry->d_name.len;
1211 unsigned long offset = 0;
1212 unsigned short reclen;
1213 int nlen, rlen, err;
1214 char *top;
1215
1216 reclen = EXT4_DIR_REC_LEN(namelen);
1217 if (!de) {
1218 de = (struct ext4_dir_entry_2 *)bh->b_data;
1219 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1220 while ((char *) de <= top) {
1221 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1222 bh, offset)) {
1223 brelse (bh);
1224 return -EIO;
1225 }
1226 if (ext4_match (namelen, name, de)) {
1227 brelse (bh);
1228 return -EEXIST;
1229 }
1230 nlen = EXT4_DIR_REC_LEN(de->name_len);
1231 rlen = le16_to_cpu(de->rec_len);
1232 if ((de->inode? rlen - nlen: rlen) >= reclen)
1233 break;
1234 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1235 offset += rlen;
1236 }
1237 if ((char *) de > top)
1238 return -ENOSPC;
1239 }
1240 BUFFER_TRACE(bh, "get_write_access");
1241 err = ext4_journal_get_write_access(handle, bh);
1242 if (err) {
1243 ext4_std_error(dir->i_sb, err);
1244 brelse(bh);
1245 return err;
1246 }
1247
1248 /* By now the buffer is marked for journaling */
1249 nlen = EXT4_DIR_REC_LEN(de->name_len);
1250 rlen = le16_to_cpu(de->rec_len);
1251 if (de->inode) {
1252 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1253 de1->rec_len = cpu_to_le16(rlen - nlen);
1254 de->rec_len = cpu_to_le16(nlen);
1255 de = de1;
1256 }
1257 de->file_type = EXT4_FT_UNKNOWN;
1258 if (inode) {
1259 de->inode = cpu_to_le32(inode->i_ino);
1260 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1261 } else
1262 de->inode = 0;
1263 de->name_len = namelen;
1264 memcpy (de->name, name, namelen);
1265 /*
1266 * XXX shouldn't update any times until successful
1267 * completion of syscall, but too many callers depend
1268 * on this.
1269 *
1270 * XXX similarly, too many callers depend on
1271 * ext4_new_inode() setting the times, but error
1272 * recovery deletes the inode, so the worst that can
1273 * happen is that the times are slightly out of date
1274 * and/or different from the directory change time.
1275 */
1276 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1277 ext4_update_dx_flag(dir);
1278 dir->i_version++;
1279 ext4_mark_inode_dirty(handle, dir);
1280 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1281 err = ext4_journal_dirty_metadata(handle, bh);
1282 if (err)
1283 ext4_std_error(dir->i_sb, err);
1284 brelse(bh);
1285 return 0;
1286}
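/*
 * Worked example of the nlen/rlen split above: suppose the scan stops
 * at a live entry with name_len == 5 whose on-disk rec_len is 40.
 * nlen = EXT4_DIR_REC_LEN(5) == 16, leaving rlen - nlen == 24 bytes of
 * slack, enough for any new name of up to 16 characters
 * (EXT4_DIR_REC_LEN(16) == 24); the old entry is trimmed to rec_len 16
 * and the new entry claims the remaining 24 bytes.
 */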
1287
1288#ifdef CONFIG_EXT4_INDEX
1289/*
1290 * This converts a one block unindexed directory to a 3 block indexed
1291 * directory, and adds the dentry to the indexed directory.
1292 */
1293static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1294 struct inode *inode, struct buffer_head *bh)
1295{
1296 struct inode *dir = dentry->d_parent->d_inode;
1297 const char *name = dentry->d_name.name;
1298 int namelen = dentry->d_name.len;
1299 struct buffer_head *bh2;
1300 struct dx_root *root;
1301 struct dx_frame frames[2], *frame;
1302 struct dx_entry *entries;
1303 struct ext4_dir_entry_2 *de, *de2;
1304 char *data1, *top;
1305 unsigned len;
1306 int retval;
1307 unsigned blocksize;
1308 struct dx_hash_info hinfo;
1309 u32 block;
1310 struct fake_dirent *fde;
1311
1312 blocksize = dir->i_sb->s_blocksize;
1313 dxtrace(printk("Creating index\n"));
1314 retval = ext4_journal_get_write_access(handle, bh);
1315 if (retval) {
1316 ext4_std_error(dir->i_sb, retval);
1317 brelse(bh);
1318 return retval;
1319 }
1320 root = (struct dx_root *) bh->b_data;
1321
1322 bh2 = ext4_append (handle, dir, &block, &retval);
1323 if (!bh2) {
1324 brelse(bh);
1325 return retval;
1326 }
1327 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1328 data1 = bh2->b_data;
1329
1330 /* The 0th block becomes the root, move the dirents out */
1331 fde = &root->dotdot;
1332 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1333 len = ((char *) root) + blocksize - (char *) de;
1334 memcpy (data1, de, len);
1335 de = (struct ext4_dir_entry_2 *) data1;
1336 top = data1 + len;
1337 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
1338 de = de2;
1339 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1340 /* Initialize the root; the dot dirents already exist */
1341 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1342 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
1343 memset (&root->info, 0, sizeof(root->info));
1344 root->info.info_length = sizeof(root->info);
1345 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1346 entries = root->entries;
1347 dx_set_block (entries, 1);
1348 dx_set_count (entries, 1);
1349 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1350
1351 /* Initialize as for dx_probe */
1352 hinfo.hash_version = root->info.hash_version;
1353 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1354 ext4fs_dirhash(name, namelen, &hinfo);
1355 frame = frames;
1356 frame->entries = entries;
1357 frame->at = entries;
1358 frame->bh = bh;
1359 bh = bh2;
1360 de = do_split(handle, dir, &bh, frame, &hinfo, &retval);
1361 dx_release (frames);
1362 if (!de)
1363 return retval;
1364
1365 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1366}
1367#endif
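/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * After make_indexed_dir() the directory occupies three blocks: block 0
 * is the dx_root (the "." and ".." dirents, then dx_root_info and the
 * dx_entry array), while the old dirents live in blocks 1 and 2 after
 * do_split().  A hypothetical sanity check of the freshly built root:
 */
#if 0
static int example_root_ok(struct dx_root *root)
{
	return root->info.info_length == sizeof(root->info) &&
	       root->info.indirect_levels == 0 &&
	       dx_get_count(root->entries) >= 1 &&
	       dx_get_block(root->entries) == 1; /* leaves start in block 1 */
}
#endif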
1368
1369/*
1370 * ext4_add_entry()
1371 *
1372 * adds a file entry to the specified directory, using the same
1373 * semantics as ext4_find_entry(). It returns a negative error code if it failed.
1374 *
1375 * NOTE!! The inode part of 'de' is left at 0 - which means you
1376 * may not sleep between calling this and putting something into
1377 * the entry, as someone else might have used it while you slept.
1378 */
1379static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1380 struct inode *inode)
1381{
1382 struct inode *dir = dentry->d_parent->d_inode;
1383 unsigned long offset;
1384 struct buffer_head * bh;
1385 struct ext4_dir_entry_2 *de;
1386 struct super_block * sb;
1387 int retval;
1388#ifdef CONFIG_EXT4_INDEX
1389 int dx_fallback=0;
1390#endif
1391 unsigned blocksize;
1392 u32 block, blocks;
1393
1394 sb = dir->i_sb;
1395 blocksize = sb->s_blocksize;
1396 if (!dentry->d_name.len)
1397 return -EINVAL;
1398#ifdef CONFIG_EXT4_INDEX
1399 if (is_dx(dir)) {
1400 retval = ext4_dx_add_entry(handle, dentry, inode);
1401 if (!retval || (retval != ERR_BAD_DX_DIR))
1402 return retval;
1403 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
1404 dx_fallback++;
1405 ext4_mark_inode_dirty(handle, dir);
1406 }
1407#endif
1408 blocks = dir->i_size >> sb->s_blocksize_bits;
1409 for (block = 0, offset = 0; block < blocks; block++) {
1410 bh = ext4_bread(handle, dir, block, 0, &retval);
1411 if(!bh)
1412 return retval;
1413 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1414 if (retval != -ENOSPC)
1415 return retval;
1416
1417#ifdef CONFIG_EXT4_INDEX
1418 if (blocks == 1 && !dx_fallback &&
1419 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1420 return make_indexed_dir(handle, dentry, inode, bh);
1421#endif
1422 brelse(bh);
1423 }
1424 bh = ext4_append(handle, dir, &block, &retval);
1425 if (!bh)
1426 return retval;
1427 de = (struct ext4_dir_entry_2 *) bh->b_data;
1428 de->inode = 0;
1429 de->rec_len = cpu_to_le16(blocksize);
1430 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1431}
1432
1433#ifdef CONFIG_EXT4_INDEX
1434/*
1435 * Returns 0 for success, or a negative error value
1436 */
1437static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1438 struct inode *inode)
1439{
1440 struct dx_frame frames[2], *frame;
1441 struct dx_entry *entries, *at;
1442 struct dx_hash_info hinfo;
1443 struct buffer_head * bh;
1444 struct inode *dir = dentry->d_parent->d_inode;
1445 struct super_block * sb = dir->i_sb;
1446 struct ext4_dir_entry_2 *de;
1447 int err;
1448
1449 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1450 if (!frame)
1451 return err;
1452 entries = frame->entries;
1453 at = frame->at;
1454
1455 if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1456 goto cleanup;
1457
1458 BUFFER_TRACE(bh, "get_write_access");
1459 err = ext4_journal_get_write_access(handle, bh);
1460 if (err)
1461 goto journal_error;
1462
1463 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1464 if (err != -ENOSPC) {
1465 bh = NULL;
1466 goto cleanup;
1467 }
1468
1469 /* Block full, should compress but for now just split */
1470 dxtrace(printk("using %u of %u node entries\n",
1471 dx_get_count(entries), dx_get_limit(entries)));
1472 /* Need to split index? */
1473 if (dx_get_count(entries) == dx_get_limit(entries)) {
1474 u32 newblock;
1475 unsigned icount = dx_get_count(entries);
1476 int levels = frame - frames;
1477 struct dx_entry *entries2;
1478 struct dx_node *node2;
1479 struct buffer_head *bh2;
1480
1481 if (levels && (dx_get_count(frames->entries) ==
1482 dx_get_limit(frames->entries))) {
1483 ext4_warning(sb, __FUNCTION__,
1484 "Directory index full!");
1485 err = -ENOSPC;
1486 goto cleanup;
1487 }
1488 bh2 = ext4_append (handle, dir, &newblock, &err);
1489 if (!bh2)
1490 goto cleanup;
1491 node2 = (struct dx_node *)(bh2->b_data);
1492 entries2 = node2->entries;
1493 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1494 node2->fake.inode = 0;
1495 BUFFER_TRACE(frame->bh, "get_write_access");
1496 err = ext4_journal_get_write_access(handle, frame->bh);
1497 if (err)
1498 goto journal_error;
1499 if (levels) {
1500 unsigned icount1 = icount/2, icount2 = icount - icount1;
1501 unsigned hash2 = dx_get_hash(entries + icount1);
1502 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1503
1504 BUFFER_TRACE(frames[0].bh, "get_write_access"); /* index root */
1505 err = ext4_journal_get_write_access(handle,
1506 frames[0].bh);
1507 if (err)
1508 goto journal_error;
1509
1510 memcpy ((char *) entries2, (char *) (entries + icount1),
1511 icount2 * sizeof(struct dx_entry));
1512 dx_set_count (entries, icount1);
1513 dx_set_count (entries2, icount2);
1514 dx_set_limit (entries2, dx_node_limit(dir));
1515
1516 /* Which index block gets the new entry? */
1517 if (at - entries >= icount1) {
1518 frame->at = at = at - entries - icount1 + entries2;
1519 frame->entries = entries = entries2;
1520 swap(frame->bh, bh2);
1521 }
1522 dx_insert_block (frames + 0, hash2, newblock);
1523 dxtrace(dx_show_index ("node", frames[1].entries));
1524 dxtrace(dx_show_index ("node",
1525 ((struct dx_node *) bh2->b_data)->entries));
1526 err = ext4_journal_dirty_metadata(handle, bh2);
1527 if (err)
1528 goto journal_error;
1529 brelse (bh2);
1530 } else {
1531 dxtrace(printk("Creating second level index...\n"));
1532 memcpy((char *) entries2, (char *) entries,
1533 icount * sizeof(struct dx_entry));
1534 dx_set_limit(entries2, dx_node_limit(dir));
1535
1536 /* Set up root */
1537 dx_set_count(entries, 1);
1538 dx_set_block(entries + 0, newblock);
1539 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1540
1541 /* Add new access path frame */
1542 frame = frames + 1;
1543 frame->at = at = at - entries + entries2;
1544 frame->entries = entries = entries2;
1545 frame->bh = bh2;
1546 err = ext4_journal_get_write_access(handle,
1547 frame->bh);
1548 if (err)
1549 goto journal_error;
1550 }
1551 ext4_journal_dirty_metadata(handle, frames[0].bh);
1552 }
1553 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1554 if (!de)
1555 goto cleanup;
1556 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1557 bh = NULL;
1558 goto cleanup;
1559
1560journal_error:
1561 ext4_std_error(dir->i_sb, err);
1562cleanup:
1563 if (bh)
1564 brelse(bh);
1565 dx_release(frames);
1566 return err;
1567}
1568#endif
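/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The interior-node split above boils down to: move the upper half of
 * the dx_entry array into a fresh block and tell the parent about it
 * with one new (hash, block) pair.  Hypothetical helper:
 */
#if 0
static void example_split_node(struct dx_entry *entries, unsigned icount,
			       struct dx_entry *entries2, u32 newblock,
			       struct dx_frame *parent)
{
	unsigned icount1 = icount / 2, icount2 = icount - icount1;
	u32 hash2 = dx_get_hash(entries + icount1);

	memcpy(entries2, entries + icount1, icount2 * sizeof(struct dx_entry));
	dx_set_count(entries, icount1);
	dx_set_count(entries2, icount2);
	dx_insert_block(parent, hash2, newblock);
}
#endif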
1569
1570/*
1571 * ext4_delete_entry deletes a directory entry by merging it with the
1572 * previous entry
1573 */
1574static int ext4_delete_entry (handle_t *handle,
1575 struct inode * dir,
1576 struct ext4_dir_entry_2 * de_del,
1577 struct buffer_head * bh)
1578{
1579 struct ext4_dir_entry_2 * de, * pde;
1580 int i;
1581
1582 i = 0;
1583 pde = NULL;
1584 de = (struct ext4_dir_entry_2 *) bh->b_data;
1585 while (i < bh->b_size) {
1586 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
1587 return -EIO;
1588 if (de == de_del) {
1589 BUFFER_TRACE(bh, "get_write_access");
1590 ext4_journal_get_write_access(handle, bh);
1591 if (pde)
1592 pde->rec_len =
1593 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1594 le16_to_cpu(de->rec_len));
1595 else
1596 de->inode = 0;
1597 dir->i_version++;
1598 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1599 ext4_journal_dirty_metadata(handle, bh);
1600 return 0;
1601 }
1602 i += le16_to_cpu(de->rec_len);
1603 pde = de;
1604 de = (struct ext4_dir_entry_2 *)
1605 ((char *) de + le16_to_cpu(de->rec_len));
1606 }
1607 return -ENOENT;
1608}
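/*
 * Editor's note -- worked example, not part of the original patch.
 * For a block holding A -> B -> C where B (rec_len 24) is deleted and
 * A's rec_len is 16, the merge above leaves:
 *
 *	before:	[A rec_len=16][B rec_len=24][C ...]
 *	after:	[A rec_len=40]              [C ...]
 *
 * B's bytes stay on disk but are now covered by A's rec_len, so the
 * slack test in add_dirent_to_buf() can hand them to a later insert.
 * If the victim is the first entry of the block there is no previous
 * entry to widen, so its inode field is zeroed instead.
 */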
1609
1610/*
1611 * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1612 * do not perform it in these functions. We perform it at the call site,
1613 * if it is needed.
1614 */
1615static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
1616{
1617 inc_nlink(inode);
1618}
1619
1620static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
1621{
1622 drop_nlink(inode);
1623}
1624
1625static int ext4_add_nondir(handle_t *handle,
1626 struct dentry *dentry, struct inode *inode)
1627{
1628 int err = ext4_add_entry(handle, dentry, inode);
1629 if (!err) {
1630 ext4_mark_inode_dirty(handle, inode);
1631 d_instantiate(dentry, inode);
1632 return 0;
1633 }
1634 ext4_dec_count(handle, inode);
1635 iput(inode);
1636 return err;
1637}
1638
1639/*
1640 * By the time this is called, we already have created
1641 * the directory cache entry for the new file, but it
1642 * is so far negative - it has no inode.
1643 *
1644 * If the create succeeds, we fill in the inode information
1645 * with d_instantiate().
1646 */
1647static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
1648 struct nameidata *nd)
1649{
1650 handle_t *handle;
1651 struct inode * inode;
1652 int err, retries = 0;
1653
1654retry:
1655 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1656 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1657 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1658 if (IS_ERR(handle))
1659 return PTR_ERR(handle);
1660
1661 if (IS_DIRSYNC(dir))
1662 handle->h_sync = 1;
1663
1664 inode = ext4_new_inode (handle, dir, mode);
1665 err = PTR_ERR(inode);
1666 if (!IS_ERR(inode)) {
1667 inode->i_op = &ext4_file_inode_operations;
1668 inode->i_fop = &ext4_file_operations;
1669 ext4_set_aops(inode);
1670 err = ext4_add_nondir(handle, dentry, inode);
1671 }
1672 ext4_journal_stop(handle);
1673 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1674 goto retry;
1675 return err;
1676}
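/*
 * Editor's note on the credit reservation above -- a reading aid, not
 * part of the original patch.  The journal_start budget is, roughly:
 * EXT4_DATA_TRANS_BLOCKS for one block of directory data with its
 * bitmap/descriptor updates, EXT4_INDEX_EXTRA_TRANS_BLOCKS for a
 * possible htree split of the parent, 3 more for the new inode, its
 * bitmap and its group descriptor, and 2*EXT4_QUOTA_INIT_BLOCKS for
 * initializing the user and group quota files.  mknod, mkdir and
 * symlink below reserve along the same lines.
 */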
1677
1678static int ext4_mknod (struct inode * dir, struct dentry *dentry,
1679 int mode, dev_t rdev)
1680{
1681 handle_t *handle;
1682 struct inode *inode;
1683 int err, retries = 0;
1684
1685 if (!new_valid_dev(rdev))
1686 return -EINVAL;
1687
1688retry:
1689 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1690 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1691 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1692 if (IS_ERR(handle))
1693 return PTR_ERR(handle);
1694
1695 if (IS_DIRSYNC(dir))
1696 handle->h_sync = 1;
1697
1698 inode = ext4_new_inode (handle, dir, mode);
1699 err = PTR_ERR(inode);
1700 if (!IS_ERR(inode)) {
1701 init_special_inode(inode, inode->i_mode, rdev);
1702#ifdef CONFIG_EXT4DEV_FS_XATTR
1703 inode->i_op = &ext4_special_inode_operations;
1704#endif
1705 err = ext4_add_nondir(handle, dentry, inode);
1706 }
1707 ext4_journal_stop(handle);
1708 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1709 goto retry;
1710 return err;
1711}
1712
1713static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1714{
1715 handle_t *handle;
1716 struct inode * inode;
1717 struct buffer_head * dir_block;
1718 struct ext4_dir_entry_2 * de;
1719 int err, retries = 0;
1720
1721 if (dir->i_nlink >= EXT4_LINK_MAX)
1722 return -EMLINK;
1723
1724retry:
1725 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1726 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1727 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1728 if (IS_ERR(handle))
1729 return PTR_ERR(handle);
1730
1731 if (IS_DIRSYNC(dir))
1732 handle->h_sync = 1;
1733
1734 inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
1735 err = PTR_ERR(inode);
1736 if (IS_ERR(inode))
1737 goto out_stop;
1738
1739 inode->i_op = &ext4_dir_inode_operations;
1740 inode->i_fop = &ext4_dir_operations;
1741 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1742 dir_block = ext4_bread (handle, inode, 0, 1, &err);
1743 if (!dir_block) {
1744 drop_nlink(inode); /* is this nlink == 0? */
1745 ext4_mark_inode_dirty(handle, inode);
1746 iput (inode);
1747 goto out_stop;
1748 }
1749 BUFFER_TRACE(dir_block, "get_write_access");
1750 ext4_journal_get_write_access(handle, dir_block);
1751 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1752 de->inode = cpu_to_le32(inode->i_ino);
1753 de->name_len = 1;
1754 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
1755 strcpy (de->name, ".");
1756 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1757 de = (struct ext4_dir_entry_2 *)
1758 ((char *) de + le16_to_cpu(de->rec_len));
1759 de->inode = cpu_to_le32(dir->i_ino);
1760 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
1761 de->name_len = 2;
1762 strcpy (de->name, "..");
1763 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1764 inode->i_nlink = 2;
1765 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1766 ext4_journal_dirty_metadata(handle, dir_block);
1767 brelse (dir_block);
1768 ext4_mark_inode_dirty(handle, inode);
1769 err = ext4_add_entry (handle, dentry, inode);
1770 if (err) {
1771 inode->i_nlink = 0;
1772 ext4_mark_inode_dirty(handle, inode);
1773 iput (inode);
1774 goto out_stop;
1775 }
1776 inc_nlink(dir);
1777 ext4_update_dx_flag(dir);
1778 ext4_mark_inode_dirty(handle, dir);
1779 d_instantiate(dentry, inode);
1780out_stop:
1781 ext4_journal_stop(handle);
1782 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1783 goto retry;
1784 return err;
1785}
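/*
 * Editor's note -- illustrative layout, not part of the original patch.
 * The block built above holds exactly two entries:
 *
 *	[ "."  rec_len = EXT4_DIR_REC_LEN(1)              inode = new dir ]
 *	[ ".." rec_len = blocksize - EXT4_DIR_REC_LEN(1)  inode = parent  ]
 *
 * ".."'s rec_len is padded out to the end of the block, so later
 * entries are placed in its slack by add_dirent_to_buf().
 */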
1786
1787/*
1788 * routine to check that the specified directory is empty (for rmdir)
1789 */
1790static int empty_dir (struct inode * inode)
1791{
1792 unsigned long offset;
1793 struct buffer_head * bh;
1794 struct ext4_dir_entry_2 * de, * de1;
1795 struct super_block * sb;
1796 int err = 0;
1797
1798 sb = inode->i_sb;
1799 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1800 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
1801 if (err)
1802 ext4_error(inode->i_sb, __FUNCTION__,
1803 "error %d reading directory #%lu offset 0",
1804 err, inode->i_ino);
1805 else
1806 ext4_warning(inode->i_sb, __FUNCTION__,
1807 "bad directory (dir #%lu) - no data block",
1808 inode->i_ino);
1809 return 1;
1810 }
1811 de = (struct ext4_dir_entry_2 *) bh->b_data;
1812 de1 = (struct ext4_dir_entry_2 *)
1813 ((char *) de + le16_to_cpu(de->rec_len));
1814 if (le32_to_cpu(de->inode) != inode->i_ino ||
1815 !le32_to_cpu(de1->inode) ||
1816 strcmp (".", de->name) ||
1817 strcmp ("..", de1->name)) {
1818 ext4_warning (inode->i_sb, "empty_dir",
1819 "bad directory (dir #%lu) - no `.' or `..'",
1820 inode->i_ino);
1821 brelse (bh);
1822 return 1;
1823 }
1824 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1825 de = (struct ext4_dir_entry_2 *)
1826 ((char *) de1 + le16_to_cpu(de1->rec_len));
1827 while (offset < inode->i_size) {
1828 if (!bh ||
1829 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1830 err = 0;
1831 brelse (bh);
1832 bh = ext4_bread (NULL, inode,
1833 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1834 if (!bh) {
1835 if (err)
1836 ext4_error(sb, __FUNCTION__,
1837 "error %d reading directory"
1838 " #%lu offset %lu",
1839 err, inode->i_ino, offset);
1840 offset += sb->s_blocksize;
1841 continue;
1842 }
1843 de = (struct ext4_dir_entry_2 *) bh->b_data;
1844 }
1845 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1846 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1847 sb->s_blocksize);
1848 offset = (offset | (sb->s_blocksize - 1)) + 1;
1849 continue;
1850 }
1851 if (le32_to_cpu(de->inode)) {
1852 brelse (bh);
1853 return 0;
1854 }
1855 offset += le16_to_cpu(de->rec_len);
1856 de = (struct ext4_dir_entry_2 *)
1857 ((char *) de + le16_to_cpu(de->rec_len));
1858 }
1859 brelse (bh);
1860 return 1;
1861}
1862
1863/* ext4_orphan_add() links an unlinked or truncated inode into a list of
1864 * such inodes, starting at the superblock, in case we crash before the
1865 * file is closed/deleted, or in case the inode truncate spans multiple
1866 * transactions and the last transaction is not recovered after a crash.
1867 *
1868 * At filesystem recovery time, we walk this list deleting unlinked
1869 * inodes and truncating linked inodes in ext4_orphan_cleanup().
1870 */
1871int ext4_orphan_add(handle_t *handle, struct inode *inode)
1872{
1873 struct super_block *sb = inode->i_sb;
1874 struct ext4_iloc iloc;
1875 int err = 0, rc;
1876
1877 lock_super(sb);
1878 if (!list_empty(&EXT4_I(inode)->i_orphan))
1879 goto out_unlock;
1880
1881 /* Orphan handling is only valid for files with data blocks
1882 * being truncated, or files being unlinked. */
1883
1884 /* @@@ FIXME: Observation from aviro:
1885 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
1886 * here (on lock_super()), so we can race with ext4_link(), which
1887 * might bump ->i_nlink. Take, say, a character device: not a regular
1888 * file, not a directory, not a symlink, and ->i_nlink > 0.
1889 */
1890 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1891 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1892
1893 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1894 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
1895 if (err)
1896 goto out_unlock;
1897
1898 err = ext4_reserve_inode_write(handle, inode, &iloc);
1899 if (err)
1900 goto out_unlock;
1901
1902 /* Insert this inode at the head of the on-disk orphan list... */
1903 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1904 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1905 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1906 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1907 if (!err)
1908 err = rc;
1909
1910 /* Only add to the head of the in-memory list if all the
1911 * previous operations succeeded. If the orphan_add is going to
1912 * fail (possibly taking the journal offline), we can't risk
1913 * leaving the inode on the orphan list: stray orphan-list
1914 * entries can cause panics at unmount time.
1915 *
1916 * This is safe: on error we're going to ignore the orphan list
1917 * anyway on the next recovery. */
1918 if (!err)
1919 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1920
1921 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1922 jbd_debug(4, "orphan inode %lu will point to %d\n",
1923 inode->i_ino, NEXT_ORPHAN(inode));
1924out_unlock:
1925 unlock_super(sb);
1926 ext4_std_error(inode->i_sb, err);
1927 return err;
1928}
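/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * The on-disk orphan list built above is a singly linked list: the
 * superblock's s_last_orphan holds the head inode number and each
 * orphan's NEXT_ORPHAN() field holds its successor (0 terminates).
 * Recovery (ext4_orphan_cleanup()) walks it roughly like this
 * hypothetical sketch, using the 2.6.19-era iget():
 */
#if 0
static void example_walk_orphans(struct super_block *sb)
{
	unsigned long ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);

	while (ino) {
		struct inode *inode = iget(sb, ino);

		if (!inode || is_bad_inode(inode))
			break;
		ino = NEXT_ORPHAN(inode);	/* 0 ends the list */
		/* ... delete or finish truncating the inode here ... */
		iput(inode);
	}
}
#endif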
1929
1930/*
1931 * ext4_orphan_del() removes an unlinked or truncated inode from the list
1932 * of such inodes stored on disk, because it is finally being cleaned up.
1933 */
1934int ext4_orphan_del(handle_t *handle, struct inode *inode)
1935{
1936 struct list_head *prev;
1937 struct ext4_inode_info *ei = EXT4_I(inode);
1938 struct ext4_sb_info *sbi;
1939 unsigned long ino_next;
1940 struct ext4_iloc iloc;
1941 int err = 0;
1942
1943 lock_super(inode->i_sb);
1944 if (list_empty(&ei->i_orphan)) {
1945 unlock_super(inode->i_sb);
1946 return 0;
1947 }
1948
1949 ino_next = NEXT_ORPHAN(inode);
1950 prev = ei->i_orphan.prev;
1951 sbi = EXT4_SB(inode->i_sb);
1952
1953 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1954
1955 list_del_init(&ei->i_orphan);
1956
1957 /* If we're on an error path, we may not have a valid
1958 * transaction handle with which to update the orphan list on
1959 * disk, but we still need to remove the inode from the linked
1960 * list in memory. */
1961 if (!handle)
1962 goto out;
1963
1964 err = ext4_reserve_inode_write(handle, inode, &iloc);
1965 if (err)
1966 goto out_err;
1967
1968 if (prev == &sbi->s_orphan) {
1969 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1970 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1971 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1972 if (err)
1973 goto out_brelse;
1974 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1975 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
1976 } else {
1977 struct ext4_iloc iloc2;
1978 struct inode *i_prev =
1979 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
1980
1981 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1982 i_prev->i_ino, ino_next);
1983 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
1984 if (err)
1985 goto out_brelse;
1986 NEXT_ORPHAN(i_prev) = ino_next;
1987 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
1988 }
1989 if (err)
1990 goto out_brelse;
1991 NEXT_ORPHAN(inode) = 0;
1992 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
1993
1994out_err:
1995 ext4_std_error(inode->i_sb, err);
1996out:
1997 unlock_super(inode->i_sb);
1998 return err;
1999
2000out_brelse:
2001 brelse(iloc.bh);
2002 goto out_err;
2003}
2004
2005static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2006{
2007 int retval;
2008 struct inode * inode;
2009 struct buffer_head * bh;
2010 struct ext4_dir_entry_2 * de;
2011 handle_t *handle;
2012
2013 /* Initialize quotas before so that eventual writes go in
2014 * separate transaction */
2015 DQUOT_INIT(dentry->d_inode);
2016 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2017 if (IS_ERR(handle))
2018 return PTR_ERR(handle);
2019
2020 retval = -ENOENT;
2021 bh = ext4_find_entry (dentry, &de);
2022 if (!bh)
2023 goto end_rmdir;
2024
2025 if (IS_DIRSYNC(dir))
2026 handle->h_sync = 1;
2027
2028 inode = dentry->d_inode;
2029
2030 retval = -EIO;
2031 if (le32_to_cpu(de->inode) != inode->i_ino)
2032 goto end_rmdir;
2033
2034 retval = -ENOTEMPTY;
2035 if (!empty_dir (inode))
2036 goto end_rmdir;
2037
2038 retval = ext4_delete_entry(handle, dir, de, bh);
2039 if (retval)
2040 goto end_rmdir;
2041 if (inode->i_nlink != 2)
2042 ext4_warning (inode->i_sb, "ext4_rmdir",
2043 "empty directory has nlink!=2 (%d)",
2044 inode->i_nlink);
2045 inode->i_version++;
2046 clear_nlink(inode);
2047 /* There's no need to set i_disksize: the fact that i_nlink is
2048 * zero will ensure that the right thing happens during any
2049 * recovery. */
2050 inode->i_size = 0;
2051 ext4_orphan_add(handle, inode);
2052 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2053 ext4_mark_inode_dirty(handle, inode);
2054 drop_nlink(dir);
2055 ext4_update_dx_flag(dir);
2056 ext4_mark_inode_dirty(handle, dir);
2057
2058end_rmdir:
2059 ext4_journal_stop(handle);
2060 brelse (bh);
2061 return retval;
2062}
2063
2064static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2065{
2066 int retval;
2067 struct inode * inode;
2068 struct buffer_head * bh;
2069 struct ext4_dir_entry_2 * de;
2070 handle_t *handle;
2071
2072 /* Initialize quotas before so that eventual writes go
2073 * in separate transaction */
2074 DQUOT_INIT(dentry->d_inode);
2075 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2076 if (IS_ERR(handle))
2077 return PTR_ERR(handle);
2078
2079 if (IS_DIRSYNC(dir))
2080 handle->h_sync = 1;
2081
2082 retval = -ENOENT;
2083 bh = ext4_find_entry (dentry, &de);
2084 if (!bh)
2085 goto end_unlink;
2086
2087 inode = dentry->d_inode;
2088
2089 retval = -EIO;
2090 if (le32_to_cpu(de->inode) != inode->i_ino)
2091 goto end_unlink;
2092
2093 if (!inode->i_nlink) {
2094 ext4_warning (inode->i_sb, "ext4_unlink",
2095 "Deleting nonexistent file (%lu), %d",
2096 inode->i_ino, inode->i_nlink);
2097 inode->i_nlink = 1;
2098 }
2099 retval = ext4_delete_entry(handle, dir, de, bh);
2100 if (retval)
2101 goto end_unlink;
2102 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2103 ext4_update_dx_flag(dir);
2104 ext4_mark_inode_dirty(handle, dir);
2105 drop_nlink(inode);
2106 if (!inode->i_nlink)
2107 ext4_orphan_add(handle, inode);
2108 inode->i_ctime = dir->i_ctime;
2109 ext4_mark_inode_dirty(handle, inode);
2110 retval = 0;
2111
2112end_unlink:
2113 ext4_journal_stop(handle);
2114 brelse (bh);
2115 return retval;
2116}
2117
2118static int ext4_symlink (struct inode * dir,
2119 struct dentry *dentry, const char * symname)
2120{
2121 handle_t *handle;
2122 struct inode * inode;
2123 int l, err, retries = 0;
2124
2125 l = strlen(symname)+1;
2126 if (l > dir->i_sb->s_blocksize)
2127 return -ENAMETOOLONG;
2128
2129retry:
2130 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2131 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2132 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
2133 if (IS_ERR(handle))
2134 return PTR_ERR(handle);
2135
2136 if (IS_DIRSYNC(dir))
2137 handle->h_sync = 1;
2138
2139 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2140 err = PTR_ERR(inode);
2141 if (IS_ERR(inode))
2142 goto out_stop;
2143
2144 if (l > sizeof (EXT4_I(inode)->i_data)) {
2145 inode->i_op = &ext4_symlink_inode_operations;
2146 ext4_set_aops(inode);
2147 /*
2148 * page_symlink() calls into ext4_prepare/commit_write.
2149 * We have a transaction open. All is sweetness. It also sets
2150 * i_size in generic_commit_write().
2151 */
2152 err = __page_symlink(inode, symname, l,
2153 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2154 if (err) {
2155 ext4_dec_count(handle, inode);
2156 ext4_mark_inode_dirty(handle, inode);
2157 iput (inode);
2158 goto out_stop;
2159 }
2160 } else {
2161 inode->i_op = &ext4_fast_symlink_inode_operations;
2162 memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
2163 inode->i_size = l-1;
2164 }
2165 EXT4_I(inode)->i_disksize = inode->i_size;
2166 err = ext4_add_nondir(handle, dentry, inode);
2167out_stop:
2168 ext4_journal_stop(handle);
2169 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2170 goto retry;
2171 return err;
2172}
2173
2174static int ext4_link (struct dentry * old_dentry,
2175 struct inode * dir, struct dentry *dentry)
2176{
2177 handle_t *handle;
2178 struct inode *inode = old_dentry->d_inode;
2179 int err, retries = 0;
2180
2181 if (inode->i_nlink >= EXT4_LINK_MAX)
2182 return -EMLINK;
2183
2184retry:
2185 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2186 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
2187 if (IS_ERR(handle))
2188 return PTR_ERR(handle);
2189
2190 if (IS_DIRSYNC(dir))
2191 handle->h_sync = 1;
2192
2193 inode->i_ctime = CURRENT_TIME_SEC;
2194 ext4_inc_count(handle, inode);
2195 atomic_inc(&inode->i_count);
2196
2197 err = ext4_add_nondir(handle, dentry, inode);
2198 ext4_journal_stop(handle);
2199 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2200 goto retry;
2201 return err;
2202}
2203
2204#define PARENT_INO(buffer) \
2205 ((struct ext4_dir_entry_2 *) ((char *) buffer + \
2206 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
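/*
 * Editor's note (not part of the original patch): block 0 of every
 * directory starts with the "." entry followed by "..", so PARENT_INO()
 * simply steps over "." by its rec_len and reads the inode field of
 * "..".  ext4_rename() below both checks it (is the source really in
 * old_dir?) and rewrites it when a directory moves to a new parent.
 */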
2207
2208/*
2209 * Anybody can rename anything with this: the permission checks are left to the
2210 * higher-level routines.
2211 */
2212static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2213 struct inode * new_dir,struct dentry *new_dentry)
2214{
2215 handle_t *handle;
2216 struct inode * old_inode, * new_inode;
2217 struct buffer_head * old_bh, * new_bh, * dir_bh;
2218 struct ext4_dir_entry_2 * old_de, * new_de;
2219 int retval;
2220
2221 old_bh = new_bh = dir_bh = NULL;
2222
2223 /* Initialize quotas before so that eventual writes go
2224 * in separate transaction */
2225 if (new_dentry->d_inode)
2226 DQUOT_INIT(new_dentry->d_inode);
2227 handle = ext4_journal_start(old_dir, 2 *
2228 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2229 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
2230 if (IS_ERR(handle))
2231 return PTR_ERR(handle);
2232
2233 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2234 handle->h_sync = 1;
2235
2236 old_bh = ext4_find_entry (old_dentry, &old_de);
2237 /*
2238 * The check on the inode number is _not_ there because of possible IO errors.
2239 * We might rmdir the source, keep it as pwd of some process
2240 * and merrily kill the link to whatever was created under the
2241 * same name. Goodbye sticky bit ;-<
2242 */
2243 old_inode = old_dentry->d_inode;
2244 retval = -ENOENT;
2245 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2246 goto end_rename;
2247
2248 new_inode = new_dentry->d_inode;
2249 new_bh = ext4_find_entry (new_dentry, &new_de);
2250 if (new_bh) {
2251 if (!new_inode) {
2252 brelse (new_bh);
2253 new_bh = NULL;
2254 }
2255 }
2256 if (S_ISDIR(old_inode->i_mode)) {
2257 if (new_inode) {
2258 retval = -ENOTEMPTY;
2259 if (!empty_dir (new_inode))
2260 goto end_rename;
2261 }
2262 retval = -EIO;
2263 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
2264 if (!dir_bh)
2265 goto end_rename;
2266 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2267 goto end_rename;
2268 retval = -EMLINK;
2269 if (!new_inode && new_dir!=old_dir &&
2270 new_dir->i_nlink >= EXT4_LINK_MAX)
2271 goto end_rename;
2272 }
2273 if (!new_bh) {
2274 retval = ext4_add_entry (handle, new_dentry, old_inode);
2275 if (retval)
2276 goto end_rename;
2277 } else {
2278 BUFFER_TRACE(new_bh, "get write access");
2279 ext4_journal_get_write_access(handle, new_bh);
2280 new_de->inode = cpu_to_le32(old_inode->i_ino);
2281 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2282 EXT4_FEATURE_INCOMPAT_FILETYPE))
2283 new_de->file_type = old_de->file_type;
2284 new_dir->i_version++;
2285 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
2286 ext4_journal_dirty_metadata(handle, new_bh);
2287 brelse(new_bh);
2288 new_bh = NULL;
2289 }
2290
2291 /*
2292 * Like most other Unix systems, set the ctime for inodes on a
2293 * rename.
2294 */
2295 old_inode->i_ctime = CURRENT_TIME_SEC;
2296 ext4_mark_inode_dirty(handle, old_inode);
2297
2298 /*
2299 * ok, that's it
2300 */
2301 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2302 old_de->name_len != old_dentry->d_name.len ||
2303 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2304 (retval = ext4_delete_entry(handle, old_dir,
2305 old_de, old_bh)) == -ENOENT) {
2306 /* old_de could have moved from under us during htree split, so
2307 * make sure that we are deleting the right entry. We might
2308 * also be pointing to a stale entry in the unused part of
2309 * old_bh so just checking inum and the name isn't enough. */
2310 struct buffer_head *old_bh2;
2311 struct ext4_dir_entry_2 *old_de2;
2312
2313 old_bh2 = ext4_find_entry(old_dentry, &old_de2);
2314 if (old_bh2) {
2315 retval = ext4_delete_entry(handle, old_dir,
2316 old_de2, old_bh2);
2317 brelse(old_bh2);
2318 }
2319 }
2320 if (retval) {
2321 ext4_warning(old_dir->i_sb, "ext4_rename",
2322 "Deleting old file (%lu), %d, error=%d",
2323 old_dir->i_ino, old_dir->i_nlink, retval);
2324 }
2325
2326 if (new_inode) {
2327 drop_nlink(new_inode);
2328 new_inode->i_ctime = CURRENT_TIME_SEC;
2329 }
2330 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2331 ext4_update_dx_flag(old_dir);
2332 if (dir_bh) {
2333 BUFFER_TRACE(dir_bh, "get_write_access");
2334 ext4_journal_get_write_access(handle, dir_bh);
2335 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2336 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
2337 ext4_journal_dirty_metadata(handle, dir_bh);
2338 drop_nlink(old_dir);
2339 if (new_inode) {
2340 drop_nlink(new_inode);
2341 } else {
2342 inc_nlink(new_dir);
2343 ext4_update_dx_flag(new_dir);
2344 ext4_mark_inode_dirty(handle, new_dir);
2345 }
2346 }
2347 ext4_mark_inode_dirty(handle, old_dir);
2348 if (new_inode) {
2349 ext4_mark_inode_dirty(handle, new_inode);
2350 if (!new_inode->i_nlink)
2351 ext4_orphan_add(handle, new_inode);
2352 }
2353 retval = 0;
2354
2355end_rename:
2356 brelse (dir_bh);
2357 brelse (old_bh);
2358 brelse (new_bh);
2359 ext4_journal_stop(handle);
2360 return retval;
2361}
2362
2363/*
2364 * directories can handle most operations...
2365 */
2366struct inode_operations ext4_dir_inode_operations = {
2367 .create = ext4_create,
2368 .lookup = ext4_lookup,
2369 .link = ext4_link,
2370 .unlink = ext4_unlink,
2371 .symlink = ext4_symlink,
2372 .mkdir = ext4_mkdir,
2373 .rmdir = ext4_rmdir,
2374 .mknod = ext4_mknod,
2375 .rename = ext4_rename,
2376 .setattr = ext4_setattr,
2377#ifdef CONFIG_EXT4DEV_FS_XATTR
2378 .setxattr = generic_setxattr,
2379 .getxattr = generic_getxattr,
2380 .listxattr = ext4_listxattr,
2381 .removexattr = generic_removexattr,
2382#endif
2383 .permission = ext4_permission,
2384};
2385
2386struct inode_operations ext4_special_inode_operations = {
2387 .setattr = ext4_setattr,
2388#ifdef CONFIG_EXT4DEV_FS_XATTR
2389 .setxattr = generic_setxattr,
2390 .getxattr = generic_getxattr,
2391 .listxattr = ext4_listxattr,
2392 .removexattr = generic_removexattr,
2393#endif
2394 .permission = ext4_permission,
2395};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..1e9578052cd3
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1045 @@
1/*
2 * linux/fs/ext4/resize.c
3 *
4 * Support for resizing an ext4 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often used.
9 */
10
11
12#define EXT4FS_DEBUG
13
14#include <linux/sched.h>
15#include <linux/smp_lock.h>
16#include <linux/ext4_jbd2.h>
17
18#include <linux/errno.h>
19#include <linux/slab.h>
20
21
22#define outside(b, first, last) ((b) < (first) || (b) >= (last))
23#define inside(b, first, last) ((b) >= (first) && (b) < (last))
24
25static int verify_group_input(struct super_block *sb,
26 struct ext4_new_group_data *input)
27{
28 struct ext4_sb_info *sbi = EXT4_SB(sb);
29 struct ext4_super_block *es = sbi->s_es;
30 ext4_fsblk_t start = ext4_blocks_count(es);
31 ext4_fsblk_t end = start + input->blocks_count;
32 unsigned group = input->group;
33 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
34 unsigned overhead = ext4_bg_has_super(sb, group) ?
35 (1 + ext4_bg_num_gdb(sb, group) +
36 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
37 ext4_fsblk_t metaend = start + overhead;
38 struct buffer_head *bh = NULL;
39 ext4_grpblk_t free_blocks_count, offset;
40 int err = -EINVAL;
41
42 input->free_blocks_count = free_blocks_count =
43 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
44
45 if (test_opt(sb, DEBUG))
46 printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
47 "(%d free, %u reserved)\n",
48 ext4_bg_has_super(sb, input->group) ? "normal" :
49 "no-super", input->group, input->blocks_count,
50 free_blocks_count, input->reserved_blocks);
51
52 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
53 if (group != sbi->s_groups_count)
54 ext4_warning(sb, __FUNCTION__,
55 "Cannot add at group %u (only %lu groups)",
56 input->group, sbi->s_groups_count);
57 else if (offset != 0)
58 ext4_warning(sb, __FUNCTION__, "Last group not full");
59 else if (input->reserved_blocks > input->blocks_count / 5)
60 ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
61 input->reserved_blocks);
62 else if (free_blocks_count < 0)
63 ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
64 input->blocks_count);
65 else if (!(bh = sb_bread(sb, end - 1)))
66 ext4_warning(sb, __FUNCTION__,
67 "Cannot read last block (%llu)",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext4_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %llu)",
72 input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext4_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %llu)",
76 input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext4_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %llu-%llu)",
81 input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext4_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%llu)",
85 input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext4_warning(sb, __FUNCTION__,
88 "Block bitmap (%llu) in inode table (%llu-%llu)",
89 input->block_bitmap, input->inode_table, itend-1);
90 else if (inside(input->inode_bitmap, input->inode_table, itend))
91 ext4_warning(sb, __FUNCTION__,
92 "Inode bitmap (%llu) in inode table (%llu-%llu)",
93 input->inode_bitmap, input->inode_table, itend-1);
94 else if (inside(input->block_bitmap, start, metaend))
95 ext4_warning(sb, __FUNCTION__,
96 "Block bitmap (%llu) in GDT table"
97 " (%llu-%llu)",
98 input->block_bitmap, start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend))
100 ext4_warning(sb, __FUNCTION__,
101 "Inode bitmap (%llu) in GDT table"
102 " (%llu-%llu)",
103 input->inode_bitmap, start, metaend - 1);
104 else if (inside(input->inode_table, start, metaend) ||
105 inside(itend - 1, start, metaend))
106 ext4_warning(sb, __FUNCTION__,
107 "Inode table (%llu-%llu) overlaps"
108 "GDT table (%llu-%llu)",
109 input->inode_table, itend - 1, start, metaend - 1);
110 else
111 err = 0;
112 brelse(bh);
113
114 return err;
115}
116
117static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 ext4_fsblk_t blk)
119{
120 struct buffer_head *bh;
121 int err;
122
123 bh = sb_getblk(sb, blk);
124 if (!bh)
125 return ERR_PTR(-EIO);
126 if ((err = ext4_journal_get_write_access(handle, bh))) {
127 brelse(bh);
128 bh = ERR_PTR(err);
129 } else {
130 lock_buffer(bh);
131 memset(bh->b_data, 0, sb->s_blocksize);
132 set_buffer_uptodate(bh);
133 unlock_buffer(bh);
134 }
135
136 return bh;
137}
138
139/*
140 * To avoid calling the atomic setbit hundreds or thousands of times, we only
141 * need to use it within a single byte (to ensure we get endianness right).
142 * We can use memset for the rest of the bitmap as there are no other users.
143 */
144static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
145{
146 int i;
147
148 if (start_bit >= end_bit)
149 return;
150
151 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
152 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
153 ext4_set_bit(i, bitmap);
154 if (i < end_bit)
155 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
156}
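/*
 * Editor's note -- worked example, not part of the original patch.
 * With start_bit = 12 and end_bit = 32, the loop above sets bits 12..15
 * one at a time (up to the byte boundary at bit 16), then the memset
 * writes 0xff into bytes 2 and 3, covering bits 16..31 with two plain
 * stores instead of sixteen atomic ops.
 */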
157
158/*
159 * Set up the block and inode bitmaps, and the inode table for the new group.
160 * This doesn't need to be part of the main transaction, since we are only
161 * changing blocks outside the actual filesystem. We still do journaling to
162 * ensure the recovery is correct in case of a failure just after resize.
163 * If any part of this fails, we simply abort the resize.
164 */
165static int setup_new_group_blocks(struct super_block *sb,
166 struct ext4_new_group_data *input)
167{
168 struct ext4_sb_info *sbi = EXT4_SB(sb);
169 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
170 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
171 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
172 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
173 struct buffer_head *bh;
174 handle_t *handle;
175 ext4_fsblk_t block;
176 ext4_grpblk_t bit;
177 int i;
178 int err = 0, err2;
179
180 handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
181 2 + sbi->s_itb_per_group);
182 if (IS_ERR(handle))
183 return PTR_ERR(handle);
184
185 lock_super(sb);
186 if (input->group != sbi->s_groups_count) {
187 err = -EBUSY;
188 goto exit_journal;
189 }
190
191 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
192 err = PTR_ERR(bh);
193 goto exit_journal;
194 }
195
196 if (ext4_bg_has_super(sb, input->group)) {
197 ext4_debug("mark backup superblock %#04lx (+0)\n", start);
198 ext4_set_bit(0, bh->b_data);
199 }
200
201 /* Copy all of the GDT blocks into the backup in this group */
202 for (i = 0, bit = 1, block = start + 1;
203 i < gdblocks; i++, block++, bit++) {
204 struct buffer_head *gdb;
205
206 ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
207
208 gdb = sb_getblk(sb, block);
209 if (!gdb) {
210 err = -EIO;
211 goto exit_bh;
212 }
213 if ((err = ext4_journal_get_write_access(handle, gdb))) {
214 brelse(gdb);
215 goto exit_bh;
216 }
217 lock_buffer(gdb);
218 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
219 set_buffer_uptodate(gdb);
220 unlock_buffer(gdb);
221 ext4_journal_dirty_metadata(handle, gdb);
222 ext4_set_bit(bit, bh->b_data);
223 brelse(gdb);
224 }
225
226 /* Zero out all of the reserved backup group descriptor table blocks */
227 for (i = 0, bit = gdblocks + 1, block = start + bit;
228 i < reserved_gdb; i++, block++, bit++) {
229 struct buffer_head *gdb;
230
231 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
232
233 if (IS_ERR(gdb = bclean(handle, sb, block))) {
234 err = PTR_ERR(gdb);
235 goto exit_bh;
236 }
237 ext4_journal_dirty_metadata(handle, gdb);
238 ext4_set_bit(bit, bh->b_data);
239 brelse(gdb);
240 }
241 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
242 input->block_bitmap - start);
243 ext4_set_bit(input->block_bitmap - start, bh->b_data);
244 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
245 input->inode_bitmap - start);
246 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
247
248 /* Zero out all of the inode table blocks */
249 for (i = 0, block = input->inode_table, bit = block - start;
250 i < sbi->s_itb_per_group; i++, bit++, block++) {
251 struct buffer_head *it;
252
253 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
254 if (IS_ERR(it = bclean(handle, sb, block))) {
255 err = PTR_ERR(it);
256 goto exit_bh;
257 }
258 ext4_journal_dirty_metadata(handle, it);
259 brelse(it);
260 ext4_set_bit(bit, bh->b_data);
261 }
262 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
263 bh->b_data);
264 ext4_journal_dirty_metadata(handle, bh);
265 brelse(bh);
266
267 /* Mark unused entries in inode bitmap used */
268 ext4_debug("clear inode bitmap %#04x (+%ld)\n",
269 input->inode_bitmap, input->inode_bitmap - start);
270 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
271 err = PTR_ERR(bh);
272 goto exit_journal;
273 }
274
275 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
276 bh->b_data);
277 ext4_journal_dirty_metadata(handle, bh);
278exit_bh:
279 brelse(bh);
280
281exit_journal:
282 unlock_super(sb);
283 if ((err2 = ext4_journal_stop(handle)) && !err)
284 err = err2;
285
286 return err;
287}
288
289
290/*
291 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
292 * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before
293 * calling this for the first time. In a sparse filesystem it will be the
294 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
295 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
296 */
297static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
298 unsigned *five, unsigned *seven)
299{
300 unsigned *min = three;
301 int mult = 3;
302 unsigned ret;
303
304 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
305 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
306 ret = *min;
307 *min += 1;
308 return ret;
309 }
310
311 if (*five < *min) {
312 min = five;
313 mult = 5;
314 }
315 if (*seven < *min) {
316 min = seven;
317 mult = 7;
318 }
319
320 ret = *min;
321 *min *= mult;
322
323 return ret;
324}
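/*
 * Editor's note -- illustrative sketch, not part of the original patch.
 * A hypothetical caller, mirroring the loops in verify_reserved_gdb()
 * and update_backups() below: with sparse_super this prints 1, 3, 5,
 * 7, 9, 25, 27, 49, 81, ... until the group count is reached.
 */
#if 0
static void example_print_backups(struct super_block *sb)
{
	unsigned three = 1, five = 5, seven = 7, group;

	while ((group = ext4_list_backups(sb, &three, &five, &seven)) <
	       EXT4_SB(sb)->s_groups_count)
		printk(KERN_DEBUG "backup metadata in group %u\n", group);
}
#endif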
325
326/*
327 * Check that all of the backup GDT blocks are held in the primary GDT block.
328 * It is assumed that they are stored in group order. Returns the number of
329 * groups in the current filesystem that have backups, or a negative error code.
330 */
331static int verify_reserved_gdb(struct super_block *sb,
332 struct buffer_head *primary)
333{
334 const ext4_fsblk_t blk = primary->b_blocknr;
335 const unsigned long end = EXT4_SB(sb)->s_groups_count;
336 unsigned three = 1;
337 unsigned five = 5;
338 unsigned seven = 7;
339 unsigned grp;
340 __le32 *p = (__le32 *)primary->b_data;
341 int gdbackups = 0;
342
343 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
344 if (le32_to_cpu(*p++) !=
345 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
346 ext4_warning(sb, __FUNCTION__,
347 "reserved GDT %llu"
348 " missing grp %d (%llu)",
349 blk, grp,
350 grp *
351 (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
352 blk);
353 return -EINVAL;
354 }
355 if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
356 return -EFBIG;
357 }
358
359 return gdbackups;
360}
361
362/*
363 * Called when we need to bring a reserved group descriptor table block into
364 * use from the resize inode. The primary copy of the new GDT block currently
365 * is an indirect block (under the double indirect block in the resize inode).
366 * The new backup GDT blocks will be stored as leaf blocks in this indirect
367 * block, in group order. Even though we know all the block numbers we need,
368 * we check to ensure that the resize inode has actually reserved these blocks.
369 *
370 * Don't need to update the block bitmaps because the blocks are still in use.
371 *
372 * We get all of the error cases out of the way, so that we are sure to not
373 * fail once we start modifying the data on disk, because JBD has no rollback.
374 */
375static int add_new_gdb(handle_t *handle, struct inode *inode,
376 struct ext4_new_group_data *input,
377 struct buffer_head **primary)
378{
379 struct super_block *sb = inode->i_sb;
380 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
381 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
382 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
383 struct buffer_head **o_group_desc, **n_group_desc;
384 struct buffer_head *dind;
385 int gdbackups;
386 struct ext4_iloc iloc;
387 __le32 *data;
388 int err;
389
390 if (test_opt(sb, DEBUG))
391 printk(KERN_DEBUG
392 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
393 gdb_num);
394
395 /*
396 * If we are not using the primary superblock/GDT copy don't resize,
397 * because the user tools have no way of handling this. Probably a
398 * bad time to do it anyways.
399 */
400 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
401 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
402 ext4_warning(sb, __FUNCTION__,
403 "won't resize using backup superblock at %llu",
404 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
405 return -EPERM;
406 }
407
408 *primary = sb_bread(sb, gdblock);
409 if (!*primary)
410 return -EIO;
411
412 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
413 err = gdbackups;
414 goto exit_bh;
415 }
416
417 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
418 dind = sb_bread(sb, le32_to_cpu(*data));
419 if (!dind) {
420 err = -EIO;
421 goto exit_bh;
422 }
423
424 data = (__le32 *)dind->b_data;
425 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
426 ext4_warning(sb, __FUNCTION__,
427 "new group %u GDT block %llu not reserved",
428 input->group, gdblock);
429 err = -EINVAL;
430 goto exit_dind;
431 }
432
433 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
434 goto exit_dind;
435
436 if ((err = ext4_journal_get_write_access(handle, *primary)))
437 goto exit_sbh;
438
439 if ((err = ext4_journal_get_write_access(handle, dind)))
440 goto exit_primary;
441
442 /* ext4_reserve_inode_write() gets a reference on the iloc */
443 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
444 goto exit_dindj;
445
446 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
447 GFP_KERNEL);
448 if (!n_group_desc) {
449 err = -ENOMEM;
450 ext4_warning (sb, __FUNCTION__,
451 "not enough memory for %lu groups", gdb_num + 1);
452 goto exit_inode;
453 }
454
455 /*
456 * Finally, we have all of the possible failures behind us...
457 *
458 * Remove new GDT block from inode double-indirect block and clear out
459 * the new GDT block for use (which also "frees" the backup GDT blocks
460 * from the reserved inode). We don't need to change the bitmaps for
461 * these blocks, because they are marked as in-use from being in the
462 * reserved inode, and will become GDT blocks (primary and backup).
463 */
464 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
465 ext4_journal_dirty_metadata(handle, dind);
466 brelse(dind);
467 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
468 ext4_mark_iloc_dirty(handle, inode, &iloc);
469 memset((*primary)->b_data, 0, sb->s_blocksize);
470 ext4_journal_dirty_metadata(handle, *primary);
471
472 o_group_desc = EXT4_SB(sb)->s_group_desc;
473 memcpy(n_group_desc, o_group_desc,
474 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
475 n_group_desc[gdb_num] = *primary;
476 EXT4_SB(sb)->s_group_desc = n_group_desc;
477 EXT4_SB(sb)->s_gdb_count++;
478 kfree(o_group_desc);
479
480 es->s_reserved_gdt_blocks =
481 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
482 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
483
484 return 0;
485
486exit_inode:
487 //ext4_journal_release_buffer(handle, iloc.bh);
488 brelse(iloc.bh);
489exit_dindj:
490 //ext4_journal_release_buffer(handle, dind);
491exit_primary:
492 //ext4_journal_release_buffer(handle, *primary);
493exit_sbh:
494 //ext4_journal_release_buffer(handle, *primary);
495exit_dind:
496 brelse(dind);
497exit_bh:
498 brelse(*primary);
499
500 ext4_debug("leaving with error %d\n", err);
501 return err;
502}
503
504/*
505 * Called when we are adding a new group which has a backup copy of each of
506 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
507 * We need to add these reserved backup GDT blocks to the resize inode, so
508 * that they are kept for future resizing and not allocated to files.
509 *
510 * Each reserved backup GDT block will go into a different indirect block.
511 * The indirect blocks are actually the primary reserved GDT blocks,
512 * so we know in advance what their block numbers are. We only get the
513 * double-indirect block to verify it is pointing to the primary reserved
514 * GDT blocks so we don't overwrite a data block by accident. The reserved
515 * backup GDT blocks are stored in their reserved primary GDT block.
516 */
517static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
518 struct ext4_new_group_data *input)
519{
520 struct super_block *sb = inode->i_sb;
521 int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
522 struct buffer_head **primary;
523 struct buffer_head *dind;
524 struct ext4_iloc iloc;
525 ext4_fsblk_t blk;
526 __le32 *data, *end;
527 int gdbackups = 0;
528 int res, i;
529 int err;
530
531 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
532 if (!primary)
533 return -ENOMEM;
534
535 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
536 dind = sb_bread(sb, le32_to_cpu(*data));
537 if (!dind) {
538 err = -EIO;
539 goto exit_free;
540 }
541
542 blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
543 data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
544 end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
545
546 /* Get each reserved primary GDT block and verify it holds backups */
547 for (res = 0; res < reserved_gdb; res++, blk++) {
548 if (le32_to_cpu(*data) != blk) {
549 ext4_warning(sb, __FUNCTION__,
550 "reserved block %llu"
551 " not at offset %ld",
552 blk,
553 (long)(data - (__le32 *)dind->b_data));
554 err = -EINVAL;
555 goto exit_bh;
556 }
557 primary[res] = sb_bread(sb, blk);
558 if (!primary[res]) {
559 err = -EIO;
560 goto exit_bh;
561 }
562 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
563 brelse(primary[res]);
564 err = gdbackups;
565 goto exit_bh;
566 }
567 if (++data >= end)
568 data = (__le32 *)dind->b_data;
569 }
570
571 for (i = 0; i < reserved_gdb; i++) {
572 if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
573 /*
574 int j;
575 for (j = 0; j < i; j++)
576 ext4_journal_release_buffer(handle, primary[j]);
577 */
578 goto exit_bh;
579 }
580 }
581
582 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
583 goto exit_bh;
584
585 /*
586 * Finally we can add each of the reserved backup GDT blocks from
587 * the new group to its reserved primary GDT block.
588 */
589 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
590 for (i = 0; i < reserved_gdb; i++) {
591 int err2;
592 data = (__le32 *)primary[i]->b_data;
593 /* printk("reserving backup %lu[%u] = %lu\n",
594 primary[i]->b_blocknr, gdbackups,
595 blk + primary[i]->b_blocknr); */
596 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
597 err2 = ext4_journal_dirty_metadata(handle, primary[i]);
598 if (!err)
599 err = err2;
600 }
601 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
602 ext4_mark_iloc_dirty(handle, inode, &iloc);
603
604exit_bh:
605 while (--res >= 0)
606 brelse(primary[res]);
607 brelse(dind);
608
609exit_free:
610 kfree(primary);
611
612 return err;
613}
614
615/*
616 * Update the backup copies of the ext4 metadata. These don't need to be part
617 * of the main resize transaction, because e2fsck will re-write them if there
618 * is a problem (basically only OOM will cause a problem). However, we
619 * _should_ update the backups if possible, in case the primary gets trashed
620 * for some reason and we need to run e2fsck from a backup superblock. The
621 * important part is that the new block and inode counts are in the backup
622 * superblocks, and the location of the new group metadata in the GDT backups.
623 *
624 * We do not need lock_super() for this, because these blocks are not
625 * otherwise touched by the filesystem code when it is mounted. We don't
626 * need to worry about last changing from sbi->s_groups_count, because the
627 * worst that can happen is that we do not copy the full number of backups
628 * at this time. The resize which changed s_groups_count will backup again.
629 */
630static void update_backups(struct super_block *sb,
631 int blk_off, char *data, int size)
632{
633 struct ext4_sb_info *sbi = EXT4_SB(sb);
634 const unsigned long last = sbi->s_groups_count;
635 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
636 unsigned three = 1;
637 unsigned five = 5;
638 unsigned seven = 7;
639 unsigned group;
640 int rest = sb->s_blocksize - size;
641 handle_t *handle;
642 int err = 0, err2;
643
644 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
645 if (IS_ERR(handle)) {
646 group = 1;
647 err = PTR_ERR(handle);
648 goto exit_err;
649 }
650
651 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
652 struct buffer_head *bh;
653
654 /* Out of journal space, and can't get more - abort - so sad */
655 if (handle->h_buffer_credits == 0 &&
656 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
657 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
658 break;
659
660 bh = sb_getblk(sb, group * bpg + blk_off);
661 if (!bh) {
662 err = -EIO;
663 break;
664 }
665 ext4_debug("update metadata backup %#04lx\n",
666 (unsigned long)bh->b_blocknr);
667 if ((err = ext4_journal_get_write_access(handle, bh)))
668 break;
669 lock_buffer(bh);
670 memcpy(bh->b_data, data, size);
671 if (rest)
672 memset(bh->b_data + size, 0, rest);
673 set_buffer_uptodate(bh);
674 unlock_buffer(bh);
675 ext4_journal_dirty_metadata(handle, bh);
676 brelse(bh);
677 }
678 if ((err2 = ext4_journal_stop(handle)) && !err)
679 err = err2;
680
681 /*
682 * Ugh! Need to have e2fsck write the backup copies. It is too
683 * late to revert the resize, we shouldn't fail just because of
684 * the backup copies (they are only needed in case of corruption).
685 *
686 * However, if we got here we have a journal problem too, so we
687 * can't really start a transaction to mark the superblock.
688 * Chicken out and just set the flag on the hope it will be written
689 * to disk, and if not - we will simply wait until next fsck.
690 */
691exit_err:
692 if (err) {
693 ext4_warning(sb, __FUNCTION__,
694 "can't update backup for group %d (err %d), "
695 "forcing fsck on next reboot", group, err);
696 sbi->s_mount_state &= ~EXT4_VALID_FS;
697 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
698 mark_buffer_dirty(sbi->s_sbh);
699 }
700}
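
/*
 * A minimal sketch of the backup-group enumeration that
 * ext4_list_backups() performs for the loop in update_backups() above
 * (the helper itself is defined earlier in resize.c and is not shown
 * in this hunk). For SPARSE_SUPER filesystems the backups live in
 * group 1 and in groups that are powers of 3, 5 and 7; this assumes
 * the same three/five/seven cursor convention, seeded with 1, 5 and 7:
 */
static unsigned list_backups_sketch(unsigned *three, unsigned *five,
				    unsigned *seven)
{
	unsigned *min = three;	/* pick the smallest of the three cursors */
	int mult = 3;
	unsigned ret;

	if (*five < *min) {
		min = five;
		mult = 5;
	}
	if (*seven < *min) {
		min = seven;
		mult = 7;
	}
	ret = *min;
	*min *= mult;		/* advance the cursor just consumed */
	return ret;		/* yields 1, 3, 5, 7, 9, 25, 27, 49, ... */
}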
701
702/* Add group descriptor data to an existing or new group descriptor block.
703 * Ensure we handle all possible error conditions _before_ we start modifying
704 * the filesystem, because we cannot abort the transaction and not have it
705 * write the data to disk.
706 *
707 * If we are on a GDT block boundary, we need to get the reserved GDT block.
708 * Otherwise, we may need to add backup GDT blocks for a sparse group.
709 *
710 * We only need to hold the superblock lock while we are actually adding
711 * in the new group's counts to the superblock. Prior to that we have
712 * not really "added" the group at all. We re-check that we are still
713 * adding in the last group in case things have changed since verifying.
714 */
715int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
716{
717 struct ext4_sb_info *sbi = EXT4_SB(sb);
718 struct ext4_super_block *es = sbi->s_es;
719 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
720 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
721 struct buffer_head *primary = NULL;
722 struct ext4_group_desc *gdp;
723 struct inode *inode = NULL;
724 handle_t *handle;
725 int gdb_off, gdb_num;
726 int err, err2;
727
728 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
729 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
730
731 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
732 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
733 ext4_warning(sb, __FUNCTION__,
734 "Can't resize non-sparse filesystem further");
735 return -EPERM;
736 }
737
738 if (ext4_blocks_count(es) + input->blocks_count <
739 ext4_blocks_count(es)) {
740 ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n");
741 return -EINVAL;
742 }
743
744 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
745 le32_to_cpu(es->s_inodes_count)) {
746 ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n");
747 return -EINVAL;
748 }
749
750 if (reserved_gdb || gdb_off == 0) {
751 if (!EXT4_HAS_COMPAT_FEATURE(sb,
752 EXT4_FEATURE_COMPAT_RESIZE_INODE)){
753 ext4_warning(sb, __FUNCTION__,
754 "No reserved GDT blocks, can't resize");
755 return -EPERM;
756 }
757 inode = iget(sb, EXT4_RESIZE_INO);
758 if (!inode || is_bad_inode(inode)) {
759 ext4_warning(sb, __FUNCTION__,
760 "Error opening resize inode");
761 iput(inode);
762 return -ENOENT;
763 }
764 }
765
766 if ((err = verify_group_input(sb, input)))
767 goto exit_put;
768
769 if ((err = setup_new_group_blocks(sb, input)))
770 goto exit_put;
771
772 /*
773 * We will always be modifying at least the superblock and a GDT
774 * block. If we are adding a group past the last current GDT block,
775 * we will also modify the inode and the dindirect block. If we
776 * are adding a group with superblock/GDT backups we will also
777 * modify each of the reserved GDT dindirect blocks.
778 */
779 handle = ext4_journal_start_sb(sb,
780 ext4_bg_has_super(sb, input->group) ?
781 3 + reserved_gdb : 4);
782 if (IS_ERR(handle)) {
783 err = PTR_ERR(handle);
784 goto exit_put;
785 }
786
787 lock_super(sb);
788 if (input->group != sbi->s_groups_count) {
789 ext4_warning(sb, __FUNCTION__,
790 "multiple resizers run on filesystem!");
791 err = -EBUSY;
792 goto exit_journal;
793 }
794
795 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
796 goto exit_journal;
797
798 /*
799 * We will only either add reserved group blocks to a backup group
800 * or remove reserved blocks for the first group in a new group block.
801 * Doing both would mean more complex code, and sane people don't
802 * use non-sparse filesystems anymore. This is already checked above.
803 */
804 if (gdb_off) {
805 primary = sbi->s_group_desc[gdb_num];
806 if ((err = ext4_journal_get_write_access(handle, primary)))
807 goto exit_journal;
808
809 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
810 (err = reserve_backup_gdb(handle, inode, input)))
811 goto exit_journal;
812 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
813 goto exit_journal;
814
815 /*
816 * OK, now we've set up the new group. Time to make it active.
817 *
818 * Current kernels don't lock all allocations via lock_super(),
819 * so we have to be safe wrt. concurrent accesses to the group
820 * data. So we need to be careful to set all of the relevant
821 * group descriptor data etc. *before* we enable the group.
822 *
823 * The key field here is sbi->s_groups_count: as long as
824 * that retains its old value, nobody is going to access the new
825 * group.
826 *
827 * So first we update all the descriptor metadata for the new
828 * group; then we update the total disk blocks count; then we
829 * update the groups count to enable the group; then finally we
830 * update the free space counts so that the system can start
831 * using the new disk blocks.
832 */
833
834 /* Update group descriptor block for new group */
835 gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
836
837 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
838 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
839 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
840 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
841 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
842
843 /*
844 * Make the new blocks and inodes valid next. We do this before
845 * increasing the group count so that once the group is enabled,
846 * all of its blocks and inodes are already valid.
847 *
848 * We always allocate group-by-group, then block-by-block or
849 * inode-by-inode within a group, so enabling these
850 * blocks/inodes before the group is live won't actually let us
851 * allocate the new space yet.
852 */
853 ext4_blocks_count_set(es, ext4_blocks_count(es) +
854 input->blocks_count);
855 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
856 EXT4_INODES_PER_GROUP(sb));
857
858 /*
859 * We need to protect s_groups_count against other CPUs seeing
860 * inconsistent state in the superblock.
861 *
862 * The precise rules we use are:
863 *
864 * * Writers of s_groups_count *must* hold lock_super
865 * AND
866 * * Writers must perform a smp_wmb() after updating all dependent
867 * data and before modifying the groups count
868 *
869 * * Readers must hold lock_super() over the access
870 * OR
871 * * Readers must perform an smp_rmb() after reading the groups count
872 * and before reading any dependent data.
873 *
874 * NB. These rules can be relaxed when checking the group count
875 * while freeing data, as we can only allocate from a block
876 * group after serialising against the group count, and we can
877 * only then free after serialising in turn against that
878 * allocation.
879 */
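	/*
	 * Sketch of the reader side these rules imply (illustrative):
	 *
	 *	groups = sbi->s_groups_count;
	 *	smp_rmb();	pairs with the smp_wmb() below
	 *	... read descriptor data for any group < groups ...
	 */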
880 smp_wmb();
881
882 /* Update the global fs size fields */
883 sbi->s_groups_count++;
884
885 ext4_journal_dirty_metadata(handle, primary);
886
887 /* Update the reserved block counts only once the new group is
888 * active. */
889 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
890 input->reserved_blocks);
891
892 /* Update the free space counts */
893 percpu_counter_mod(&sbi->s_freeblocks_counter,
894 input->free_blocks_count);
895 percpu_counter_mod(&sbi->s_freeinodes_counter,
896 EXT4_INODES_PER_GROUP(sb));
897
898 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
899 sb->s_dirt = 1;
900
901exit_journal:
902 unlock_super(sb);
903 if ((err2 = ext4_journal_stop(handle)) && !err)
904 err = err2;
905 if (!err) {
906 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
907 sizeof(struct ext4_super_block));
908 update_backups(sb, primary->b_blocknr, primary->b_data,
909 primary->b_size);
910 }
911exit_put:
912 iput(inode);
913 return err;
914} /* ext4_group_add */
915
916/* Extend the filesystem to the new number of blocks specified. This entry
917 * point is only used to extend the current filesystem to the end of the last
918 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
919 * for emergencies (because it has no dependencies on reserved blocks).
920 *
921 * If we _really_ wanted, we could use default values to call ext4_group_add()
922 * and allow the "remount" trick to work for arbitrary resizing, assuming enough
923 * GDT blocks are reserved to grow to the desired size.
924 */
925int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
926 ext4_fsblk_t n_blocks_count)
927{
928 ext4_fsblk_t o_blocks_count;
929 unsigned long o_groups_count;
930 ext4_grpblk_t last;
931 ext4_grpblk_t add;
932 struct buffer_head * bh;
933 handle_t *handle;
934 int err;
935 unsigned long freed_blocks;
936
937 /* We don't need to worry about locking wrt other resizers just
938 * yet: we're going to revalidate es->s_blocks_count after
939 * taking lock_super() below. */
940 o_blocks_count = ext4_blocks_count(es);
941 o_groups_count = EXT4_SB(sb)->s_groups_count;
942
943 if (test_opt(sb, DEBUG))
944		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu up to %llu blocks\n",
945 o_blocks_count, n_blocks_count);
946
947 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
948 return 0;
949
950 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
951 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
952 " too large to resize to %llu blocks safely\n",
953 sb->s_id, n_blocks_count);
954 if (sizeof(sector_t) < 8)
955 ext4_warning(sb, __FUNCTION__,
956 "CONFIG_LBD not enabled\n");
957 return -EINVAL;
958 }
959
960 if (n_blocks_count < o_blocks_count) {
961 ext4_warning(sb, __FUNCTION__,
962 "can't shrink FS - resize aborted");
963 return -EBUSY;
964 }
965
966 /* Handle the remaining blocks in the last group only. */
967 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
968
969 if (last == 0) {
970 ext4_warning(sb, __FUNCTION__,
971 "need to use ext2online to resize further");
972 return -EPERM;
973 }
974
975 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
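	/*
	 * Illustrative numbers: with 32768 blocks per group and the old
	 * size ending 1000 blocks into the last group (last == 1000),
	 * add starts at 31768; the checks below then trim it so we
	 * never grow past n_blocks_count.
	 */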
976
977 if (o_blocks_count + add < o_blocks_count) {
978 ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
979 return -EINVAL;
980 }
981
982 if (o_blocks_count + add > n_blocks_count)
983 add = n_blocks_count - o_blocks_count;
984
985 if (o_blocks_count + add < n_blocks_count)
986 ext4_warning(sb, __FUNCTION__,
987 "will only finish group (%llu"
988 " blocks, %u new)",
989 o_blocks_count + add, add);
990
991 /* See if the device is actually as big as what was requested */
992	bh = sb_bread(sb, o_blocks_count + add - 1);
993 if (!bh) {
994 ext4_warning(sb, __FUNCTION__,
995 "can't read last block, resize aborted");
996 return -ENOSPC;
997 }
998 brelse(bh);
999
1000 /* We will update the superblock, one block bitmap, and
1001 * one group descriptor via ext4_free_blocks().
1002 */
1003 handle = ext4_journal_start_sb(sb, 3);
1004 if (IS_ERR(handle)) {
1005 err = PTR_ERR(handle);
1006		ext4_warning(sb, __FUNCTION__, "error %d on journal start", err);
1007 goto exit_put;
1008 }
1009
1010 lock_super(sb);
1011 if (o_blocks_count != ext4_blocks_count(es)) {
1012 ext4_warning(sb, __FUNCTION__,
1013 "multiple resizers run on filesystem!");
1014 unlock_super(sb);
1015 err = -EBUSY;
1016 goto exit_put;
1017 }
1018
1019 if ((err = ext4_journal_get_write_access(handle,
1020 EXT4_SB(sb)->s_sbh))) {
1021 ext4_warning(sb, __FUNCTION__,
1022 "error %d on journal write access", err);
1023 unlock_super(sb);
1024 ext4_journal_stop(handle);
1025 goto exit_put;
1026 }
1027 ext4_blocks_count_set(es, o_blocks_count + add);
1028 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1029 sb->s_dirt = 1;
1030 unlock_super(sb);
1031 ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
1032 o_blocks_count + add);
1033 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1034 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1035 o_blocks_count + add);
1036 if ((err = ext4_journal_stop(handle)))
1037 goto exit_put;
1038 if (test_opt(sb, DEBUG))
1039 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1040 ext4_blocks_count(es));
1041 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1042 sizeof(struct ext4_super_block));
1043exit_put:
1044 return err;
1045} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
1/*
2 * linux/fs/ext4/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38
39#include <asm/uaccess.h>
40
41#include "xattr.h"
42#include "acl.h"
43#include "namei.h"
44
45static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
46 unsigned long journal_devnum);
47static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
48 unsigned int);
49static void ext4_commit_super (struct super_block * sb,
50 struct ext4_super_block * es,
51 int sync);
52static void ext4_mark_recovery_complete(struct super_block * sb,
53 struct ext4_super_block * es);
54static void ext4_clear_journal_err(struct super_block * sb,
55 struct ext4_super_block * es);
56static int ext4_sync_fs(struct super_block *sb, int wait);
57static const char *ext4_decode_error(struct super_block * sb, int errno,
58 char nbuf[16]);
59static int ext4_remount (struct super_block * sb, int * flags, char * data);
60static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
61static void ext4_unlockfs(struct super_block *sb);
62static void ext4_write_super (struct super_block * sb);
63static void ext4_write_super_lockfs(struct super_block *sb);
64
65
66ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
67 struct ext4_group_desc *bg)
68{
69 return le32_to_cpu(bg->bg_block_bitmap) |
70 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
71 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
72}
73
74ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
75 struct ext4_group_desc *bg)
76{
77 return le32_to_cpu(bg->bg_inode_bitmap) |
78 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
79 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
80}
81
82ext4_fsblk_t ext4_inode_table(struct super_block *sb,
83 struct ext4_group_desc *bg)
84{
85 return le32_to_cpu(bg->bg_inode_table) |
86 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
87 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
88}
89
90void ext4_block_bitmap_set(struct super_block *sb,
91 struct ext4_group_desc *bg, ext4_fsblk_t blk)
92{
93 bg->bg_block_bitmap = cpu_to_le32((u32)blk);
94 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
95 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
96}
97
98void ext4_inode_bitmap_set(struct super_block *sb,
99 struct ext4_group_desc *bg, ext4_fsblk_t blk)
100{
101 bg->bg_inode_bitmap = cpu_to_le32((u32)blk);
102 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
103 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
104}
105
106void ext4_inode_table_set(struct super_block *sb,
107 struct ext4_group_desc *bg, ext4_fsblk_t blk)
108{
109 bg->bg_inode_table = cpu_to_le32((u32)blk);
110 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
111 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
112}
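
/*
 * The three pairs of helpers above split a 64-bit block number across
 * the classic 32-bit descriptor field and its _hi extension. For
 * example, blk == 0x100000010 stores 0x00000010 in bg_block_bitmap and
 * 0x1 in bg_block_bitmap_hi; the high word is only written (and read
 * back) when EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT.
 */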
113
114/*
115 * Wrappers for jbd2_journal_start/end.
116 *
117 * The only special thing we need to do here is to make sure that all
118 * journal_end calls result in the superblock being marked dirty, so
119 * that sync() will call the filesystem's write_super callback if
120 * appropriate.
121 */
122handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
123{
124 journal_t *journal;
125
126 if (sb->s_flags & MS_RDONLY)
127 return ERR_PTR(-EROFS);
128
129 /* Special case here: if the journal has aborted behind our
130 * backs (eg. EIO in the commit thread), then we still need to
131 * take the FS itself readonly cleanly. */
132 journal = EXT4_SB(sb)->s_journal;
133 if (is_journal_aborted(journal)) {
134 ext4_abort(sb, __FUNCTION__,
135 "Detected aborted journal");
136 return ERR_PTR(-EROFS);
137 }
138
139 return jbd2_journal_start(journal, nblocks);
140}
141
142/*
143 * The only special thing we need to do here is to make sure that all
144 * jbd2_journal_stop calls result in the superblock being marked dirty, so
145 * that sync() will call the filesystem's write_super callback if
146 * appropriate.
147 */
148int __ext4_journal_stop(const char *where, handle_t *handle)
149{
150 struct super_block *sb;
151 int err;
152 int rc;
153
154 sb = handle->h_transaction->t_journal->j_private;
155 err = handle->h_err;
156 rc = jbd2_journal_stop(handle);
157
158 if (!err)
159 err = rc;
160 if (err)
161 __ext4_std_error(sb, where, err);
162 return err;
163}
164
165void ext4_journal_abort_handle(const char *caller, const char *err_fn,
166 struct buffer_head *bh, handle_t *handle, int err)
167{
168 char nbuf[16];
169 const char *errstr = ext4_decode_error(NULL, err, nbuf);
170
171 if (bh)
172 BUFFER_TRACE(bh, "abort");
173
174 if (!handle->h_err)
175 handle->h_err = err;
176
177 if (is_handle_aborted(handle))
178 return;
179
180 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
181 caller, errstr, err_fn);
182
183 jbd2_journal_abort_handle(handle);
184}
185
186/* Deal with the reporting of failure conditions on a filesystem such as
187 * inconsistencies detected or read IO failures.
188 *
189 * On ext2, we can store the error state of the filesystem in the
190 * superblock. That is not possible on ext4, because we may have other
191 * write ordering constraints on the superblock which prevent us from
192 * writing it out straight away; and given that the journal is about to
193 * be aborted, we can't rely on the current, or future, transactions to
194 * write out the superblock safely.
195 *
196 * We'll just use the jbd2_journal_abort() error code to record an error in
197 * the journal instead. On recovery, the journal will complain about
198 * that error until we've noted it down and cleared it.
199 */
200
201static void ext4_handle_error(struct super_block *sb)
202{
203 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
204
205 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
206 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
207
208 if (sb->s_flags & MS_RDONLY)
209 return;
210
211 if (!test_opt (sb, ERRORS_CONT)) {
212 journal_t *journal = EXT4_SB(sb)->s_journal;
213
214 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
215 if (journal)
216 jbd2_journal_abort(journal, -EIO);
217 }
218 if (test_opt (sb, ERRORS_RO)) {
219 printk (KERN_CRIT "Remounting filesystem read-only\n");
220 sb->s_flags |= MS_RDONLY;
221 }
222 ext4_commit_super(sb, es, 1);
223 if (test_opt(sb, ERRORS_PANIC))
224 panic("EXT4-fs (device %s): panic forced after error\n",
225 sb->s_id);
226}
227
228void ext4_error (struct super_block * sb, const char * function,
229 const char * fmt, ...)
230{
231 va_list args;
232
233 va_start(args, fmt);
234	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
235 vprintk(fmt, args);
236 printk("\n");
237 va_end(args);
238
239 ext4_handle_error(sb);
240}
241
242static const char *ext4_decode_error(struct super_block * sb, int errno,
243 char nbuf[16])
244{
245 char *errstr = NULL;
246
247 switch (errno) {
248 case -EIO:
249 errstr = "IO failure";
250 break;
251 case -ENOMEM:
252 errstr = "Out of memory";
253 break;
254 case -EROFS:
255 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
256 errstr = "Journal has aborted";
257 else
258 errstr = "Readonly filesystem";
259 break;
260 default:
261 /* If the caller passed in an extra buffer for unknown
262 * errors, textualise them now. Else we just return
263 * NULL. */
264 if (nbuf) {
265 /* Check for truncated error codes... */
266 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
267 errstr = nbuf;
268 }
269 break;
270 }
271
272 return errstr;
273}
274
275/* __ext4_std_error decodes expected errors from journaling functions
276 * automatically and invokes the appropriate error response. */
277
278void __ext4_std_error (struct super_block * sb, const char * function,
279 int errno)
280{
281 char nbuf[16];
282 const char *errstr;
283
284 /* Special case: if the error is EROFS, and we're not already
285 * inside a transaction, then there's really no point in logging
286 * an error. */
287 if (errno == -EROFS && journal_current_handle() == NULL &&
288 (sb->s_flags & MS_RDONLY))
289 return;
290
291 errstr = ext4_decode_error(sb, errno, nbuf);
292 printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
293 sb->s_id, function, errstr);
294
295 ext4_handle_error(sb);
296}
297
298/*
299 * ext4_abort is a much stronger failure handler than ext4_error. The
300 * abort function may be used to deal with unrecoverable failures such
301 * as journal IO errors or ENOMEM at a critical moment in log management.
302 *
303 * We unconditionally force the filesystem into an ABORT|READONLY state,
304 * unless the error response on the fs has been set to panic in which
305 * case we take the easy way out and panic immediately.
306 */
307
308void ext4_abort (struct super_block * sb, const char * function,
309 const char * fmt, ...)
310{
311 va_list args;
312
313 printk (KERN_CRIT "ext4_abort called.\n");
314
315 va_start(args, fmt);
316	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
317 vprintk(fmt, args);
318 printk("\n");
319 va_end(args);
320
321 if (test_opt(sb, ERRORS_PANIC))
322 panic("EXT4-fs panic from previous error\n");
323
324 if (sb->s_flags & MS_RDONLY)
325 return;
326
327 printk(KERN_CRIT "Remounting filesystem read-only\n");
328 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
329 sb->s_flags |= MS_RDONLY;
330 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
331 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
332}
333
334void ext4_warning (struct super_block * sb, const char * function,
335 const char * fmt, ...)
336{
337 va_list args;
338
339 va_start(args, fmt);
340 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
341 sb->s_id, function);
342 vprintk(fmt, args);
343 printk("\n");
344 va_end(args);
345}
346
347void ext4_update_dynamic_rev(struct super_block *sb)
348{
349 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
350
351 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
352 return;
353
354 ext4_warning(sb, __FUNCTION__,
355 "updating to rev %d because of new feature flag, "
356 "running e2fsck is recommended",
357 EXT4_DYNAMIC_REV);
358
359 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
360 es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
361 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
362 /* leave es->s_feature_*compat flags alone */
363 /* es->s_uuid will be set by e2fsck if empty */
364
365 /*
366 * The rest of the superblock fields should be zero, and if not it
367 * means they are likely already in use, so leave them alone. We
368 * can leave it up to e2fsck to clean up any inconsistencies there.
369 */
370}
371
372/*
373 * Open the external journal device
374 */
375static struct block_device *ext4_blkdev_get(dev_t dev)
376{
377 struct block_device *bdev;
378 char b[BDEVNAME_SIZE];
379
380 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
381 if (IS_ERR(bdev))
382 goto fail;
383 return bdev;
384
385fail:
386 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
387 __bdevname(dev, b), PTR_ERR(bdev));
388 return NULL;
389}
390
391/*
392 * Release the journal device
393 */
394static int ext4_blkdev_put(struct block_device *bdev)
395{
396 bd_release(bdev);
397 return blkdev_put(bdev);
398}
399
400static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
401{
402 struct block_device *bdev;
403 int ret = -ENODEV;
404
405 bdev = sbi->journal_bdev;
406 if (bdev) {
407 ret = ext4_blkdev_put(bdev);
408 sbi->journal_bdev = NULL;
409 }
410 return ret;
411}
412
413static inline struct inode *orphan_list_entry(struct list_head *l)
414{
415 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
416}
417
418static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
419{
420 struct list_head *l;
421
422 printk(KERN_ERR "sb orphan head is %d\n",
423 le32_to_cpu(sbi->s_es->s_last_orphan));
424
425 printk(KERN_ERR "sb_info orphan list:\n");
426 list_for_each(l, &sbi->s_orphan) {
427 struct inode *inode = orphan_list_entry(l);
428 printk(KERN_ERR " "
429 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
430 inode->i_sb->s_id, inode->i_ino, inode,
431 inode->i_mode, inode->i_nlink,
432 NEXT_ORPHAN(inode));
433 }
434}
435
436static void ext4_put_super (struct super_block * sb)
437{
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 struct ext4_super_block *es = sbi->s_es;
440 int i;
441
442 ext4_ext_release(sb);
443 ext4_xattr_put_super(sb);
444 jbd2_journal_destroy(sbi->s_journal);
445 if (!(sb->s_flags & MS_RDONLY)) {
446 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
447 es->s_state = cpu_to_le16(sbi->s_mount_state);
448 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
449 mark_buffer_dirty(sbi->s_sbh);
450 ext4_commit_super(sb, es, 1);
451 }
452
453 for (i = 0; i < sbi->s_gdb_count; i++)
454 brelse(sbi->s_group_desc[i]);
455 kfree(sbi->s_group_desc);
456 percpu_counter_destroy(&sbi->s_freeblocks_counter);
457 percpu_counter_destroy(&sbi->s_freeinodes_counter);
458 percpu_counter_destroy(&sbi->s_dirs_counter);
459 brelse(sbi->s_sbh);
460#ifdef CONFIG_QUOTA
461 for (i = 0; i < MAXQUOTAS; i++)
462 kfree(sbi->s_qf_names[i]);
463#endif
464
465 /* Debugging code just in case the in-memory inode orphan list
466 * isn't empty. The on-disk one can be non-empty if we've
467 * detected an error and taken the fs readonly, but the
468 * in-memory list had better be clean by this point. */
469 if (!list_empty(&sbi->s_orphan))
470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan));
472
473 invalidate_bdev(sb->s_bdev, 0);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /*
476 * Invalidate the journal device's buffers. We don't want them
477 * floating about in memory - the physical journal device may
478		 * be hotswapped, and it breaks the `ro-after' testing code.
479 */
480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0);
482 ext4_blkdev_remove(sbi);
483 }
484 sb->s_fs_info = NULL;
485 kfree(sbi);
486 return;
487}
488
489static kmem_cache_t *ext4_inode_cachep;
490
491/*
492 * Called inside transaction, so use GFP_NOFS
493 */
494static struct inode *ext4_alloc_inode(struct super_block *sb)
495{
496 struct ext4_inode_info *ei;
497
498 ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
499 if (!ei)
500 return NULL;
501#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
502 ei->i_acl = EXT4_ACL_NOT_CACHED;
503 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
504#endif
505 ei->i_block_alloc_info = NULL;
506 ei->vfs_inode.i_version = 1;
507 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
508 return &ei->vfs_inode;
509}
510
511static void ext4_destroy_inode(struct inode *inode)
512{
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514}
515
516static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem);
525#endif
526 mutex_init(&ei->truncate_mutex);
527 inode_init_once(&ei->vfs_inode);
528 }
529}
530
531static int init_inodecache(void)
532{
533 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
534 sizeof(struct ext4_inode_info),
535 0, (SLAB_RECLAIM_ACCOUNT|
536 SLAB_MEM_SPREAD),
537 init_once, NULL);
538 if (ext4_inode_cachep == NULL)
539 return -ENOMEM;
540 return 0;
541}
542
543static void destroy_inodecache(void)
544{
545 kmem_cache_destroy(ext4_inode_cachep);
546}
547
548static void ext4_clear_inode(struct inode *inode)
549{
550 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
551#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
552 if (EXT4_I(inode)->i_acl &&
553 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
554 posix_acl_release(EXT4_I(inode)->i_acl);
555 EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
556 }
557 if (EXT4_I(inode)->i_default_acl &&
558 EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
559 posix_acl_release(EXT4_I(inode)->i_default_acl);
560 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
561 }
562#endif
563 ext4_discard_reservation(inode);
564 EXT4_I(inode)->i_block_alloc_info = NULL;
565 if (unlikely(rsv))
566 kfree(rsv);
567}
568
569static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
570{
571#if defined(CONFIG_QUOTA)
572 struct ext4_sb_info *sbi = EXT4_SB(sb);
573
574 if (sbi->s_jquota_fmt)
575 seq_printf(seq, ",jqfmt=%s",
576 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
577
578 if (sbi->s_qf_names[USRQUOTA])
579 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
580
581 if (sbi->s_qf_names[GRPQUOTA])
582 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
583
584 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
585 seq_puts(seq, ",usrquota");
586
587 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
588 seq_puts(seq, ",grpquota");
589#endif
590}
591
592static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
593{
594 struct super_block *sb = vfs->mnt_sb;
595
596 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
597 seq_puts(seq, ",data=journal");
598 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
599 seq_puts(seq, ",data=ordered");
600 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
601 seq_puts(seq, ",data=writeback");
602
603 ext4_show_quota_options(seq, sb);
604
605 return 0;
606}
607
608
609static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
610{
611 __u32 *objp = vobjp;
612 unsigned long ino = objp[0];
613 __u32 generation = objp[1];
614 struct inode *inode;
615 struct dentry *result;
616
617 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
618 return ERR_PTR(-ESTALE);
619 if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
620 return ERR_PTR(-ESTALE);
621
622 /* iget isn't really right if the inode is currently unallocated!!
623 *
624 * ext4_read_inode will return a bad_inode if the inode had been
625 * deleted, so we should be safe.
626 *
627 * Currently we don't know the generation for parent directory, so
628 * a generation of 0 means "accept any"
629 */
630 inode = iget(sb, ino);
631 if (inode == NULL)
632 return ERR_PTR(-ENOMEM);
633 if (is_bad_inode(inode) ||
634 (generation && inode->i_generation != generation)) {
635 iput(inode);
636 return ERR_PTR(-ESTALE);
637 }
638 /* now to find a dentry.
639 * If possible, get a well-connected one
640 */
641 result = d_alloc_anon(inode);
642 if (!result) {
643 iput(inode);
644 return ERR_PTR(-ENOMEM);
645 }
646 return result;
647}
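
/*
 * For reference: vobjp is the decoded NFS file handle, laid out as
 * { inode number, generation }. A handle of { 12, 0 }, for instance,
 * means "inode 12, any generation" -- the generation-0 wildcard the
 * comment above describes for parent directories.
 */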
648
649#ifdef CONFIG_QUOTA
650#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
651#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
652
653static int ext4_dquot_initialize(struct inode *inode, int type);
654static int ext4_dquot_drop(struct inode *inode);
655static int ext4_write_dquot(struct dquot *dquot);
656static int ext4_acquire_dquot(struct dquot *dquot);
657static int ext4_release_dquot(struct dquot *dquot);
658static int ext4_mark_dquot_dirty(struct dquot *dquot);
659static int ext4_write_info(struct super_block *sb, int type);
660static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
661static int ext4_quota_on_mount(struct super_block *sb, int type);
662static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
663 size_t len, loff_t off);
664static ssize_t ext4_quota_write(struct super_block *sb, int type,
665 const char *data, size_t len, loff_t off);
666
667static struct dquot_operations ext4_quota_operations = {
668 .initialize = ext4_dquot_initialize,
669 .drop = ext4_dquot_drop,
670 .alloc_space = dquot_alloc_space,
671 .alloc_inode = dquot_alloc_inode,
672 .free_space = dquot_free_space,
673 .free_inode = dquot_free_inode,
674 .transfer = dquot_transfer,
675 .write_dquot = ext4_write_dquot,
676 .acquire_dquot = ext4_acquire_dquot,
677 .release_dquot = ext4_release_dquot,
678 .mark_dirty = ext4_mark_dquot_dirty,
679 .write_info = ext4_write_info
680};
681
682static struct quotactl_ops ext4_qctl_operations = {
683 .quota_on = ext4_quota_on,
684 .quota_off = vfs_quota_off,
685 .quota_sync = vfs_quota_sync,
686 .get_info = vfs_get_dqinfo,
687 .set_info = vfs_set_dqinfo,
688 .get_dqblk = vfs_get_dqblk,
689 .set_dqblk = vfs_set_dqblk
690};
691#endif
692
693static struct super_operations ext4_sops = {
694 .alloc_inode = ext4_alloc_inode,
695 .destroy_inode = ext4_destroy_inode,
696 .read_inode = ext4_read_inode,
697 .write_inode = ext4_write_inode,
698 .dirty_inode = ext4_dirty_inode,
699 .delete_inode = ext4_delete_inode,
700 .put_super = ext4_put_super,
701 .write_super = ext4_write_super,
702 .sync_fs = ext4_sync_fs,
703 .write_super_lockfs = ext4_write_super_lockfs,
704 .unlockfs = ext4_unlockfs,
705 .statfs = ext4_statfs,
706 .remount_fs = ext4_remount,
707 .clear_inode = ext4_clear_inode,
708 .show_options = ext4_show_options,
709#ifdef CONFIG_QUOTA
710 .quota_read = ext4_quota_read,
711 .quota_write = ext4_quota_write,
712#endif
713};
714
715static struct export_operations ext4_export_ops = {
716 .get_parent = ext4_get_parent,
717 .get_dentry = ext4_get_dentry,
718};
719
720enum {
721 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
722 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
723 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
724 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
725 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
726 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
727 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
728 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
729 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
730 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
731 Opt_grpquota, Opt_extents,
732};
733
734static match_table_t tokens = {
735 {Opt_bsd_df, "bsddf"},
736 {Opt_minix_df, "minixdf"},
737 {Opt_grpid, "grpid"},
738 {Opt_grpid, "bsdgroups"},
739 {Opt_nogrpid, "nogrpid"},
740 {Opt_nogrpid, "sysvgroups"},
741 {Opt_resgid, "resgid=%u"},
742 {Opt_resuid, "resuid=%u"},
743 {Opt_sb, "sb=%u"},
744 {Opt_err_cont, "errors=continue"},
745 {Opt_err_panic, "errors=panic"},
746 {Opt_err_ro, "errors=remount-ro"},
747 {Opt_nouid32, "nouid32"},
748 {Opt_nocheck, "nocheck"},
749 {Opt_nocheck, "check=none"},
750 {Opt_debug, "debug"},
751 {Opt_oldalloc, "oldalloc"},
752 {Opt_orlov, "orlov"},
753 {Opt_user_xattr, "user_xattr"},
754 {Opt_nouser_xattr, "nouser_xattr"},
755 {Opt_acl, "acl"},
756 {Opt_noacl, "noacl"},
757 {Opt_reservation, "reservation"},
758 {Opt_noreservation, "noreservation"},
759 {Opt_noload, "noload"},
760 {Opt_nobh, "nobh"},
761 {Opt_bh, "bh"},
762 {Opt_commit, "commit=%u"},
763 {Opt_journal_update, "journal=update"},
764 {Opt_journal_inum, "journal=%u"},
765 {Opt_journal_dev, "journal_dev=%u"},
766 {Opt_abort, "abort"},
767 {Opt_data_journal, "data=journal"},
768 {Opt_data_ordered, "data=ordered"},
769 {Opt_data_writeback, "data=writeback"},
770 {Opt_offusrjquota, "usrjquota="},
771 {Opt_usrjquota, "usrjquota=%s"},
772 {Opt_offgrpjquota, "grpjquota="},
773 {Opt_grpjquota, "grpjquota=%s"},
774 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
775 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
776 {Opt_grpquota, "grpquota"},
777 {Opt_noquota, "noquota"},
778 {Opt_quota, "quota"},
779 {Opt_usrquota, "usrquota"},
780 {Opt_barrier, "barrier=%u"},
781 {Opt_extents, "extents"},
782	{Opt_resize, "resize"},
783	{Opt_err, NULL},
784};
785
786static ext4_fsblk_t get_sb_block(void **data)
787{
788 ext4_fsblk_t sb_block;
789 char *options = (char *) *data;
790
791 if (!options || strncmp(options, "sb=", 3) != 0)
792 return 1; /* Default location */
793 options += 3;
794	/* todo: use simple_strtoll with >32bit ext4 */
795 sb_block = simple_strtoul(options, &options, 0);
796 if (*options && *options != ',') {
797 printk("EXT4-fs: Invalid sb specification: %s\n",
798 (char *) *data);
799 return 1;
800 }
801 if (*options == ',')
802 options++;
803 *data = (void *) options;
804 return sb_block;
805}
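
/*
 * Illustrative call: for *data pointing at "sb=32768,errors=panic"
 * this returns 32768 (a typical backup superblock location on a
 * 4KB-block filesystem) and leaves *data at "errors=panic" for
 * parse_options() to consume.
 */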
806
807static int parse_options (char *options, struct super_block *sb,
808 unsigned int *inum, unsigned long *journal_devnum,
809 ext4_fsblk_t *n_blocks_count, int is_remount)
810{
811 struct ext4_sb_info *sbi = EXT4_SB(sb);
812 char * p;
813 substring_t args[MAX_OPT_ARGS];
814 int data_opt = 0;
815 int option;
816#ifdef CONFIG_QUOTA
817 int qtype;
818 char *qname;
819#endif
820
821 if (!options)
822 return 1;
823
824 while ((p = strsep (&options, ",")) != NULL) {
825 int token;
826 if (!*p)
827 continue;
828
829 token = match_token(p, tokens, args);
830 switch (token) {
831 case Opt_bsd_df:
832 clear_opt (sbi->s_mount_opt, MINIX_DF);
833 break;
834 case Opt_minix_df:
835 set_opt (sbi->s_mount_opt, MINIX_DF);
836 break;
837 case Opt_grpid:
838 set_opt (sbi->s_mount_opt, GRPID);
839 break;
840 case Opt_nogrpid:
841 clear_opt (sbi->s_mount_opt, GRPID);
842 break;
843 case Opt_resuid:
844 if (match_int(&args[0], &option))
845 return 0;
846 sbi->s_resuid = option;
847 break;
848 case Opt_resgid:
849 if (match_int(&args[0], &option))
850 return 0;
851 sbi->s_resgid = option;
852 break;
853 case Opt_sb:
854 /* handled by get_sb_block() instead of here */
855 /* *sb_block = match_int(&args[0]); */
856 break;
857 case Opt_err_panic:
858 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
859 clear_opt (sbi->s_mount_opt, ERRORS_RO);
860 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
861 break;
862 case Opt_err_ro:
863 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
864 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
865 set_opt (sbi->s_mount_opt, ERRORS_RO);
866 break;
867 case Opt_err_cont:
868 clear_opt (sbi->s_mount_opt, ERRORS_RO);
869 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
870 set_opt (sbi->s_mount_opt, ERRORS_CONT);
871 break;
872 case Opt_nouid32:
873 set_opt (sbi->s_mount_opt, NO_UID32);
874 break;
875 case Opt_nocheck:
876 clear_opt (sbi->s_mount_opt, CHECK);
877 break;
878 case Opt_debug:
879 set_opt (sbi->s_mount_opt, DEBUG);
880 break;
881 case Opt_oldalloc:
882 set_opt (sbi->s_mount_opt, OLDALLOC);
883 break;
884 case Opt_orlov:
885 clear_opt (sbi->s_mount_opt, OLDALLOC);
886 break;
887#ifdef CONFIG_EXT4DEV_FS_XATTR
888 case Opt_user_xattr:
889 set_opt (sbi->s_mount_opt, XATTR_USER);
890 break;
891 case Opt_nouser_xattr:
892 clear_opt (sbi->s_mount_opt, XATTR_USER);
893 break;
894#else
895 case Opt_user_xattr:
896 case Opt_nouser_xattr:
897 printk("EXT4 (no)user_xattr options not supported\n");
898 break;
899#endif
900#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
901 case Opt_acl:
902 set_opt(sbi->s_mount_opt, POSIX_ACL);
903 break;
904 case Opt_noacl:
905 clear_opt(sbi->s_mount_opt, POSIX_ACL);
906 break;
907#else
908 case Opt_acl:
909 case Opt_noacl:
910 printk("EXT4 (no)acl options not supported\n");
911 break;
912#endif
913 case Opt_reservation:
914 set_opt(sbi->s_mount_opt, RESERVATION);
915 break;
916 case Opt_noreservation:
917 clear_opt(sbi->s_mount_opt, RESERVATION);
918 break;
919 case Opt_journal_update:
920 /* @@@ FIXME */
921 /* Eventually we will want to be able to create
922 a journal file here. For now, only allow the
923 user to specify an existing inode to be the
924 journal file. */
925 if (is_remount) {
926 printk(KERN_ERR "EXT4-fs: cannot specify "
927 "journal on remount\n");
928 return 0;
929 }
930 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
931 break;
932 case Opt_journal_inum:
933 if (is_remount) {
934 printk(KERN_ERR "EXT4-fs: cannot specify "
935 "journal on remount\n");
936 return 0;
937 }
938 if (match_int(&args[0], &option))
939 return 0;
940 *inum = option;
941 break;
942 case Opt_journal_dev:
943 if (is_remount) {
944 printk(KERN_ERR "EXT4-fs: cannot specify "
945 "journal on remount\n");
946 return 0;
947 }
948 if (match_int(&args[0], &option))
949 return 0;
950 *journal_devnum = option;
951 break;
952 case Opt_noload:
953 set_opt (sbi->s_mount_opt, NOLOAD);
954 break;
955 case Opt_commit:
956 if (match_int(&args[0], &option))
957 return 0;
958 if (option < 0)
959 return 0;
960 if (option == 0)
961 option = JBD_DEFAULT_MAX_COMMIT_AGE;
962 sbi->s_commit_interval = HZ * option;
963 break;
964 case Opt_data_journal:
965 data_opt = EXT4_MOUNT_JOURNAL_DATA;
966 goto datacheck;
967 case Opt_data_ordered:
968 data_opt = EXT4_MOUNT_ORDERED_DATA;
969 goto datacheck;
970 case Opt_data_writeback:
971 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
972 datacheck:
973 if (is_remount) {
974 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
975 != data_opt) {
976 printk(KERN_ERR
977 "EXT4-fs: cannot change data "
978 "mode on remount\n");
979 return 0;
980 }
981 } else {
982 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
983 sbi->s_mount_opt |= data_opt;
984 }
985 break;
986#ifdef CONFIG_QUOTA
987 case Opt_usrjquota:
988 qtype = USRQUOTA;
989 goto set_qf_name;
990 case Opt_grpjquota:
991 qtype = GRPQUOTA;
992set_qf_name:
993 if (sb_any_quota_enabled(sb)) {
994 printk(KERN_ERR
995 "EXT4-fs: Cannot change journalled "
996 "quota options when quota turned on.\n");
997 return 0;
998 }
999 qname = match_strdup(&args[0]);
1000 if (!qname) {
1001 printk(KERN_ERR
1002 "EXT4-fs: not enough memory for "
1003 "storing quotafile name.\n");
1004 return 0;
1005 }
1006 if (sbi->s_qf_names[qtype] &&
1007 strcmp(sbi->s_qf_names[qtype], qname)) {
1008 printk(KERN_ERR
1009 "EXT4-fs: %s quota file already "
1010 "specified.\n", QTYPE2NAME(qtype));
1011 kfree(qname);
1012 return 0;
1013 }
1014 sbi->s_qf_names[qtype] = qname;
1015 if (strchr(sbi->s_qf_names[qtype], '/')) {
1016 printk(KERN_ERR
1017 "EXT4-fs: quotafile must be on "
1018 "filesystem root.\n");
1019 kfree(sbi->s_qf_names[qtype]);
1020 sbi->s_qf_names[qtype] = NULL;
1021 return 0;
1022 }
1023 set_opt(sbi->s_mount_opt, QUOTA);
1024 break;
1025 case Opt_offusrjquota:
1026 qtype = USRQUOTA;
1027 goto clear_qf_name;
1028 case Opt_offgrpjquota:
1029 qtype = GRPQUOTA;
1030clear_qf_name:
1031 if (sb_any_quota_enabled(sb)) {
1032 printk(KERN_ERR "EXT4-fs: Cannot change "
1033 "journalled quota options when "
1034 "quota turned on.\n");
1035 return 0;
1036 }
1037 /*
1038 * The space will be released later when all options
1039 * are confirmed to be correct
1040 */
1041 sbi->s_qf_names[qtype] = NULL;
1042 break;
1043 case Opt_jqfmt_vfsold:
1044 sbi->s_jquota_fmt = QFMT_VFS_OLD;
1045 break;
1046 case Opt_jqfmt_vfsv0:
1047 sbi->s_jquota_fmt = QFMT_VFS_V0;
1048 break;
1049 case Opt_quota:
1050 case Opt_usrquota:
1051 set_opt(sbi->s_mount_opt, QUOTA);
1052 set_opt(sbi->s_mount_opt, USRQUOTA);
1053 break;
1054 case Opt_grpquota:
1055 set_opt(sbi->s_mount_opt, QUOTA);
1056 set_opt(sbi->s_mount_opt, GRPQUOTA);
1057 break;
1058 case Opt_noquota:
1059 if (sb_any_quota_enabled(sb)) {
1060 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1061 "options when quota turned on.\n");
1062 return 0;
1063 }
1064 clear_opt(sbi->s_mount_opt, QUOTA);
1065 clear_opt(sbi->s_mount_opt, USRQUOTA);
1066 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1067 break;
1068#else
1069 case Opt_quota:
1070 case Opt_usrquota:
1071 case Opt_grpquota:
1072 case Opt_usrjquota:
1073 case Opt_grpjquota:
1074 case Opt_offusrjquota:
1075 case Opt_offgrpjquota:
1076 case Opt_jqfmt_vfsold:
1077 case Opt_jqfmt_vfsv0:
1078 printk(KERN_ERR
1079 "EXT4-fs: journalled quota options not "
1080 "supported.\n");
1081 break;
1082 case Opt_noquota:
1083 break;
1084#endif
1085 case Opt_abort:
1086 set_opt(sbi->s_mount_opt, ABORT);
1087 break;
1088 case Opt_barrier:
1089 if (match_int(&args[0], &option))
1090 return 0;
1091 if (option)
1092 set_opt(sbi->s_mount_opt, BARRIER);
1093 else
1094 clear_opt(sbi->s_mount_opt, BARRIER);
1095 break;
1096 case Opt_ignore:
1097 break;
1098 case Opt_resize:
1099 if (!is_remount) {
1100 printk("EXT4-fs: resize option only available "
1101 "for remount\n");
1102 return 0;
1103 }
1104 if (match_int(&args[0], &option) != 0)
1105 return 0;
1106 *n_blocks_count = option;
1107 break;
1108 case Opt_nobh:
1109 set_opt(sbi->s_mount_opt, NOBH);
1110 break;
1111 case Opt_bh:
1112 clear_opt(sbi->s_mount_opt, NOBH);
1113 break;
1114 case Opt_extents:
1115 set_opt (sbi->s_mount_opt, EXTENTS);
1116 break;
1117 default:
1118 printk (KERN_ERR
1119 "EXT4-fs: Unrecognized mount option \"%s\" "
1120 "or missing value\n", p);
1121 return 0;
1122 }
1123 }
1124#ifdef CONFIG_QUOTA
1125 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1126 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
1127 sbi->s_qf_names[USRQUOTA])
1128 clear_opt(sbi->s_mount_opt, USRQUOTA);
1129
1130 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
1131 sbi->s_qf_names[GRPQUOTA])
1132 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1133
1134 if ((sbi->s_qf_names[USRQUOTA] &&
1135 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1136 (sbi->s_qf_names[GRPQUOTA] &&
1137 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1138 printk(KERN_ERR "EXT4-fs: old and new quota "
1139 "format mixing.\n");
1140 return 0;
1141 }
1142
1143 if (!sbi->s_jquota_fmt) {
1144 printk(KERN_ERR "EXT4-fs: journalled quota format "
1145 "not specified.\n");
1146 return 0;
1147 }
1148 } else {
1149 if (sbi->s_jquota_fmt) {
1150 printk(KERN_ERR "EXT4-fs: journalled quota format "
1151 "specified with no journalling "
1152 "enabled.\n");
1153 return 0;
1154 }
1155 }
1156#endif
1157 return 1;
1158}
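
/*
 * Illustrative walk-through: an option string such as
 * "data=ordered,errors=remount-ro,commit=15" takes three trips through
 * the strsep() loop above. data=ordered reaches the datacheck label
 * and sets EXT4_MOUNT_ORDERED_DATA, errors=remount-ro clears
 * ERRORS_CONT/ERRORS_PANIC and sets ERRORS_RO, and commit=15 sets
 * s_commit_interval to 15 * HZ.
 */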
1159
1160static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1161 int read_only)
1162{
1163 struct ext4_sb_info *sbi = EXT4_SB(sb);
1164 int res = 0;
1165
1166 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1167 printk (KERN_ERR "EXT4-fs warning: revision level too high, "
1168 "forcing read-only mode\n");
1169 res = MS_RDONLY;
1170 }
1171 if (read_only)
1172 return res;
1173 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1174 printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
1175 "running e2fsck is recommended\n");
1176 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1177 printk (KERN_WARNING
1178 "EXT4-fs warning: mounting fs with errors, "
1179 "running e2fsck is recommended\n");
1180 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1181 le16_to_cpu(es->s_mnt_count) >=
1182 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1183 printk (KERN_WARNING
1184 "EXT4-fs warning: maximal mount count reached, "
1185 "running e2fsck is recommended\n");
1186 else if (le32_to_cpu(es->s_checkinterval) &&
1187 (le32_to_cpu(es->s_lastcheck) +
1188 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1189 printk (KERN_WARNING
1190 "EXT4-fs warning: checktime reached, "
1191 "running e2fsck is recommended\n");
1192#if 0
1193 /* @@@ We _will_ want to clear the valid bit if we find
1194 * inconsistencies, to force a fsck at reboot. But for
1195 * a plain journaled filesystem we can keep it set as
1196 * valid forever! :)
1197 */
1198 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
1199#endif
1200 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1201 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1202	es->s_mnt_count = cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
1203 es->s_mtime = cpu_to_le32(get_seconds());
1204 ext4_update_dynamic_rev(sb);
1205 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1206
1207 ext4_commit_super(sb, es, 1);
1208 if (test_opt(sb, DEBUG))
1209 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
1210 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1211 sb->s_blocksize,
1212 sbi->s_groups_count,
1213 EXT4_BLOCKS_PER_GROUP(sb),
1214 EXT4_INODES_PER_GROUP(sb),
1215 sbi->s_mount_opt);
1216
1217 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
1218 if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
1219 char b[BDEVNAME_SIZE];
1220
1221 printk("external journal on %s\n",
1222 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1223 } else {
1224 printk("internal journal\n");
1225 }
1226 return res;
1227}
1228
1229/* Called at mount-time, super-block is locked */
1230static int ext4_check_descriptors (struct super_block * sb)
1231{
1232 struct ext4_sb_info *sbi = EXT4_SB(sb);
1233 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1234 ext4_fsblk_t last_block;
1235 ext4_fsblk_t block_bitmap;
1236 ext4_fsblk_t inode_bitmap;
1237 ext4_fsblk_t inode_table;
1238 struct ext4_group_desc * gdp = NULL;
1239 int desc_block = 0;
1240 int i;
1241
1242 ext4_debug ("Checking group descriptors");
1243
1244 for (i = 0; i < sbi->s_groups_count; i++)
1245 {
1246 if (i == sbi->s_groups_count - 1)
1247 last_block = ext4_blocks_count(sbi->s_es) - 1;
1248 else
1249 last_block = first_block +
1250 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1251
1252 if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
1253 gdp = (struct ext4_group_desc *)
1254 sbi->s_group_desc[desc_block++]->b_data;
1255 block_bitmap = ext4_block_bitmap(sb, gdp);
1256 if (block_bitmap < first_block || block_bitmap > last_block)
1257 {
1258 ext4_error (sb, "ext4_check_descriptors",
1259 "Block bitmap for group %d"
1260 " not in group (block %llu)!",
1261 i, block_bitmap);
1262 return 0;
1263 }
1264 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1265 if (inode_bitmap < first_block || inode_bitmap > last_block)
1266 {
1267 ext4_error (sb, "ext4_check_descriptors",
1268 "Inode bitmap for group %d"
1269 " not in group (block %llu)!",
1270 i, inode_bitmap);
1271 return 0;
1272 }
1273 inode_table = ext4_inode_table(sb, gdp);
1274 if (inode_table < first_block ||
1275 inode_table + sbi->s_itb_per_group > last_block)
1276 {
1277 ext4_error (sb, "ext4_check_descriptors",
1278 "Inode table for group %d"
1279 " not in group (block %llu)!",
1280 i, inode_table);
1281 return 0;
1282 }
1283 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1284 gdp = (struct ext4_group_desc *)
1285 ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
1286 }
1287
1288 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1289	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
1290 return 1;
1291}
1292
1293
1294/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1295 * the superblock) which were deleted from all directories, but held open by
1296 * a process at the time of a crash. We walk the list and try to delete these
1297 * inodes at recovery time (only with a read-write filesystem).
1298 *
1299 * In order to keep the orphan inode chain consistent during traversal (in
1300 * case of crash during recovery), we link each inode into the superblock
1301 * orphan list_head and handle it the same way as an inode deletion during
1302 * normal operation (which journals the operations for us).
1303 *
1304 * We only do an iget() and an iput() on each inode, which is very safe if we
1305 * accidentally point at an in-use or already deleted inode. The worst that
1306 * can happen in this case is that we get a "bit already cleared" message from
1307 * ext4_free_inode(). The only reason we would point at a wrong inode is if
1308 * e2fsck was run on this filesystem, and it must have already done the orphan
1309 * inode cleanup for us, so we can safely abort without any further action.
1310 */
1311static void ext4_orphan_cleanup (struct super_block * sb,
1312 struct ext4_super_block * es)
1313{
1314 unsigned int s_flags = sb->s_flags;
1315 int nr_orphans = 0, nr_truncates = 0;
1316#ifdef CONFIG_QUOTA
1317 int i;
1318#endif
1319 if (!es->s_last_orphan) {
1320 jbd_debug(4, "no orphan inodes to clean up\n");
1321 return;
1322 }
1323
1324 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
1325 if (es->s_last_orphan)
1326 jbd_debug(1, "Errors on filesystem, "
1327 "clearing orphan list.\n");
1328 es->s_last_orphan = 0;
1329 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1330 return;
1331 }
1332
1333 if (s_flags & MS_RDONLY) {
1334 printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
1335 sb->s_id);
1336 sb->s_flags &= ~MS_RDONLY;
1337 }
1338#ifdef CONFIG_QUOTA
1339 /* Needed for iput() to work correctly and not trash data */
1340 sb->s_flags |= MS_ACTIVE;
1341 /* Turn on quotas so that they are updated correctly */
1342 for (i = 0; i < MAXQUOTAS; i++) {
1343 if (EXT4_SB(sb)->s_qf_names[i]) {
1344 int ret = ext4_quota_on_mount(sb, i);
1345 if (ret < 0)
1346 printk(KERN_ERR
1347 "EXT4-fs: Cannot turn on journalled "
1348 "quota: error %d\n", ret);
1349 }
1350 }
1351#endif
1352
1353 while (es->s_last_orphan) {
1354 struct inode *inode;
1355
1356 if (!(inode =
1357 ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1358 es->s_last_orphan = 0;
1359 break;
1360 }
1361
1362 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1363 DQUOT_INIT(inode);
1364 if (inode->i_nlink) {
1365 printk(KERN_DEBUG
1366 "%s: truncating inode %lu to %Ld bytes\n",
1367 __FUNCTION__, inode->i_ino, inode->i_size);
1368 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1369 inode->i_ino, inode->i_size);
1370 ext4_truncate(inode);
1371 nr_truncates++;
1372 } else {
1373 printk(KERN_DEBUG
1374 "%s: deleting unreferenced inode %lu\n",
1375 __FUNCTION__, inode->i_ino);
1376 jbd_debug(2, "deleting unreferenced inode %lu\n",
1377 inode->i_ino);
1378 nr_orphans++;
1379 }
1380 iput(inode); /* The delete magic happens here! */
1381 }
1382
1383#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1384
1385 if (nr_orphans)
1386 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
1387 sb->s_id, PLURAL(nr_orphans));
1388 if (nr_truncates)
1389 printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
1390 sb->s_id, PLURAL(nr_truncates));
1391#ifdef CONFIG_QUOTA
1392 /* Turn quotas off */
1393 for (i = 0; i < MAXQUOTAS; i++) {
1394 if (sb_dqopt(sb)->files[i])
1395 vfs_quota_off(sb, i);
1396 }
1397#endif
1398 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1399}
1400
1401#define log2(n) ffz(~(n))
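
/*
 * For a power-of-two n this computes log2 by finding the first zero
 * bit of the complement, e.g. log2(4096) == ffz(~4096) == 12; it is
 * not meaningful for non-power-of-two inputs.
 */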
1402
1403/*
1404 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1405 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1406 * We need to be 1 filesystem block less than the 2^32 sector limit.
1407 */
1408static loff_t ext4_max_size(int bits)
1409{
1410 loff_t res = EXT4_NDIR_BLOCKS;
1411 /* This constant is calculated to be the largest file size for a
1412 * dense, 4k-blocksize file such that the total number of
1413 * sectors in the file, including data and all indirect blocks,
1414 * does not exceed 2^32. */
1415 const loff_t upper_limit = 0x1ff7fffd000LL;
1416
1417 res += 1LL << (bits-2);
1418 res += 1LL << (2*(bits-2));
1419 res += 1LL << (3*(bits-2));
1420 res <<= bits;
1421 if (res > upper_limit)
1422 res = upper_limit;
1423 return res;
1424}
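
/*
 * Worked example, assuming 4KB blocks (bits == 12): res starts at 12
 * direct blocks, then gains 2^10 single-, 2^20 double- and 2^30
 * triple-indirect blocks, for 1,074,791,436 blocks (~4TB after the
 * shift by bits). That exceeds upper_limit (0x1ff7fffd000, just under
 * the 2^32 * 512-byte == 2TB sector cap once indirect-block sectors
 * are accounted for), so the result is clamped to roughly 2TB.
 */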
1425
1426static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1427 ext4_fsblk_t logical_sb_block, int nr)
1428{
1429 struct ext4_sb_info *sbi = EXT4_SB(sb);
1430 unsigned long bg, first_meta_bg;
1431 int has_super = 0;
1432
1433 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1434
1435 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
1436 nr < first_meta_bg)
1437 return logical_sb_block + nr + 1;
1438 bg = sbi->s_desc_per_block * nr;
1439 if (ext4_bg_has_super(sb, bg))
1440 has_super = 1;
1441 return (has_super + ext4_group_first_block_no(sb, bg));
1442}
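
/*
 * Illustrative numbers, assuming classic 32-byte descriptors on a
 * 4KB-block filesystem (s_desc_per_block == 128): without META_BG,
 * descriptor block nr == 3 is read from logical_sb_block + 4. With
 * META_BG and first_meta_bg <= 3, that block instead lives at the
 * first block of group 128 * 3 == 384, shifted up by one if that
 * group carries a superblock backup.
 */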
1443
1444
1445static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1446{
1447 struct buffer_head * bh;
1448 struct ext4_super_block *es = NULL;
1449 struct ext4_sb_info *sbi;
1450 ext4_fsblk_t block;
1451 ext4_fsblk_t sb_block = get_sb_block(&data);
1452 ext4_fsblk_t logical_sb_block;
1453 unsigned long offset = 0;
1454 unsigned int journal_inum = 0;
1455 unsigned long journal_devnum = 0;
1456 unsigned long def_mount_opts;
1457 struct inode *root;
1458 int blocksize;
1459 int hblock;
1460 int db_count;
1461 int i;
1462 int needs_recovery;
1463 __le32 features;
1464 __u64 blocks_count;
1465
1466 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1467 if (!sbi)
1468 return -ENOMEM;
1469 sb->s_fs_info = sbi;
1470 sbi->s_mount_opt = 0;
1471 sbi->s_resuid = EXT4_DEF_RESUID;
1472 sbi->s_resgid = EXT4_DEF_RESGID;
1473
1474 unlock_kernel();
1475
1476 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1477 if (!blocksize) {
1478 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
1479 goto out_fail;
1480 }
1481
1482 /*
1483 * The ext4 superblock will not be buffer aligned for other than 1kB
1484 * block sizes. We need to calculate the offset from buffer start.
1485 */
1486 if (blocksize != EXT4_MIN_BLOCK_SIZE) {
1487 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1488 offset = do_div(logical_sb_block, blocksize);
1489 } else {
1490 logical_sb_block = sb_block;
1491 }
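	/*
	 * Worked example (editor's note): the superblock always lives
	 * 1024 bytes into the device, i.e. sb_block units of
	 * EXT4_MIN_BLOCK_SIZE. With the default sb_block = 1:
	 *   blocksize 1024: logical block 1, offset 0 (the else branch)
	 *   blocksize 2048: 1024 / 2048 -> logical block 0, offset 1024
	 *   blocksize 4096: 1024 / 4096 -> logical block 0, offset 1024
	 */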
1492
1493 if (!(bh = sb_bread(sb, logical_sb_block))) {
1494 printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
1495 goto out_fail;
1496 }
1497 /*
1498 * Note: s_es must be initialized as soon as possible because
1499	 * some ext4 macros depend on its value
1500 */
1501 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
1502 sbi->s_es = es;
1503 sb->s_magic = le16_to_cpu(es->s_magic);
1504 if (sb->s_magic != EXT4_SUPER_MAGIC)
1505 goto cantfind_ext4;
1506
1507 /* Set defaults before we parse the mount options */
1508 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1509 if (def_mount_opts & EXT4_DEFM_DEBUG)
1510 set_opt(sbi->s_mount_opt, DEBUG);
1511 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
1512 set_opt(sbi->s_mount_opt, GRPID);
1513 if (def_mount_opts & EXT4_DEFM_UID16)
1514 set_opt(sbi->s_mount_opt, NO_UID32);
1515 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1516 set_opt(sbi->s_mount_opt, XATTR_USER);
1517 if (def_mount_opts & EXT4_DEFM_ACL)
1518 set_opt(sbi->s_mount_opt, POSIX_ACL);
1519 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
1520 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
1521 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
1522 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
1523 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
1524 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
1525
1526 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1527 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1528 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
1529 set_opt(sbi->s_mount_opt, ERRORS_RO);
1530 else
1531 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1532
1533 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1534 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1535
1536 set_opt(sbi->s_mount_opt, RESERVATION);
1537
1538 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1539 NULL, 0))
1540 goto failed_mount;
1541
1542 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1543 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1544
1545 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
1546 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
1547 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1548 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1549 printk(KERN_WARNING
1550 "EXT4-fs warning: feature flags set on rev 0 fs, "
1551 "running e2fsck is recommended\n");
1552 /*
1553 * Check feature flags regardless of the revision level, since we
1554 * previously didn't change the revision level when setting the flags,
1555 * so there is a chance incompat flags are set on a rev 0 filesystem.
1556 */
1557 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
1558 if (features) {
1559 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
1560 "unsupported optional features (%x).\n",
1561 sb->s_id, le32_to_cpu(features));
1562 goto failed_mount;
1563 }
1564 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
1565 if (!(sb->s_flags & MS_RDONLY) && features) {
1566 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
1567 "unsupported optional features (%x).\n",
1568 sb->s_id, le32_to_cpu(features));
1569 goto failed_mount;
1570 }
1571 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1572
1573 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
1574 blocksize > EXT4_MAX_BLOCK_SIZE) {
1575 printk(KERN_ERR
1576 "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
1577 blocksize, sb->s_id);
1578 goto failed_mount;
1579 }
1580
1581 hblock = bdev_hardsect_size(sb->s_bdev);
1582 if (sb->s_blocksize != blocksize) {
1583 /*
1584		 * Make sure the filesystem blocksize is at least as large
1585		 * as the hardware sector size for the machine.
1586 */
1587 if (blocksize < hblock) {
1588 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1589 "device blocksize %d.\n", blocksize, hblock);
1590 goto failed_mount;
1591 }
1592
1593 brelse (bh);
1594 sb_set_blocksize(sb, blocksize);
1595 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1596 offset = do_div(logical_sb_block, blocksize);
1597 bh = sb_bread(sb, logical_sb_block);
1598 if (!bh) {
1599 printk(KERN_ERR
1600 "EXT4-fs: Can't read superblock on 2nd try.\n");
1601 goto failed_mount;
1602 }
1603 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
1604 sbi->s_es = es;
1605 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
1606 printk (KERN_ERR
1607			       "EXT4-fs: Magic mismatch, very weird!\n");
1608 goto failed_mount;
1609 }
1610 }
1611
1612 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1613
1614 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
1615 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
1616 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
1617 } else {
1618 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1619 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1620 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
1621 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1622 (sbi->s_inode_size > blocksize)) {
1623 printk (KERN_ERR
1624 "EXT4-fs: unsupported inode size: %d\n",
1625 sbi->s_inode_size);
1626 goto failed_mount;
1627 }
1628 }
1629 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
1630 le32_to_cpu(es->s_log_frag_size);
1631 if (blocksize != sbi->s_frag_size) {
1632 printk(KERN_ERR
1633 "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
1634 sbi->s_frag_size, blocksize);
1635 goto failed_mount;
1636 }
1637 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
1638 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
1639 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
1640 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
1641 sbi->s_desc_size & (sbi->s_desc_size - 1)) {
1642 printk(KERN_ERR
1643 "EXT4-fs: unsupported descriptor size %lu\n",
1644 sbi->s_desc_size);
1645 goto failed_mount;
1646 }
1647 } else
1648 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
1649 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1650 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1651 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1652 if (EXT4_INODE_SIZE(sb) == 0)
1653 goto cantfind_ext4;
1654 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
1655 if (sbi->s_inodes_per_block == 0)
1656 goto cantfind_ext4;
1657 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1658 sbi->s_inodes_per_block;
1659 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
1660 sbi->s_sbh = bh;
1661 sbi->s_mount_state = le16_to_cpu(es->s_state);
1662 sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
1663 sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
1664 for (i=0; i < 4; i++)
1665 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1666 sbi->s_def_hash_version = es->s_def_hash_version;
1667
1668 if (sbi->s_blocks_per_group > blocksize * 8) {
1669 printk (KERN_ERR
1670 "EXT4-fs: #blocks per group too big: %lu\n",
1671 sbi->s_blocks_per_group);
1672 goto failed_mount;
1673 }
1674 if (sbi->s_frags_per_group > blocksize * 8) {
1675 printk (KERN_ERR
1676 "EXT4-fs: #fragments per group too big: %lu\n",
1677 sbi->s_frags_per_group);
1678 goto failed_mount;
1679 }
1680 if (sbi->s_inodes_per_group > blocksize * 8) {
1681 printk (KERN_ERR
1682 "EXT4-fs: #inodes per group too big: %lu\n",
1683 sbi->s_inodes_per_group);
1684 goto failed_mount;
1685 }
1686
1687 if (ext4_blocks_count(es) >
1688 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1689 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
1690 " too large to mount safely\n", sb->s_id);
1691 if (sizeof(sector_t) < 8)
1692 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
1693 "enabled\n");
1694 goto failed_mount;
1695 }
1696
1697 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1698 goto cantfind_ext4;
1699 blocks_count = (ext4_blocks_count(es) -
1700 le32_to_cpu(es->s_first_data_block) +
1701 EXT4_BLOCKS_PER_GROUP(sb) - 1);
1702 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
1703 sbi->s_groups_count = blocks_count;
1704 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1705 EXT4_DESC_PER_BLOCK(sb);
1706 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1707 GFP_KERNEL);
1708 if (sbi->s_group_desc == NULL) {
1709 printk (KERN_ERR "EXT4-fs: not enough memory\n");
1710 goto failed_mount;
1711 }
1712
1713 bgl_lock_init(&sbi->s_blockgroup_lock);
1714
1715 for (i = 0; i < db_count; i++) {
1716 block = descriptor_loc(sb, logical_sb_block, i);
1717 sbi->s_group_desc[i] = sb_bread(sb, block);
1718 if (!sbi->s_group_desc[i]) {
1719 printk (KERN_ERR "EXT4-fs: "
1720 "can't read group descriptor %d\n", i);
1721 db_count = i;
1722 goto failed_mount2;
1723 }
1724 }
1725 if (!ext4_check_descriptors (sb)) {
1726 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
1727 goto failed_mount2;
1728 }
1729 sbi->s_gdb_count = db_count;
1730 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1731 spin_lock_init(&sbi->s_next_gen_lock);
1732
1733 percpu_counter_init(&sbi->s_freeblocks_counter,
1734 ext4_count_free_blocks(sb));
1735 percpu_counter_init(&sbi->s_freeinodes_counter,
1736 ext4_count_free_inodes(sb));
1737 percpu_counter_init(&sbi->s_dirs_counter,
1738 ext4_count_dirs(sb));
1739
1740	/* per-filesystem reservation list head & lock */
1741 spin_lock_init(&sbi->s_rsv_window_lock);
1742 sbi->s_rsv_window_root = RB_ROOT;
1743 /* Add a single, static dummy reservation to the start of the
1744 * reservation window list --- it gives us a placeholder for
1745 * append-at-start-of-list which makes the allocation logic
1746 * _much_ simpler. */
1747 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1748 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1749 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1750 sbi->s_rsv_window_head.rsv_goal_size = 0;
1751 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
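	/*
	 * Editor's note: the dummy head is the classic sentinel-node
	 * trick: every real window is inserted after some existing
	 * node, so insert-at-front needs no empty-tree special case.
	 */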
1752
1753 /*
1754 * set up enough so that it can read an inode
1755 */
1756 sb->s_op = &ext4_sops;
1757 sb->s_export_op = &ext4_export_ops;
1758 sb->s_xattr = ext4_xattr_handlers;
1759#ifdef CONFIG_QUOTA
1760 sb->s_qcop = &ext4_qctl_operations;
1761 sb->dq_op = &ext4_quota_operations;
1762#endif
1763 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1764
1765 sb->s_root = NULL;
1766
1767 needs_recovery = (es->s_last_orphan != 0 ||
1768 EXT4_HAS_INCOMPAT_FEATURE(sb,
1769 EXT4_FEATURE_INCOMPAT_RECOVER));
1770
1771 /*
1772 * The first inode we look at is the journal inode. Don't try
1773 * root first: it may be modified in the journal!
1774 */
1775 if (!test_opt(sb, NOLOAD) &&
1776 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
1777 if (ext4_load_journal(sb, es, journal_devnum))
1778 goto failed_mount3;
1779 } else if (journal_inum) {
1780 if (ext4_create_journal(sb, es, journal_inum))
1781 goto failed_mount3;
1782 } else {
1783 if (!silent)
1784 printk (KERN_ERR
1785 "ext4: No journal on filesystem on %s\n",
1786 sb->s_id);
1787 goto failed_mount3;
1788 }
1789
1790 /* We have now updated the journal if required, so we can
1791 * validate the data journaling mode. */
1792 switch (test_opt(sb, DATA_FLAGS)) {
1793 case 0:
1794 /* No mode set, assume a default based on the journal
1795 * capabilities: ORDERED_DATA if the journal can
1796 * cope, else JOURNAL_DATA
1797 */
1798 if (jbd2_journal_check_available_features
1799 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
1800 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1801 else
1802 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1803 break;
1804
1805 case EXT4_MOUNT_ORDERED_DATA:
1806 case EXT4_MOUNT_WRITEBACK_DATA:
1807 if (!jbd2_journal_check_available_features
1808 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
1809 printk(KERN_ERR "EXT4-fs: Journal does not support "
1810 "requested data journaling mode\n");
1811 goto failed_mount4;
1812 }
1813 default:
1814 break;
1815 }
1816
1817 if (test_opt(sb, NOBH)) {
1818 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
1819 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
1820			       "it is supported only with writeback mode\n");
1821 clear_opt(sbi->s_mount_opt, NOBH);
1822 }
1823 }
1824 /*
1825 * The jbd2_journal_load will have done any necessary log recovery,
1826 * so we can safely mount the rest of the filesystem now.
1827 */
1828
1829 root = iget(sb, EXT4_ROOT_INO);
1830 sb->s_root = d_alloc_root(root);
1831 if (!sb->s_root) {
1832 printk(KERN_ERR "EXT4-fs: get root inode failed\n");
1833 iput(root);
1834 goto failed_mount4;
1835 }
1836 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1837 dput(sb->s_root);
1838 sb->s_root = NULL;
1839 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
1840 goto failed_mount4;
1841 }
1842
1843 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1844 /*
1845 * akpm: core read_super() calls in here with the superblock locked.
1846 * That deadlocks, because orphan cleanup needs to lock the superblock
1847 * in numerous places. Here we just pop the lock - it's relatively
1848 * harmless, because we are now ready to accept write_super() requests,
1849 * and aviro says that's the only reason for hanging onto the
1850 * superblock lock.
1851 */
1852 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
1853 ext4_orphan_cleanup(sb, es);
1854 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
1855 if (needs_recovery)
1856 printk (KERN_INFO "EXT4-fs: recovery complete.\n");
1857 ext4_mark_recovery_complete(sb, es);
1858 printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
1859 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
1860 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
1861 "writeback");
1862
1863 ext4_ext_init(sb);
1864
1865 lock_kernel();
1866 return 0;
1867
1868cantfind_ext4:
1869 if (!silent)
1870 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
1871 sb->s_id);
1872 goto failed_mount;
1873
1874failed_mount4:
1875 jbd2_journal_destroy(sbi->s_journal);
1876failed_mount3:
1877 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1878 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1879 percpu_counter_destroy(&sbi->s_dirs_counter);
1880failed_mount2:
1881 for (i = 0; i < db_count; i++)
1882 brelse(sbi->s_group_desc[i]);
1883 kfree(sbi->s_group_desc);
1884failed_mount:
1885#ifdef CONFIG_QUOTA
1886 for (i = 0; i < MAXQUOTAS; i++)
1887 kfree(sbi->s_qf_names[i]);
1888#endif
1889 ext4_blkdev_remove(sbi);
1890 brelse(bh);
1891out_fail:
1892 sb->s_fs_info = NULL;
1893 kfree(sbi);
1894 lock_kernel();
1895 return -EINVAL;
1896}
1897
1898/*
1899 * Setup any per-fs journal parameters now. We'll do this both on
1900 * initial mount, once the journal has been initialised but before we've
1901 * done any recovery; and again on any subsequent remount.
1902 */
1903static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
1904{
1905 struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907 if (sbi->s_commit_interval)
1908 journal->j_commit_interval = sbi->s_commit_interval;
1909 /* We could also set up an ext4-specific default for the commit
1910 * interval here, but for now we'll just fall back to the jbd
1911 * default. */
1912
1913 spin_lock(&journal->j_state_lock);
1914 if (test_opt(sb, BARRIER))
1915 journal->j_flags |= JBD2_BARRIER;
1916 else
1917 journal->j_flags &= ~JBD2_BARRIER;
1918 spin_unlock(&journal->j_state_lock);
1919}
1920
1921static journal_t *ext4_get_journal(struct super_block *sb,
1922 unsigned int journal_inum)
1923{
1924 struct inode *journal_inode;
1925 journal_t *journal;
1926
1927 /* First, test for the existence of a valid inode on disk. Bad
1928 * things happen if we iget() an unused inode, as the subsequent
1929 * iput() will try to delete it. */
1930
1931 journal_inode = iget(sb, journal_inum);
1932 if (!journal_inode) {
1933 printk(KERN_ERR "EXT4-fs: no journal found.\n");
1934 return NULL;
1935 }
1936 if (!journal_inode->i_nlink) {
1937 make_bad_inode(journal_inode);
1938 iput(journal_inode);
1939 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
1940 return NULL;
1941 }
1942
1943 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1944 journal_inode, journal_inode->i_size);
1945 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1946 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
1947 iput(journal_inode);
1948 return NULL;
1949 }
1950
1951 journal = jbd2_journal_init_inode(journal_inode);
1952 if (!journal) {
1953 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
1954 iput(journal_inode);
1955 return NULL;
1956 }
1957 journal->j_private = sb;
1958 ext4_init_journal_params(sb, journal);
1959 return journal;
1960}
1961
1962static journal_t *ext4_get_dev_journal(struct super_block *sb,
1963 dev_t j_dev)
1964{
1965 struct buffer_head * bh;
1966 journal_t *journal;
1967 ext4_fsblk_t start;
1968 ext4_fsblk_t len;
1969 int hblock, blocksize;
1970 ext4_fsblk_t sb_block;
1971 unsigned long offset;
1972 struct ext4_super_block * es;
1973 struct block_device *bdev;
1974
1975 bdev = ext4_blkdev_get(j_dev);
1976 if (bdev == NULL)
1977 return NULL;
1978
1979 if (bd_claim(bdev, sb)) {
1980 printk(KERN_ERR
1981 "EXT4: failed to claim external journal device.\n");
1982 blkdev_put(bdev);
1983 return NULL;
1984 }
1985
1986 blocksize = sb->s_blocksize;
1987 hblock = bdev_hardsect_size(bdev);
1988 if (blocksize < hblock) {
1989 printk(KERN_ERR
1990 "EXT4-fs: blocksize too small for journal device.\n");
1991 goto out_bdev;
1992 }
1993
1994 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
1995 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
1996 set_blocksize(bdev, blocksize);
1997 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1998 printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
1999 "external journal\n");
2000 goto out_bdev;
2001 }
2002
2003 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
2004 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2005 !(le32_to_cpu(es->s_feature_incompat) &
2006 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2007 printk(KERN_ERR "EXT4-fs: external journal has "
2008 "bad superblock\n");
2009 brelse(bh);
2010 goto out_bdev;
2011 }
2012
2013 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2014 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
2015 brelse(bh);
2016 goto out_bdev;
2017 }
2018
2019 len = ext4_blocks_count(es);
2020 start = sb_block + 1;
2021 brelse(bh); /* we're done with the superblock */
2022
2023 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
2024 start, len, blocksize);
2025 if (!journal) {
2026 printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
2027 goto out_bdev;
2028 }
2029 journal->j_private = sb;
2030 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2031 wait_on_buffer(journal->j_sb_buffer);
2032 if (!buffer_uptodate(journal->j_sb_buffer)) {
2033 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
2034 goto out_journal;
2035 }
2036 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2037 printk(KERN_ERR "EXT4-fs: External journal has more than one "
2038 "user (unsupported) - %d\n",
2039 be32_to_cpu(journal->j_superblock->s_nr_users));
2040 goto out_journal;
2041 }
2042 EXT4_SB(sb)->journal_bdev = bdev;
2043 ext4_init_journal_params(sb, journal);
2044 return journal;
2045out_journal:
2046 jbd2_journal_destroy(journal);
2047out_bdev:
2048 ext4_blkdev_put(bdev);
2049 return NULL;
2050}
2051
2052static int ext4_load_journal(struct super_block *sb,
2053 struct ext4_super_block *es,
2054 unsigned long journal_devnum)
2055{
2056 journal_t *journal;
2057 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2058 dev_t journal_dev;
2059 int err = 0;
2060 int really_read_only;
2061
2062 if (journal_devnum &&
2063 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2064 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
2065 "numbers have changed\n");
2066 journal_dev = new_decode_dev(journal_devnum);
2067 } else
2068 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2069
2070 really_read_only = bdev_read_only(sb->s_bdev);
2071
2072 /*
2073 * Are we loading a blank journal or performing recovery after a
2074 * crash? For recovery, we need to check in advance whether we
2075 * can get read-write access to the device.
2076 */
2077
2078 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2079 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_INFO "EXT4-fs: INFO: recovery "
2081 "required on readonly filesystem.\n");
2082 if (really_read_only) {
2083 printk(KERN_ERR "EXT4-fs: write access "
2084 "unavailable, cannot proceed.\n");
2085 return -EROFS;
2086 }
2087 printk (KERN_INFO "EXT4-fs: write access will "
2088 "be enabled during recovery.\n");
2089 }
2090 }
2091
2092 if (journal_inum && journal_dev) {
2093 printk(KERN_ERR "EXT4-fs: filesystem has both journal "
2094 "and inode journals!\n");
2095 return -EINVAL;
2096 }
2097
2098 if (journal_inum) {
2099 if (!(journal = ext4_get_journal(sb, journal_inum)))
2100 return -EINVAL;
2101 } else {
2102 if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
2103 return -EINVAL;
2104 }
2105
2106 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2107 err = jbd2_journal_update_format(journal);
2108 if (err) {
2109 printk(KERN_ERR "EXT4-fs: error updating journal.\n");
2110 jbd2_journal_destroy(journal);
2111 return err;
2112 }
2113 }
2114
2115 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
2116 err = jbd2_journal_wipe(journal, !really_read_only);
2117 if (!err)
2118 err = jbd2_journal_load(journal);
2119
2120 if (err) {
2121 printk(KERN_ERR "EXT4-fs: error loading journal.\n");
2122 jbd2_journal_destroy(journal);
2123 return err;
2124 }
2125
2126 EXT4_SB(sb)->s_journal = journal;
2127 ext4_clear_journal_err(sb, es);
2128
2129 if (journal_devnum &&
2130 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2131 es->s_journal_dev = cpu_to_le32(journal_devnum);
2132 sb->s_dirt = 1;
2133
2134 /* Make sure we flush the recovery flag to disk. */
2135 ext4_commit_super(sb, es, 1);
2136 }
2137
2138 return 0;
2139}
2140
2141static int ext4_create_journal(struct super_block * sb,
2142 struct ext4_super_block * es,
2143 unsigned int journal_inum)
2144{
2145 journal_t *journal;
2146
2147 if (sb->s_flags & MS_RDONLY) {
2148 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2149 "create journal.\n");
2150 return -EROFS;
2151 }
2152
2153 if (!(journal = ext4_get_journal(sb, journal_inum)))
2154 return -EINVAL;
2155
2156 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2157 journal_inum);
2158
2159 if (jbd2_journal_create(journal)) {
2160 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2161 jbd2_journal_destroy(journal);
2162 return -EIO;
2163 }
2164
2165 EXT4_SB(sb)->s_journal = journal;
2166
2167 ext4_update_dynamic_rev(sb);
2168 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2169 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2170
2171 es->s_journal_inum = cpu_to_le32(journal_inum);
2172 sb->s_dirt = 1;
2173
2174 /* Make sure we flush the recovery flag to disk. */
2175 ext4_commit_super(sb, es, 1);
2176
2177 return 0;
2178}
2179
2180static void ext4_commit_super (struct super_block * sb,
2181 struct ext4_super_block * es,
2182 int sync)
2183{
2184 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2185
2186 if (!sbh)
2187 return;
2188 es->s_wtime = cpu_to_le32(get_seconds());
2189 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2190 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2191 BUFFER_TRACE(sbh, "marking dirty");
2192 mark_buffer_dirty(sbh);
2193 if (sync)
2194 sync_dirty_buffer(sbh);
2195}
2196
2197
2198/*
2199 * Have we just finished recovery? If so, and if we are mounting (or
2200 * remounting) the filesystem readonly, then we will end up with a
2201 * consistent fs on disk. Record that fact.
2202 */
2203static void ext4_mark_recovery_complete(struct super_block * sb,
2204 struct ext4_super_block * es)
2205{
2206 journal_t *journal = EXT4_SB(sb)->s_journal;
2207
2208 jbd2_journal_lock_updates(journal);
2209 jbd2_journal_flush(journal);
2210 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2211 sb->s_flags & MS_RDONLY) {
2212 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2213 sb->s_dirt = 0;
2214 ext4_commit_super(sb, es, 1);
2215 }
2216 jbd2_journal_unlock_updates(journal);
2217}
2218
2219/*
2220 * If we are mounting (or read-write remounting) a filesystem whose journal
2221 * has recorded an error from a previous lifetime, move that error to the
2222 * main filesystem now.
2223 */
2224static void ext4_clear_journal_err(struct super_block * sb,
2225 struct ext4_super_block * es)
2226{
2227 journal_t *journal;
2228 int j_errno;
2229 const char *errstr;
2230
2231 journal = EXT4_SB(sb)->s_journal;
2232
2233 /*
2234 * Now check for any error status which may have been recorded in the
2235 * journal by a prior ext4_error() or ext4_abort()
2236 */
2237
2238 j_errno = jbd2_journal_errno(journal);
2239 if (j_errno) {
2240 char nbuf[16];
2241
2242 errstr = ext4_decode_error(sb, j_errno, nbuf);
2243 ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
2244 "from previous mount: %s", errstr);
2245 ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
2246 "filesystem check.");
2247
2248 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2249 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2250 ext4_commit_super (sb, es, 1);
2251
2252 jbd2_journal_clear_err(journal);
2253 }
2254}
2255
2256/*
2257 * Force the running and committing transactions to commit,
2258 * and wait on the commit.
2259 */
2260int ext4_force_commit(struct super_block *sb)
2261{
2262 journal_t *journal;
2263 int ret;
2264
2265 if (sb->s_flags & MS_RDONLY)
2266 return 0;
2267
2268 journal = EXT4_SB(sb)->s_journal;
2269 sb->s_dirt = 0;
2270 ret = ext4_journal_force_commit(journal);
2271 return ret;
2272}
2273
2274/*
2275 * Ext4 always journals updates to the superblock itself, so we don't
2276 * have to propagate any other updates to the superblock on disk at this
2277 * point. Just start an async writeback to get the buffers on their way
2278 * to the disk.
2279 *
2280 * This implicitly triggers the writebehind on sync().
2281 */
2282
2283static void ext4_write_super (struct super_block * sb)
2284{
2285 if (mutex_trylock(&sb->s_lock) != 0)
2286 BUG();
2287 sb->s_dirt = 0;
2288}
2289
2290static int ext4_sync_fs(struct super_block *sb, int wait)
2291{
2292 tid_t target;
2293
2294 sb->s_dirt = 0;
2295 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2296 if (wait)
2297 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2298 }
2299 return 0;
2300}
2301
2302/*
2303 * LVM calls this function before a (read-only) snapshot is created. This
2304 * gives us a chance to flush the journal completely and mark the fs clean.
2305 */
2306static void ext4_write_super_lockfs(struct super_block *sb)
2307{
2308 sb->s_dirt = 0;
2309
2310 if (!(sb->s_flags & MS_RDONLY)) {
2311 journal_t *journal = EXT4_SB(sb)->s_journal;
2312
2313 /* Now we set up the journal barrier. */
2314 jbd2_journal_lock_updates(journal);
2315 jbd2_journal_flush(journal);
2316
2317 /* Journal blocked and flushed, clear needs_recovery flag. */
2318 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2319 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2320 }
2321}
2322
2323/*
2324 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2325 * flag here, even though the filesystem is not technically dirty yet.
2326 */
2327static void ext4_unlockfs(struct super_block *sb)
2328{
2329 if (!(sb->s_flags & MS_RDONLY)) {
2330 lock_super(sb);
2331		/* Reset the needs_recovery flag before the fs is unlocked. */
2332 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2333 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2334 unlock_super(sb);
2335 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2336 }
2337}
2338
2339static int ext4_remount (struct super_block * sb, int * flags, char * data)
2340{
2341 struct ext4_super_block * es;
2342 struct ext4_sb_info *sbi = EXT4_SB(sb);
2343 ext4_fsblk_t n_blocks_count = 0;
2344 unsigned long old_sb_flags;
2345 struct ext4_mount_options old_opts;
2346 int err;
2347#ifdef CONFIG_QUOTA
2348 int i;
2349#endif
2350
2351 /* Store the original options */
2352 old_sb_flags = sb->s_flags;
2353 old_opts.s_mount_opt = sbi->s_mount_opt;
2354 old_opts.s_resuid = sbi->s_resuid;
2355 old_opts.s_resgid = sbi->s_resgid;
2356 old_opts.s_commit_interval = sbi->s_commit_interval;
2357#ifdef CONFIG_QUOTA
2358 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2359 for (i = 0; i < MAXQUOTAS; i++)
2360 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2361#endif
2362
2363 /*
2364 * Allow the "check" option to be passed as a remount option.
2365 */
2366 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2367 err = -EINVAL;
2368 goto restore_opts;
2369 }
2370
2371 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
2372 ext4_abort(sb, __FUNCTION__, "Abort forced by user");
2373
2374 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2375 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2376
2377 es = sbi->s_es;
2378
2379 ext4_init_journal_params(sb, sbi->s_journal);
2380
2381 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2382 n_blocks_count > ext4_blocks_count(es)) {
2383 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
2384 err = -EROFS;
2385 goto restore_opts;
2386 }
2387
2388 if (*flags & MS_RDONLY) {
2389 /*
2390 * First of all, the unconditional stuff we have to do
2391 * to disable replay of the journal when we next remount
2392 */
2393 sb->s_flags |= MS_RDONLY;
2394
2395 /*
2396 * OK, test if we are remounting a valid rw partition
2397 * readonly, and if so set the rdonly flag and then
2398 * mark the partition as valid again.
2399 */
2400 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
2401 (sbi->s_mount_state & EXT4_VALID_FS))
2402 es->s_state = cpu_to_le16(sbi->s_mount_state);
2403
2404 ext4_mark_recovery_complete(sb, es);
2405 } else {
2406 __le32 ret;
2407 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2408 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
2409 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
2410 "remount RDWR because of unsupported "
2411 "optional features (%x).\n",
2412 sb->s_id, le32_to_cpu(ret));
2413 err = -EROFS;
2414 goto restore_opts;
2415 }
2416 /*
2417 * Mounting a RDONLY partition read-write, so reread
2418 * and store the current valid flag. (It may have
2419 * been changed by e2fsck since we originally mounted
2420 * the partition.)
2421 */
2422 ext4_clear_journal_err(sb, es);
2423 sbi->s_mount_state = le16_to_cpu(es->s_state);
2424 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
2425 goto restore_opts;
2426 if (!ext4_setup_super (sb, es, 0))
2427 sb->s_flags &= ~MS_RDONLY;
2428 }
2429 }
2430#ifdef CONFIG_QUOTA
2431 /* Release old quota file names */
2432 for (i = 0; i < MAXQUOTAS; i++)
2433 if (old_opts.s_qf_names[i] &&
2434 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2435 kfree(old_opts.s_qf_names[i]);
2436#endif
2437 return 0;
2438restore_opts:
2439 sb->s_flags = old_sb_flags;
2440 sbi->s_mount_opt = old_opts.s_mount_opt;
2441 sbi->s_resuid = old_opts.s_resuid;
2442 sbi->s_resgid = old_opts.s_resgid;
2443 sbi->s_commit_interval = old_opts.s_commit_interval;
2444#ifdef CONFIG_QUOTA
2445 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2446 for (i = 0; i < MAXQUOTAS; i++) {
2447 if (sbi->s_qf_names[i] &&
2448 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2449 kfree(sbi->s_qf_names[i]);
2450 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2451 }
2452#endif
2453 return err;
2454}
2455
2456static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2457{
2458 struct super_block *sb = dentry->d_sb;
2459 struct ext4_sb_info *sbi = EXT4_SB(sb);
2460 struct ext4_super_block *es = sbi->s_es;
2461 ext4_fsblk_t overhead;
2462 int i;
2463
2464 if (test_opt (sb, MINIX_DF))
2465 overhead = 0;
2466 else {
2467 unsigned long ngroups;
2468 ngroups = EXT4_SB(sb)->s_groups_count;
2469 smp_rmb();
2470
2471 /*
2472 * Compute the overhead (FS structures)
2473 */
2474
2475 /*
2476 * All of the blocks before first_data_block are
2477 * overhead
2478 */
2479 overhead = le32_to_cpu(es->s_first_data_block);
2480
2481 /*
2482 * Add the overhead attributed to the superblock and
2483 * block group descriptors. If the sparse superblocks
2484 * feature is turned on, then not all groups have this.
2485 */
2486 for (i = 0; i < ngroups; i++) {
2487 overhead += ext4_bg_has_super(sb, i) +
2488 ext4_bg_num_gdb(sb, i);
2489 cond_resched();
2490 }
2491
2492 /*
2493 * Every block group has an inode bitmap, a block
2494 * bitmap, and an inode table.
2495 */
2496 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
2497 }
2498
2499 buf->f_type = EXT4_SUPER_MAGIC;
2500 buf->f_bsize = sb->s_blocksize;
2501 buf->f_blocks = ext4_blocks_count(es) - overhead;
2502 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2503 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2504 if (buf->f_bfree < ext4_r_blocks_count(es))
2505 buf->f_bavail = 0;
2506 buf->f_files = le32_to_cpu(es->s_inodes_count);
2507 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2508 buf->f_namelen = EXT4_NAME_LEN;
2509 return 0;
2510}
2511
2512/* Helper function for writing quotas on sync - we need to start a transaction
2513 * before the quota file is locked for write. Otherwise there are possible deadlocks:
2514 * Process 1 Process 2
2515 * ext4_create() quota_sync()
2516 * jbd2_journal_start() write_dquot()
2517 * DQUOT_INIT() down(dqio_mutex)
2518 * down(dqio_mutex) jbd2_journal_start()
2519 *
2520 */
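/*
 * The quota helpers below all follow the same shape to avoid that
 * inversion (a sketch, not kernel code):
 *
 *	handle = ext4_journal_start(...);   <- transaction first
 *	ret = dquot_commit(...);            <- takes dqio_mutex inside
 *	err = ext4_journal_stop(handle);
 *
 * so both columns of the table above end up acquiring the two locks in
 * one consistent order.
 */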
2521
2522#ifdef CONFIG_QUOTA
2523
2524static inline struct inode *dquot_to_inode(struct dquot *dquot)
2525{
2526 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2527}
2528
2529static int ext4_dquot_initialize(struct inode *inode, int type)
2530{
2531 handle_t *handle;
2532 int ret, err;
2533
2534 /* We may create quota structure so we need to reserve enough blocks */
2535 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
2536 if (IS_ERR(handle))
2537 return PTR_ERR(handle);
2538 ret = dquot_initialize(inode, type);
2539 err = ext4_journal_stop(handle);
2540 if (!ret)
2541 ret = err;
2542 return ret;
2543}
2544
2545static int ext4_dquot_drop(struct inode *inode)
2546{
2547 handle_t *handle;
2548 int ret, err;
2549
2550 /* We may delete quota structure so we need to reserve enough blocks */
2551 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
2552 if (IS_ERR(handle))
2553 return PTR_ERR(handle);
2554 ret = dquot_drop(inode);
2555 err = ext4_journal_stop(handle);
2556 if (!ret)
2557 ret = err;
2558 return ret;
2559}
2560
2561static int ext4_write_dquot(struct dquot *dquot)
2562{
2563 int ret, err;
2564 handle_t *handle;
2565 struct inode *inode;
2566
2567 inode = dquot_to_inode(dquot);
2568 handle = ext4_journal_start(inode,
2569 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2570 if (IS_ERR(handle))
2571 return PTR_ERR(handle);
2572 ret = dquot_commit(dquot);
2573 err = ext4_journal_stop(handle);
2574 if (!ret)
2575 ret = err;
2576 return ret;
2577}
2578
2579static int ext4_acquire_dquot(struct dquot *dquot)
2580{
2581 int ret, err;
2582 handle_t *handle;
2583
2584 handle = ext4_journal_start(dquot_to_inode(dquot),
2585 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2586 if (IS_ERR(handle))
2587 return PTR_ERR(handle);
2588 ret = dquot_acquire(dquot);
2589 err = ext4_journal_stop(handle);
2590 if (!ret)
2591 ret = err;
2592 return ret;
2593}
2594
2595static int ext4_release_dquot(struct dquot *dquot)
2596{
2597 int ret, err;
2598 handle_t *handle;
2599
2600 handle = ext4_journal_start(dquot_to_inode(dquot),
2601 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2602 if (IS_ERR(handle))
2603 return PTR_ERR(handle);
2604 ret = dquot_release(dquot);
2605 err = ext4_journal_stop(handle);
2606 if (!ret)
2607 ret = err;
2608 return ret;
2609}
2610
2611static int ext4_mark_dquot_dirty(struct dquot *dquot)
2612{
2613 /* Are we journalling quotas? */
2614 if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2615 EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2616 dquot_mark_dquot_dirty(dquot);
2617 return ext4_write_dquot(dquot);
2618 } else {
2619 return dquot_mark_dquot_dirty(dquot);
2620 }
2621}
2622
2623static int ext4_write_info(struct super_block *sb, int type)
2624{
2625 int ret, err;
2626 handle_t *handle;
2627
2628 /* Data block + inode block */
2629 handle = ext4_journal_start(sb->s_root->d_inode, 2);
2630 if (IS_ERR(handle))
2631 return PTR_ERR(handle);
2632 ret = dquot_commit_info(sb, type);
2633 err = ext4_journal_stop(handle);
2634 if (!ret)
2635 ret = err;
2636 return ret;
2637}
2638
2639/*
2640 * Turn on quotas during mount time - we need to find
2641 * the quota file and such...
2642 */
2643static int ext4_quota_on_mount(struct super_block *sb, int type)
2644{
2645 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
2646 EXT4_SB(sb)->s_jquota_fmt, type);
2647}
2648
2649/*
2650 * Standard function to be called on quota_on
2651 */
2652static int ext4_quota_on(struct super_block *sb, int type, int format_id,
2653 char *path)
2654{
2655 int err;
2656 struct nameidata nd;
2657
2658 if (!test_opt(sb, QUOTA))
2659 return -EINVAL;
2660 /* Not journalling quota? */
2661 if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
2662 !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
2663 return vfs_quota_on(sb, type, format_id, path);
2664 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2665 if (err)
2666 return err;
2667 /* Quotafile not on the same filesystem? */
2668 if (nd.mnt->mnt_sb != sb) {
2669 path_release(&nd);
2670 return -EXDEV;
2671 }
2672	/* Quota file not in the fs root? */
2673 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2674 printk(KERN_WARNING
2675 "EXT4-fs: Quota file not on filesystem root. "
2676 "Journalled quota will not work.\n");
2677 path_release(&nd);
2678 return vfs_quota_on(sb, type, format_id, path);
2679}
2680
2681/* Read data from quotafile - avoid pagecache and such because we cannot afford
2682 * acquiring the locks... As quota files are never truncated and quota code
2683 * itself serializes the operations (and no one else should touch the files)
2684 * we don't have to be afraid of races */
2685static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2686 size_t len, loff_t off)
2687{
2688 struct inode *inode = sb_dqopt(sb)->files[type];
2689 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2690 int err = 0;
2691 int offset = off & (sb->s_blocksize - 1);
2692 int tocopy;
2693 size_t toread;
2694 struct buffer_head *bh;
2695 loff_t i_size = i_size_read(inode);
2696
2697 if (off > i_size)
2698 return 0;
2699 if (off+len > i_size)
2700 len = i_size-off;
2701 toread = len;
2702 while (toread > 0) {
2703 tocopy = sb->s_blocksize - offset < toread ?
2704 sb->s_blocksize - offset : toread;
2705 bh = ext4_bread(NULL, inode, blk, 0, &err);
2706 if (err)
2707 return err;
2708 if (!bh) /* A hole? */
2709 memset(data, 0, tocopy);
2710 else
2711 memcpy(data, bh->b_data+offset, tocopy);
2712 brelse(bh);
2713 offset = 0;
2714 toread -= tocopy;
2715 data += tocopy;
2716 blk++;
2717 }
2718 return len;
2719}
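The block/offset bookkeeping in the loop above splits a request at block boundaries; a small self-contained editor's sketch of the same arithmetic (4096-byte blocks assumed, values arbitrary):

        #include <stdio.h>

        int main(void)
        {
                const long long bs = 4096;
                long long off = 4000, len = 200;
                long long blk = off / bs, offset = off % bs, toread = len;

                while (toread > 0) {
                        long long tocopy =
                                bs - offset < toread ? bs - offset : toread;
                        printf("block %lld, offset %lld, %lld bytes\n",
                               blk, offset, tocopy);
                        offset = 0;
                        toread -= tocopy;
                        blk++;
                }
                /* prints: block 0, offset 4000, 96 bytes
                 *         block 1, offset 0, 104 bytes */
                return 0;
        }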
2720
2721/* Write to quotafile (we know the transaction is already started and has
2722 * enough credits) */
2723static ssize_t ext4_quota_write(struct super_block *sb, int type,
2724 const char *data, size_t len, loff_t off)
2725{
2726 struct inode *inode = sb_dqopt(sb)->files[type];
2727 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2728 int err = 0;
2729 int offset = off & (sb->s_blocksize - 1);
2730 int tocopy;
2731 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
2732 size_t towrite = len;
2733 struct buffer_head *bh;
2734 handle_t *handle = journal_current_handle();
2735
2736 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2737 while (towrite > 0) {
2738 tocopy = sb->s_blocksize - offset < towrite ?
2739 sb->s_blocksize - offset : towrite;
2740 bh = ext4_bread(handle, inode, blk, 1, &err);
2741 if (!bh)
2742 goto out;
2743 if (journal_quota) {
2744 err = ext4_journal_get_write_access(handle, bh);
2745 if (err) {
2746 brelse(bh);
2747 goto out;
2748 }
2749 }
2750 lock_buffer(bh);
2751 memcpy(bh->b_data+offset, data, tocopy);
2752 flush_dcache_page(bh->b_page);
2753 unlock_buffer(bh);
2754 if (journal_quota)
2755 err = ext4_journal_dirty_metadata(handle, bh);
2756 else {
2757 /* Always do at least ordered writes for quotas */
2758 err = ext4_journal_dirty_data(handle, bh);
2759 mark_buffer_dirty(bh);
2760 }
2761 brelse(bh);
2762 if (err)
2763 goto out;
2764 offset = 0;
2765 towrite -= tocopy;
2766 data += tocopy;
2767 blk++;
2768 }
2769out:
2770 if (len == towrite)
2771 return err;
2772 if (inode->i_size < off+len-towrite) {
2773 i_size_write(inode, off+len-towrite);
2774 EXT4_I(inode)->i_disksize = inode->i_size;
2775 }
2776 inode->i_version++;
2777 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2778 ext4_mark_inode_dirty(handle, inode);
2779 mutex_unlock(&inode->i_mutex);
2780 return len - towrite;
2781}
2782
2783#endif
2784
2785static int ext4_get_sb(struct file_system_type *fs_type,
2786 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2787{
2788 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
2789}
2790
2791static struct file_system_type ext4dev_fs_type = {
2792 .owner = THIS_MODULE,
2793 .name = "ext4dev",
2794 .get_sb = ext4_get_sb,
2795 .kill_sb = kill_block_super,
2796 .fs_flags = FS_REQUIRES_DEV,
2797};
2798
2799static int __init init_ext4_fs(void)
2800{
2801 int err = init_ext4_xattr();
2802 if (err)
2803 return err;
2804 err = init_inodecache();
2805 if (err)
2806 goto out1;
2807 err = register_filesystem(&ext4dev_fs_type);
2808 if (err)
2809 goto out;
2810 return 0;
2811out:
2812 destroy_inodecache();
2813out1:
2814 exit_ext4_xattr();
2815 return err;
2816}
2817
2818static void __exit exit_ext4_fs(void)
2819{
2820 unregister_filesystem(&ext4dev_fs_type);
2821 destroy_inodecache();
2822 exit_ext4_xattr();
2823}
2824
2825MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2826MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
2827MODULE_LICENSE("GPL");
2828module_init(init_ext4_fs)
2829module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext4/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext4 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/ext4_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return NULL;
31}
32
33struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
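Editor's note: the "fast" in ext4_fast_symlink_inode_operations refers to the classic ext2/3 scheme this code appears to carry over. A target short enough to fit in the inode's i_data area (the 60 bytes that would otherwise hold the 15 block pointers) is stored there directly, so ext4_follow_link() simply hands that buffer to nd_set_link(); longer targets use page_follow_link_light(), which reads the target from the symlink's data block.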
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
1/*
2 * linux/fs/ext4/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
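To illustrate the entry walk that the layout above implies, here is a simplified, self-contained editor's sketch. demo_entry is a hypothetical flattening of ext4_xattr_entry: real names are variable-length and values live at offsets from the end of the block, neither of which matters for the walk itself:

        #include <stdio.h>

        struct demo_entry {
                unsigned char e_name_len;      /* 0 marks the terminator */
                char          e_name[8];
                unsigned int  e_value_offs;
        };

        int main(void)
        {
                struct demo_entry entries[] = {
                        { 4, "mime",  4072 },
                        { 5, "owner", 4040 },
                        { 0, "",      0 },  /* "four null bytes" terminator */
                };
                struct demo_entry *e;

                /* Walk the downward-growing entry list until the
                 * terminator, mirroring the IS_LAST_ENTRY() loops
                 * used throughout this file. */
                for (e = entries; e->e_name_len; e++)
                        printf("%.*s -> value at offset %u\n",
                               e->e_name_len, e->e_name, e->e_value_offs);
                return 0;
        }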
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext4_jbd2.h>
57#include <linux/ext4_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#define IHDR(inode, raw_inode) \
70 ((struct ext4_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT4_GOOD_OLD_INODE_SIZE + \
73 EXT4_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
75
76#ifdef EXT4_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%lu: ", \
79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \
81 printk("\n"); \
82 } while (0)
83# define ea_bdebug(bh, f...) do { \
84 char b[BDEVNAME_SIZE]; \
85 printk(KERN_DEBUG "block %s:%lu: ", \
86 bdevname(bh->b_bdev, b), \
87 (unsigned long) bh->b_blocknr); \
88 printk(f); \
89 printk("\n"); \
90 } while (0)
91#else
92# define ea_idebug(f...)
93# define ea_bdebug(f...)
94#endif
95
96static void ext4_xattr_cache_insert(struct buffer_head *);
97static struct buffer_head *ext4_xattr_cache_find(struct inode *,
98 struct ext4_xattr_header *,
99 struct mb_cache_entry **);
100static void ext4_xattr_rehash(struct ext4_xattr_header *,
101 struct ext4_xattr_entry *);
102
103static struct mb_cache *ext4_xattr_cache;
104
105static struct xattr_handler *ext4_xattr_handler_map[] = {
106 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
107#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
108 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
109 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
110#endif
111 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
112#ifdef CONFIG_EXT4DEV_FS_SECURITY
113 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
114#endif
115};
116
117struct xattr_handler *ext4_xattr_handlers[] = {
118 &ext4_xattr_user_handler,
119 &ext4_xattr_trusted_handler,
120#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
121 &ext4_xattr_acl_access_handler,
122 &ext4_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT4DEV_FS_SECURITY
125 &ext4_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext4_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
136 handler = ext4_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_mutex: don't care
144 */
145ssize_t
146ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext4_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext4_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext4_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext4_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213static int
214ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext4_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT4_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext4_xattr_check_block(bh)) {
235bad_block: ext4_error(inode->i_sb, __FUNCTION__,
236 "inode %lu: bad block %llu", inode->i_ino,
237 EXT4_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext4_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext4_xattr_ibody_header *header;
268 struct ext4_xattr_entry *entry;
269 struct ext4_inode *raw_inode;
270 struct ext4_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
276 return -ENODATA;
277 error = ext4_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext4_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
284 error = ext4_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext4_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext4_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Buffer is NULL to compute the size of the buffer required.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext4_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT4_I(inode)->xattr_sem);
323 error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext4_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT4_I(inode)->xattr_sem);
329 return error;
330}
331
332static int
333ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext4_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357static int
358ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT4_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext4_xattr_check_block(bh)) {
377 ext4_error(inode->i_sb, __FUNCTION__,
378 "inode %lu: bad block %llu", inode->i_ino,
379 EXT4_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext4_xattr_cache_insert(bh);
384 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext4_xattr_ibody_header *header;
396 struct ext4_inode *raw_inode;
397 struct ext4_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
402 return 0;
403 error = ext4_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext4_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
409 error = ext4_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext4_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext4_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * If buffer is NULL, only the required size is computed.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT4_I(inode)->xattr_sem);
436 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT4_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext4_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
464 EXT4_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
472/*
473 * Release the xattr block BH: if the reference count is > 1, decrement
474 * it; otherwise free the block.
475 */
476static void
477ext4_xattr_release_block(handle_t *handle, struct inode *inode,
478 struct buffer_head *bh)
479{
480 struct mb_cache_entry *ce = NULL;
481
482 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
483 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
484 ea_bdebug(bh, "refcount now=0; freeing");
485 if (ce)
486 mb_cache_entry_free(ce);
487 ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
488 get_bh(bh);
489 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
490 } else {
491 if (ext4_journal_get_write_access(handle, bh) == 0) {
492 lock_buffer(bh);
493 BHDR(bh)->h_refcount = cpu_to_le32(
494 le32_to_cpu(BHDR(bh)->h_refcount) - 1);
495 ext4_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 DQUOT_FREE_BLOCK(inode, 1);
499 unlock_buffer(bh);
500 ea_bdebug(bh, "refcount now=%d; releasing",
501 le32_to_cpu(BHDR(bh)->h_refcount));
502 }
503 if (ce)
504 mb_cache_entry_release(ce);
505 }
506}
507
508struct ext4_xattr_info {
509 int name_index;
510 const char *name;
511 const void *value;
512 size_t value_len;
513};
514
515struct ext4_xattr_search {
516 struct ext4_xattr_entry *first;
517 void *base;
518 void *end;
519 struct ext4_xattr_entry *here;
520 int not_found;
521};
522
523static int
524ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
525{
526 struct ext4_xattr_entry *last;
527 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
528
529 /* Compute min_offs and last. */
530 last = s->first;
531 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
532 if (!last->e_value_block && last->e_value_size) {
533 size_t offs = le16_to_cpu(last->e_value_offs);
534 if (offs < min_offs)
535 min_offs = offs;
536 }
537 }
538 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
539 if (!s->not_found) {
540 if (!s->here->e_value_block && s->here->e_value_size) {
541 size_t size = le32_to_cpu(s->here->e_value_size);
542 free += EXT4_XATTR_SIZE(size);
543 }
544 free += EXT4_XATTR_LEN(name_len);
545 }
546 if (i->value) {
547 if (free < EXT4_XATTR_SIZE(i->value_len) ||
548 free < EXT4_XATTR_LEN(name_len) +
549 EXT4_XATTR_SIZE(i->value_len))
550 return -ENOSPC;
551 }
552
553 if (i->value && s->not_found) {
554 /* Insert the new name. */
555 size_t size = EXT4_XATTR_LEN(name_len);
556 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
557 memmove((void *)s->here + size, s->here, rest);
558 memset(s->here, 0, size);
559 s->here->e_name_index = i->name_index;
560 s->here->e_name_len = name_len;
561 memcpy(s->here->e_name, i->name, name_len);
562 } else {
563 if (!s->here->e_value_block && s->here->e_value_size) {
564 void *first_val = s->base + min_offs;
565 size_t offs = le16_to_cpu(s->here->e_value_offs);
566 void *val = s->base + offs;
567 size_t size = EXT4_XATTR_SIZE(
568 le32_to_cpu(s->here->e_value_size));
569
570 if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
571 /* The old and the new value have the same
572 size. Just replace. */
573 s->here->e_value_size =
574 cpu_to_le32(i->value_len);
575 memset(val + size - EXT4_XATTR_PAD, 0,
576 EXT4_XATTR_PAD); /* Clear pad bytes. */
577 memcpy(val, i->value, i->value_len);
578 return 0;
579 }
580
581 /* Remove the old value. */
582 memmove(first_val + size, first_val, val - first_val);
583 memset(first_val, 0, size);
584 s->here->e_value_size = 0;
585 s->here->e_value_offs = 0;
586 min_offs += size;
587
588 /* Adjust all value offsets. */
589 last = s->first;
590 while (!IS_LAST_ENTRY(last)) {
591 size_t o = le16_to_cpu(last->e_value_offs);
592 if (!last->e_value_block &&
593 last->e_value_size && o < offs)
594 last->e_value_offs =
595 cpu_to_le16(o + size);
596 last = EXT4_XATTR_NEXT(last);
597 }
598 }
599 if (!i->value) {
600 /* Remove the old name. */
601 size_t size = EXT4_XATTR_LEN(name_len);
602 last = ENTRY((void *)last - size);
603 memmove(s->here, (void *)s->here + size,
604 (void *)last - (void *)s->here + sizeof(__u32));
605 memset(last, 0, size);
606 }
607 }
608
609 if (i->value) {
610 /* Insert the new value. */
611 s->here->e_value_size = cpu_to_le32(i->value_len);
612 if (i->value_len) {
613 size_t size = EXT4_XATTR_SIZE(i->value_len);
614 void *val = s->base + min_offs - size;
615 s->here->e_value_offs = cpu_to_le16(min_offs - size);
616 memset(val + size - EXT4_XATTR_PAD, 0,
617 EXT4_XATTR_PAD); /* Clear the pad bytes. */
618 memcpy(val, i->value, i->value_len);
619 }
620 }
621 return 0;
622}
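/*
 * Layout assumed throughout ext4_xattr_set_entry(), for both the
 * in-inode area and a separate block: entry descriptors are packed
 * from s->first toward the end, values are carved from s->end downward,
 * min_offs marks the start of the value area, and the free gap is the
 * space between the terminating __u32 zero after the last entry and
 * min_offs.
 */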
623
624struct ext4_xattr_block_find {
625 struct ext4_xattr_search s;
626 struct buffer_head *bh;
627};
628
629static int
630ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
631 struct ext4_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT4_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext4_xattr_check_block(bs->bh)) {
649 ext4_error(sb, __FUNCTION__,
650 "inode %lu: bad block %llu", inode->i_ino,
651 EXT4_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
672static int
673ext4_xattr_block_set(handle_t *handle, struct inode *inode,
674 struct ext4_xattr_info *i,
675 struct ext4_xattr_block_find *bs)
676{
677 struct super_block *sb = inode->i_sb;
678 struct buffer_head *new_bh = NULL;
679 struct ext4_xattr_search *s = &bs->s;
680 struct mb_cache_entry *ce = NULL;
681 int error;
682
683#define header(x) ((struct ext4_xattr_header *)(x))
684
685 if (i->value && i->value_len > sb->s_blocksize)
686 return -ENOSPC;
687 if (s->base) {
688 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
689 bs->bh->b_blocknr);
690 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
691 if (ce) {
692 mb_cache_entry_free(ce);
693 ce = NULL;
694 }
695 ea_bdebug(bs->bh, "modifying in-place");
696 error = ext4_journal_get_write_access(handle, bs->bh);
697 if (error)
698 goto cleanup;
699 lock_buffer(bs->bh);
700 error = ext4_xattr_set_entry(i, s);
701 if (!error) {
702 if (!IS_LAST_ENTRY(s->first))
703 ext4_xattr_rehash(header(s->base),
704 s->here);
705 ext4_xattr_cache_insert(bs->bh);
706 }
707 unlock_buffer(bs->bh);
708 if (error == -EIO)
709 goto bad_block;
710 if (!error)
711 error = ext4_journal_dirty_metadata(handle,
712 bs->bh);
713 if (error)
714 goto cleanup;
715 goto inserted;
716 } else {
717 int offset = (char *)s->here - bs->bh->b_data;
718
719 if (ce) {
720 mb_cache_entry_release(ce);
721 ce = NULL;
722 }
723 ea_bdebug(bs->bh, "cloning");
724 s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
725 error = -ENOMEM;
726 if (s->base == NULL)
727 goto cleanup;
728 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
729 s->first = ENTRY(header(s->base)+1);
730 header(s->base)->h_refcount = cpu_to_le32(1);
731 s->here = ENTRY(s->base + offset);
732 s->end = s->base + bs->bh->b_size;
733 }
734 } else {
735 /* Allocate a buffer where we construct the new block. */
736 s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
737 /* assert(header == s->base) */
738 error = -ENOMEM;
739 if (s->base == NULL)
740 goto cleanup;
741 memset(s->base, 0, sb->s_blocksize);
742 header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
743 header(s->base)->h_blocks = cpu_to_le32(1);
744 header(s->base)->h_refcount = cpu_to_le32(1);
745 s->first = ENTRY(header(s->base)+1);
746 s->here = ENTRY(header(s->base)+1);
747 s->end = s->base + sb->s_blocksize;
748 }
749
750 error = ext4_xattr_set_entry(i, s);
751 if (error == -EIO)
752 goto bad_block;
753 if (error)
754 goto cleanup;
755 if (!IS_LAST_ENTRY(s->first))
756 ext4_xattr_rehash(header(s->base), s->here);
757
758inserted:
759 if (!IS_LAST_ENTRY(s->first)) {
760 new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
761 if (new_bh) {
762 /* We found an identical block in the cache. */
763 if (new_bh == bs->bh)
764 ea_bdebug(new_bh, "keeping");
765 else {
766 /* The old block is released after updating
767 the inode. */
768 error = -EDQUOT;
769 if (DQUOT_ALLOC_BLOCK(inode, 1))
770 goto cleanup;
771 error = ext4_journal_get_write_access(handle,
772 new_bh);
773 if (error)
774 goto cleanup_dquot;
775 lock_buffer(new_bh);
776 BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
777 le32_to_cpu(BHDR(new_bh)->h_refcount));
778 ea_bdebug(new_bh, "reusing; refcount now=%d",
779 le32_to_cpu(BHDR(new_bh)->h_refcount));
780 unlock_buffer(new_bh);
781 error = ext4_journal_dirty_metadata(handle,
782 new_bh);
783 if (error)
784 goto cleanup_dquot;
785 }
786 mb_cache_entry_release(ce);
787 ce = NULL;
788 } else if (bs->bh && s->base == bs->bh->b_data) {
789 /* We were modifying this block in-place. */
790 ea_bdebug(bs->bh, "keeping this block");
791 new_bh = bs->bh;
792 get_bh(new_bh);
793 } else {
794 /* We need to allocate a new block */
795 ext4_fsblk_t goal = le32_to_cpu(
796 EXT4_SB(sb)->s_es->s_first_data_block) +
797 (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
798 EXT4_BLOCKS_PER_GROUP(sb);
799 ext4_fsblk_t block = ext4_new_block(handle, inode,
800 goal, &error);
801 if (error)
802 goto cleanup;
803 ea_idebug(inode, "creating block %d", block);
804
805 new_bh = sb_getblk(sb, block);
806 if (!new_bh) {
807getblk_failed:
808 ext4_free_blocks(handle, inode, block, 1);
809 error = -EIO;
810 goto cleanup;
811 }
812 lock_buffer(new_bh);
813 error = ext4_journal_get_create_access(handle, new_bh);
814 if (error) {
815 unlock_buffer(new_bh);
816 goto getblk_failed;
817 }
818 memcpy(new_bh->b_data, s->base, new_bh->b_size);
819 set_buffer_uptodate(new_bh);
820 unlock_buffer(new_bh);
821 ext4_xattr_cache_insert(new_bh);
822 error = ext4_journal_dirty_metadata(handle, new_bh);
823 if (error)
824 goto cleanup;
825 }
826 }
827
828 /* Update the inode. */
829 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
830
831 /* Drop the previous xattr block. */
832 if (bs->bh && bs->bh != new_bh)
833 ext4_xattr_release_block(handle, inode, bs->bh);
834 error = 0;
835
836cleanup:
837 if (ce)
838 mb_cache_entry_release(ce);
839 brelse(new_bh);
840 if (!(bs->bh && s->base == bs->bh->b_data))
841 kfree(s->base);
842
843 return error;
844
845cleanup_dquot:
846 DQUOT_FREE_BLOCK(inode, 1);
847 goto cleanup;
848
849bad_block:
850 ext4_error(inode->i_sb, __FUNCTION__,
851 "inode %lu: bad block %llu", inode->i_ino,
852 EXT4_I(inode)->i_file_acl);
853 goto cleanup;
854
855#undef header
856}
857
858struct ext4_xattr_ibody_find {
859 struct ext4_xattr_search s;
860 struct ext4_iloc iloc;
861};
862
863static int
864ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
865 struct ext4_xattr_ibody_find *is)
866{
867 struct ext4_xattr_ibody_header *header;
868 struct ext4_inode *raw_inode;
869 int error;
870
871 if (EXT4_I(inode)->i_extra_isize == 0)
872 return 0;
873 raw_inode = ext4_raw_inode(&is->iloc);
874 header = IHDR(inode, raw_inode);
875 is->s.base = is->s.first = IFIRST(header);
876 is->s.here = is->s.first;
877 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
878 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
879 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
880 if (error)
881 return error;
882 /* Find the named attribute. */
883 error = ext4_xattr_find_entry(&is->s.here, i->name_index,
884 i->name, is->s.end -
885 (void *)is->s.base, 0);
886 if (error && error != -ENODATA)
887 return error;
888 is->s.not_found = error;
889 }
890 return 0;
891}
892
893static int
894ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
895 struct ext4_xattr_info *i,
896 struct ext4_xattr_ibody_find *is)
897{
898 struct ext4_xattr_ibody_header *header;
899 struct ext4_xattr_search *s = &is->s;
900 int error;
901
902 if (EXT4_I(inode)->i_extra_isize == 0)
903 return -ENOSPC;
904 error = ext4_xattr_set_entry(i, s);
905 if (error)
906 return error;
907 header = IHDR(inode, ext4_raw_inode(&is->iloc));
908 if (!IS_LAST_ENTRY(s->first)) {
909 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
910 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
911 } else {
912 header->h_magic = cpu_to_le32(0);
913 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
914 }
915 return 0;
916}
917
918/*
919 * ext4_xattr_set_handle()
920 *
921 * Create, replace or remove an extended attribute for this inode. Value
922 * is NULL to remove an existing extended attribute, and non-NULL to
923 * either replace an existing extended attribute, or create a new extended
924 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
925 * specify that an extended attribute must exist and must not exist
926 * prior to the call, respectively.
927 *
928 * Returns 0, or a negative error number on failure.
929 */
930int
931ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
932 const char *name, const void *value, size_t value_len,
933 int flags)
934{
935 struct ext4_xattr_info i = {
936 .name_index = name_index,
937 .name = name,
938 .value = value,
939 .value_len = value_len,
940
941 };
942 struct ext4_xattr_ibody_find is = {
943 .s = { .not_found = -ENODATA, },
944 };
945 struct ext4_xattr_block_find bs = {
946 .s = { .not_found = -ENODATA, },
947 };
948 int error;
949
950 if (!name)
951 return -EINVAL;
952 if (strlen(name) > 255)
953 return -ERANGE;
954 down_write(&EXT4_I(inode)->xattr_sem);
955 error = ext4_get_inode_loc(inode, &is.iloc);
956 if (error)
957 goto cleanup;
958
959 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
960 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
961 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
962 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
963 }
964
965 error = ext4_xattr_ibody_find(inode, &i, &is);
966 if (error)
967 goto cleanup;
968 if (is.s.not_found)
969 error = ext4_xattr_block_find(inode, &i, &bs);
970 if (error)
971 goto cleanup;
972 if (is.s.not_found && bs.s.not_found) {
973 error = -ENODATA;
974 if (flags & XATTR_REPLACE)
975 goto cleanup;
976 error = 0;
977 if (!value)
978 goto cleanup;
979 } else {
980 error = -EEXIST;
981 if (flags & XATTR_CREATE)
982 goto cleanup;
983 }
984 error = ext4_journal_get_write_access(handle, is.iloc.bh);
985 if (error)
986 goto cleanup;
987 if (!value) {
988 if (!is.s.not_found)
989 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
990 else if (!bs.s.not_found)
991 error = ext4_xattr_block_set(handle, inode, &i, &bs);
992 } else {
993 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
994 if (!error && !bs.s.not_found) {
995 i.value = NULL;
996 error = ext4_xattr_block_set(handle, inode, &i, &bs);
997 } else if (error == -ENOSPC) {
998 error = ext4_xattr_block_set(handle, inode, &i, &bs);
999 if (error)
1000 goto cleanup;
1001 if (!is.s.not_found) {
1002 i.value = NULL;
1003 error = ext4_xattr_ibody_set(handle, inode, &i,
1004 &is);
1005 }
1006 }
1007 }
1008 if (!error) {
1009 ext4_xattr_update_super_block(handle, inode->i_sb);
1010 inode->i_ctime = CURRENT_TIME_SEC;
1011 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1012 /*
1013 * The bh is consumed by ext4_mark_iloc_dirty, even with
1014 * error != 0.
1015 */
1016 is.iloc.bh = NULL;
1017 if (IS_SYNC(inode))
1018 handle->h_sync = 1;
1019 }
1020
1021cleanup:
1022 brelse(is.iloc.bh);
1023 brelse(bs.bh);
1024 up_write(&EXT4_I(inode)->xattr_sem);
1025 return error;
1026}
1027
1028/*
1029 * ext4_xattr_set()
1030 *
1031 * Like ext4_xattr_set_handle, but starts its own journal handle. This
1032 * extended attribute modification is a filesystem transaction by itself.
1033 *
1034 * Returns 0, or a negative error number on failure.
1035 */
1036int
1037ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1038 const void *value, size_t value_len, int flags)
1039{
1040 handle_t *handle;
1041 int error, retries = 0;
1042
1043retry:
1044 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
1045 if (IS_ERR(handle)) {
1046 error = PTR_ERR(handle);
1047 } else {
1048 int error2;
1049
1050 error = ext4_xattr_set_handle(handle, inode, name_index, name,
1051 value, value_len, flags);
1052 error2 = ext4_journal_stop(handle);
1053 if (error == -ENOSPC &&
1054 ext4_should_retry_alloc(inode->i_sb, &retries))
1055 goto retry;
1056 if (error == 0)
1057 error = error2;
1058 }
1059
1060 return error;
1061}
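/*
 * Flag semantics in a sketch (the attribute name and values are
 * hypothetical; each ext4_xattr_set() call is its own transaction):
 */
static int example_tag_inode(struct inode *inode)
{
	/* Create only: fails with -EEXIST if "origin" already exists. */
	int err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
				 "cdrom", 5, XATTR_CREATE);
	if (err)
		return err;
	/* Replace only: fails with -ENODATA if "origin" does not exist. */
	err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
			     "net", 3, XATTR_REPLACE);
	if (err)
		return err;
	/* A NULL value removes the attribute. */
	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "origin",
			      NULL, 0, 0);
}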
1062
1063/*
1064 * ext4_xattr_delete_inode()
1065 *
1066 * Free extended attribute resources associated with this inode. This
1067 * is called immediately before an inode is freed. We have exclusive
1068 * access to the inode.
1069 */
1070void
1071ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1072{
1073 struct buffer_head *bh = NULL;
1074
1075 if (!EXT4_I(inode)->i_file_acl)
1076 goto cleanup;
1077 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1078 if (!bh) {
1079 ext4_error(inode->i_sb, __FUNCTION__,
1080 "inode %lu: block %llu read error", inode->i_ino,
1081 EXT4_I(inode)->i_file_acl);
1082 goto cleanup;
1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext4_error(inode->i_sb, __FUNCTION__,
1087 "inode %lu: bad block %llu", inode->i_ino,
1088 EXT4_I(inode)->i_file_acl);
1089 goto cleanup;
1090 }
1091 ext4_xattr_release_block(handle, inode, bh);
1092 EXT4_I(inode)->i_file_acl = 0;
1093
1094cleanup:
1095 brelse(bh);
1096}
1097
1098/*
1099 * ext4_xattr_put_super()
1100 *
1101 * This is called when a file system is unmounted.
1102 */
1103void
1104ext4_xattr_put_super(struct super_block *sb)
1105{
1106 mb_cache_shrink(sb->s_bdev);
1107}
1108
1109/*
1110 * ext4_xattr_cache_insert()
1111 *
1112 * Create a new entry in the extended attribute cache, and insert
1113 * it unless such an entry is already in the cache.
1114 *
1115 * Failures are silently ignored; the function has no return value.
1116 */
1117static void
1118ext4_xattr_cache_insert(struct buffer_head *bh)
1119{
1120 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1121 struct mb_cache_entry *ce;
1122 int error;
1123
1124 ce = mb_cache_entry_alloc(ext4_xattr_cache);
1125 if (!ce) {
1126 ea_bdebug(bh, "out of memory");
1127 return;
1128 }
1129 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1130 if (error) {
1131 mb_cache_entry_free(ce);
1132 if (error == -EBUSY) {
1133 ea_bdebug(bh, "already in cache");
1134 error = 0;
1135 }
1136 } else {
1137 ea_bdebug(bh, "inserting [%x]", (int)hash);
1138 mb_cache_entry_release(ce);
1139 }
1140}
1141
1142/*
1143 * ext4_xattr_cmp()
1144 *
1145 * Compare two extended attribute blocks for equality.
1146 *
1147 * Returns 0 if the blocks are equal, 1 if they differ, and
1148 * a negative error number on errors.
1149 */
1150static int
1151ext4_xattr_cmp(struct ext4_xattr_header *header1,
1152 struct ext4_xattr_header *header2)
1153{
1154 struct ext4_xattr_entry *entry1, *entry2;
1155
1156 entry1 = ENTRY(header1+1);
1157 entry2 = ENTRY(header2+1);
1158 while (!IS_LAST_ENTRY(entry1)) {
1159 if (IS_LAST_ENTRY(entry2))
1160 return 1;
1161 if (entry1->e_hash != entry2->e_hash ||
1162 entry1->e_name_index != entry2->e_name_index ||
1163 entry1->e_name_len != entry2->e_name_len ||
1164 entry1->e_value_size != entry2->e_value_size ||
1165 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1166 return 1;
1167 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1168 return -EIO;
1169 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1170 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1171 le32_to_cpu(entry1->e_value_size)))
1172 return 1;
1173
1174 entry1 = EXT4_XATTR_NEXT(entry1);
1175 entry2 = EXT4_XATTR_NEXT(entry2);
1176 }
1177 if (!IS_LAST_ENTRY(entry2))
1178 return 1;
1179 return 0;
1180}
1181
1182/*
1183 * ext4_xattr_cache_find()
1184 *
1185 * Find an identical extended attribute block.
1186 *
1187 * Returns a pointer to the block found, or NULL if such a block was
1188 * not found or an error occurred.
1189 */
1190static struct buffer_head *
1191ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1192 struct mb_cache_entry **pce)
1193{
1194 __u32 hash = le32_to_cpu(header->h_hash);
1195 struct mb_cache_entry *ce;
1196
1197 if (!header->h_hash)
1198 return NULL; /* never share */
1199 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1200again:
1201 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
1202 inode->i_sb->s_bdev, hash);
1203 while (ce) {
1204 struct buffer_head *bh;
1205
1206 if (IS_ERR(ce)) {
1207 if (PTR_ERR(ce) == -EAGAIN)
1208 goto again;
1209 break;
1210 }
1211 bh = sb_bread(inode->i_sb, ce->e_block);
1212 if (!bh) {
1213 ext4_error(inode->i_sb, __FUNCTION__,
1214 "inode %lu: block %lu read error",
1215 inode->i_ino, (unsigned long) ce->e_block);
1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1217 EXT4_XATTR_REFCOUNT_MAX) {
1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1219 (unsigned long) ce->e_block,
1220 le32_to_cpu(BHDR(bh)->h_refcount),
1221 EXT4_XATTR_REFCOUNT_MAX);
1222 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1223 *pce = ce;
1224 return bh;
1225 }
1226 brelse(bh);
1227 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
1228 }
1229 return NULL;
1230}
1231
1232#define NAME_HASH_SHIFT 5
1233#define VALUE_HASH_SHIFT 16
1234
1235/*
1236 * ext4_xattr_hash_entry()
1237 *
1238 * Compute the hash of an extended attribute.
1239 */
1240static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
1241 struct ext4_xattr_entry *entry)
1242{
1243 __u32 hash = 0;
1244 char *name = entry->e_name;
1245 int n;
1246
1247 for (n=0; n < entry->e_name_len; n++) {
1248 hash = (hash << NAME_HASH_SHIFT) ^
1249 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1250 *name++;
1251 }
1252
1253 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1254 __le32 *value = (__le32 *)((char *)header +
1255 le16_to_cpu(entry->e_value_offs));
1256 for (n = (le32_to_cpu(entry->e_value_size) +
1257 EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
1258 hash = (hash << VALUE_HASH_SHIFT) ^
1259 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1260 le32_to_cpu(*value++);
1261 }
1262 }
1263 entry->e_hash = cpu_to_le32(hash);
1264}
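/*
 * The two loops above are rotate-and-xor recurrences: since
 * (h << s) ^ (h >> (32 - s)) is a 32-bit rotate left by s bits, the
 * name folds in as hash = rol32(hash, 5) ^ name_byte and the padded
 * value as hash = rol32(hash, 16) ^ le32_to_cpu(value_word).
 */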
1265
1266#undef NAME_HASH_SHIFT
1267#undef VALUE_HASH_SHIFT
1268
1269#define BLOCK_HASH_SHIFT 16
1270
1271/*
1272 * ext4_xattr_rehash()
1273 *
1274 * Re-compute the extended attribute hash value after an entry has changed.
1275 */
1276static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1277 struct ext4_xattr_entry *entry)
1278{
1279 struct ext4_xattr_entry *here;
1280 __u32 hash = 0;
1281
1282 ext4_xattr_hash_entry(header, entry);
1283 here = ENTRY(header+1);
1284 while (!IS_LAST_ENTRY(here)) {
1285 if (!here->e_hash) {
1286 /* Block is not shared if an entry's hash value == 0 */
1287 hash = 0;
1288 break;
1289 }
1290 hash = (hash << BLOCK_HASH_SHIFT) ^
1291 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1292 le32_to_cpu(here->e_hash);
1293 here = EXT4_XATTR_NEXT(here);
1294 }
1295 header->h_hash = cpu_to_le32(hash);
1296}
1297
1298#undef BLOCK_HASH_SHIFT
1299
1300int __init
1301init_ext4_xattr(void)
1302{
1303 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
1304 sizeof(struct mb_cache_entry) +
1305 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1306 if (!ext4_xattr_cache)
1307 return -ENOMEM;
1308 return 0;
1309}
1310
1311void
1312exit_ext4_xattr(void)
1313{
1314 if (ext4_xattr_cache)
1315 mb_cache_destroy(ext4_xattr_cache);
1316 ext4_xattr_cache = NULL;
1317}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
1/*
2 File: fs/ext4/xattr.h
3
4 On-disk format of extended attributes for the ext4 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT4_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT4_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT4_XATTR_INDEX_USER 1
19#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6
24
25struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext4_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext4_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41 __le32 e_value_block; /* disk block attribute is stored on (not implemented) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT4_XATTR_PAD_BITS 2
48#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
49#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
50#define EXT4_XATTR_LEN(name_len) \
51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
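/*
 * Worked example (sketch): with sizeof(struct ext4_xattr_entry) == 16,
 * an attribute named "foo" (e_name_len == 3; the namespace prefix is
 * carried by e_name_index) with a 10-byte value costs
 *
 *	EXT4_XATTR_LEN(3)   = (3 + 3 + 16) & ~3 = 20 bytes of entry,
 *	EXT4_XATTR_SIZE(10) = (10 + 3) & ~3     = 12 bytes of value,
 *
 * both rounded up to the 4-byte EXT4_XATTR_PAD boundary.
 */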
58
59# ifdef CONFIG_EXT4DEV_FS_XATTR
60
61extern struct xattr_handler ext4_xattr_user_handler;
62extern struct xattr_handler ext4_xattr_trusted_handler;
63extern struct xattr_handler ext4_xattr_acl_access_handler;
64extern struct xattr_handler ext4_xattr_acl_default_handler;
65extern struct xattr_handler ext4_xattr_security_handler;
66
67extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
68
69extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
70extern int ext4_xattr_list(struct inode *, char *, size_t);
71extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
72extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
73
74extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
75extern void ext4_xattr_put_super(struct super_block *);
76
77extern int init_ext4_xattr(void);
78extern void exit_ext4_xattr(void);
79
80extern struct xattr_handler *ext4_xattr_handlers[];
81
82# else /* CONFIG_EXT4DEV_FS_XATTR */
83
84static inline int
85ext4_xattr_get(struct inode *inode, int name_index, const char *name,
86 void *buffer, size_t size, int flags)
87{
88 return -EOPNOTSUPP;
89}
90
91static inline int
92ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
93{
94 return -EOPNOTSUPP;
95}
96
97static inline int
98ext4_xattr_set(struct inode *inode, int name_index, const char *name,
99 const void *value, size_t size, int flags)
100{
101 return -EOPNOTSUPP;
102}
103
104static inline int
105ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
106 const char *name, const void *value, size_t size, int flags)
107{
108 return -EOPNOTSUPP;
109}
110
111static inline void
112ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
113{
114}
115
116static inline void
117ext4_xattr_put_super(struct super_block *sb)
118{
119}
120
121static inline int
122init_ext4_xattr(void)
123{
124 return 0;
125}
126
127static inline void
128exit_ext4_xattr(void)
129{
130}
131
132#define ext4_xattr_handlers NULL
133
134# endif /* CONFIG_EXT4DEV_FS_XATTR */
135
136#ifdef CONFIG_EXT4DEV_FS_SECURITY
137extern int ext4_init_security(handle_t *handle, struct inode *inode,
138 struct inode *dir);
139#else
140static inline int ext4_init_security(handle_t *handle, struct inode *inode,
141 struct inode *dir)
142{
143 return 0;
144}
145#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
1/*
2 * linux/fs/ext4/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext4_jbd2.h>
11#include <linux/ext4_fs.h>
12#include <linux/security.h>
13#include "xattr.h"
14
15static size_t
16ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
17 const char *name, size_t name_len)
18{
19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
20 const size_t total_len = prefix_len + name_len + 1;
21
22
23 if (list && total_len <= list_size) {
24 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
25 memcpy(list+prefix_len, name, name_len);
26 list[prefix_len + name_len] = '\0';
27 }
28 return total_len;
29}
30
31static int
32ext4_xattr_security_get(struct inode *inode, const char *name,
33 void *buffer, size_t size)
34{
35 if (strcmp(name, "") == 0)
36 return -EINVAL;
37 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
38 buffer, size);
39}
40
41static int
42ext4_xattr_security_set(struct inode *inode, const char *name,
43 const void *value, size_t size, int flags)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
48 value, size, flags);
49}
50
51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
53{
54 int err;
55 size_t len;
56 void *value;
57 char *name;
58
59 err = security_inode_init_security(inode, dir, &name, &value, &len);
60 if (err) {
61 if (err == -EOPNOTSUPP)
62 return 0;
63 return err;
64 }
65 err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
66 name, value, len, 0);
67 kfree(name);
68 kfree(value);
69 return err;
70}
71
72struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get,
76 .set = ext4_xattr_security_set,
77};
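/*
 * Sketch of the intended call site (an assumption modelled on the ext3
 * precedent; the real hookup lives in the inode-allocation path):
 *
 *	inode = ext4_new_inode(handle, dir, mode);
 *	if (!IS_ERR(inode)) {
 *		err = ext4_init_security(handle, inode, dir);
 *		...
 *	}
 *
 * Note that ext4_init_security() maps -EOPNOTSUPP from the LSM to
 * success, so kernels without a security module pay no penalty.
 */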
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
1/*
2 * linux/fs/ext4/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/smp_lock.h>
13#include <linux/ext4_jbd2.h>
14#include <linux/ext4_fs.h>
15#include "xattr.h"
16
17#define XATTR_TRUSTED_PREFIX "trusted."
18
19static size_t
20ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
21 const char *name, size_t name_len)
22{
23 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
24 const size_t total_len = prefix_len + name_len + 1;
25
26 if (!capable(CAP_SYS_ADMIN))
27 return 0;
28
29 if (list && total_len <= list_size) {
30 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
31 memcpy(list+prefix_len, name, name_len);
32 list[prefix_len + name_len] = '\0';
33 }
34 return total_len;
35}
36
37static int
38ext4_xattr_trusted_get(struct inode *inode, const char *name,
39 void *buffer, size_t size)
40{
41 if (strcmp(name, "") == 0)
42 return -EINVAL;
43 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
44 buffer, size);
45}
46
47static int
48ext4_xattr_trusted_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
54 value, size, flags);
55}
56
57struct xattr_handler ext4_xattr_trusted_handler = {
58 .prefix = XATTR_TRUSTED_PREFIX,
59 .list = ext4_xattr_trusted_list,
60 .get = ext4_xattr_trusted_get,
61 .set = ext4_xattr_trusted_set,
62};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
1/*
2 * linux/fs/ext4/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext4_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP;
44 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
45}
46
47static int
48ext4_xattr_user_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 if (!test_opt(inode->i_sb, XATTR_USER))
54 return -EOPNOTSUPP;
55 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
56 value, size, flags);
57}
58
59struct xattr_handler ext4_xattr_user_handler = {
60 .prefix = XATTR_USER_PREFIX,
61 .list = ext4_xattr_user_list,
62 .get = ext4_xattr_user_get,
63 .set = ext4_xattr_user_set,
64};
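/*
 * Usage note (sketch): the user.* namespace is gated on the user_xattr
 * mount option tested above, so from userspace something like
 *
 *	mount -o remount,user_xattr /mnt
 *	setfattr -n user.comment -v hello /mnt/file
 *	getfattr -n user.comment /mnt/file
 *
 * reaches ext4_xattr_user_set()/ext4_xattr_user_get(); without the
 * option both return -EOPNOTSUPP.
 */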
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f4b8f8b3fbdd..8337451e7897 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,7 @@
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/backing-dev.h>
16#include <linux/blkdev.h> 17#include <linux/blkdev.h>
17 18
18int fat_generic_ioctl(struct inode *inode, struct file *filp, 19int fat_generic_ioctl(struct inode *inode, struct file *filp,
@@ -118,7 +119,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
118 if ((filp->f_mode & FMODE_WRITE) && 119 if ((filp->f_mode & FMODE_WRITE) &&
119 MSDOS_SB(inode->i_sb)->options.flush) { 120 MSDOS_SB(inode->i_sb)->options.flush) {
120 fat_flush_inodes(inode->i_sb, inode, NULL); 121 fat_flush_inodes(inode->i_sb, inode, NULL);
121 blk_congestion_wait(WRITE, HZ/10); 122 congestion_wait(WRITE, HZ/10);
122 } 123 }
123 return 0; 124 return 0;
124} 125}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 045738032a83..78945b53b0f8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -384,7 +384,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
384 le16_to_cpu(de->cdate)) + secs; 384 le16_to_cpu(de->cdate)) + secs;
385 inode->i_ctime.tv_nsec = csecs * 10000000; 385 inode->i_ctime.tv_nsec = csecs * 10000000;
386 inode->i_atime.tv_sec = 386 inode->i_atime.tv_sec =
387 date_dos2unix(le16_to_cpu(0), le16_to_cpu(de->adate)); 387 date_dos2unix(0, le16_to_cpu(de->adate));
388 inode->i_atime.tv_nsec = 0; 388 inode->i_atime.tv_nsec = 0;
389 } else 389 } else
390 inode->i_ctime = inode->i_atime = inode->i_mtime; 390 inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -1472,7 +1472,7 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
1472 ret = writeback_inode(i1); 1472 ret = writeback_inode(i1);
1473 if (!ret && i2) 1473 if (!ret && i2)
1474 ret = writeback_inode(i2); 1474 ret = writeback_inode(i2);
1475 if (!ret && sb) { 1475 if (!ret) {
1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
1477 ret = filemap_flush(mapping); 1477 ret = filemap_flush(mapping);
1478 } 1478 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8605155db171..cfc8f81e60d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -138,6 +138,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
138 struct fuse_entry_out outarg; 138 struct fuse_entry_out outarg;
139 struct fuse_conn *fc; 139 struct fuse_conn *fc;
140 struct fuse_req *req; 140 struct fuse_req *req;
141 struct dentry *parent;
141 142
142 /* Doesn't hurt to "reset" the validity timeout */ 143 /* Doesn't hurt to "reset" the validity timeout */
143 fuse_invalidate_entry_cache(entry); 144 fuse_invalidate_entry_cache(entry);
@@ -151,8 +152,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
151 if (IS_ERR(req)) 152 if (IS_ERR(req))
152 return 0; 153 return 0;
153 154
154 fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); 155 parent = dget_parent(entry);
156 fuse_lookup_init(req, parent->d_inode, entry, &outarg);
155 request_send(fc, req); 157 request_send(fc, req);
158 dput(parent);
156 err = req->out.h.error; 159 err = req->out.h.error;
157 /* Zero nodeid is same as -ENOENT */ 160 /* Zero nodeid is same as -ENOENT */
158 if (!err && !outarg.nodeid) 161 if (!err && !outarg.nodeid)
@@ -163,7 +166,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
163 fuse_send_forget(fc, req, outarg.nodeid, 1); 166 fuse_send_forget(fc, req, outarg.nodeid, 1);
164 return 0; 167 return 0;
165 } 168 }
169 spin_lock(&fc->lock);
166 fi->nlookup ++; 170 fi->nlookup ++;
171 spin_unlock(&fc->lock);
167 } 172 }
168 fuse_put_request(fc, req); 173 fuse_put_request(fc, req);
169 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 174 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
@@ -175,22 +180,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
175 return 1; 180 return 1;
176} 181}
177 182
178/*
179 * Check if there's already a hashed alias of this directory inode.
180 * If yes, then lookup and mkdir must not create a new alias.
181 */
182static int dir_alias(struct inode *inode)
183{
184 if (S_ISDIR(inode->i_mode)) {
185 struct dentry *alias = d_find_alias(inode);
186 if (alias) {
187 dput(alias);
188 return 1;
189 }
190 }
191 return 0;
192}
193
194static int invalid_nodeid(u64 nodeid) 183static int invalid_nodeid(u64 nodeid)
195{ 184{
196 return !nodeid || nodeid == FUSE_ROOT_ID; 185 return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -206,6 +195,24 @@ static int valid_mode(int m)
206 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); 195 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
207} 196}
208 197
198/*
199 * Add a directory inode to a dentry, ensuring that no other dentry
200 * refers to this inode. Called with fc->inst_mutex.
201 */
202static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
203{
204 struct dentry *alias = d_find_alias(inode);
205 if (alias) {
206 /* This tries to shrink the subtree below alias */
207 fuse_invalidate_entry(alias);
208 dput(alias);
209 if (!list_empty(&inode->i_dentry))
210 return -EBUSY;
211 }
212 d_add(entry, inode);
213 return 0;
214}
215
209static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, 216static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
210 struct nameidata *nd) 217 struct nameidata *nd)
211{ 218{
@@ -241,11 +248,17 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
241 if (err && err != -ENOENT) 248 if (err && err != -ENOENT)
242 return ERR_PTR(err); 249 return ERR_PTR(err);
243 250
244 if (inode && dir_alias(inode)) { 251 if (inode && S_ISDIR(inode->i_mode)) {
245 iput(inode); 252 mutex_lock(&fc->inst_mutex);
246 return ERR_PTR(-EIO); 253 err = fuse_d_add_directory(entry, inode);
247 } 254 mutex_unlock(&fc->inst_mutex);
248 d_add(entry, inode); 255 if (err) {
256 iput(inode);
257 return ERR_PTR(err);
258 }
259 } else
260 d_add(entry, inode);
261
249 entry->d_op = &fuse_dentry_operations; 262 entry->d_op = &fuse_dentry_operations;
250 if (!err) 263 if (!err)
251 fuse_change_timeout(entry, &outarg); 264 fuse_change_timeout(entry, &outarg);
@@ -401,12 +414,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
401 } 414 }
402 fuse_put_request(fc, req); 415 fuse_put_request(fc, req);
403 416
404 if (dir_alias(inode)) { 417 if (S_ISDIR(inode->i_mode)) {
405 iput(inode); 418 struct dentry *alias;
406 return -EIO; 419 mutex_lock(&fc->inst_mutex);
407 } 420 alias = d_find_alias(inode);
421 if (alias) {
422 /* New directory must have moved since mkdir */
423 mutex_unlock(&fc->inst_mutex);
424 dput(alias);
425 iput(inode);
426 return -EBUSY;
427 }
428 d_instantiate(entry, inode);
429 mutex_unlock(&fc->inst_mutex);
430 } else
431 d_instantiate(entry, inode);
408 432
409 d_instantiate(entry, inode);
410 fuse_change_timeout(entry, &outarg); 433 fuse_change_timeout(entry, &outarg);
411 fuse_invalidate_attr(dir); 434 fuse_invalidate_attr(dir);
412 return 0; 435 return 0;
@@ -935,14 +958,30 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
935 } 958 }
936} 959}
937 960
961static void fuse_vmtruncate(struct inode *inode, loff_t offset)
962{
963 struct fuse_conn *fc = get_fuse_conn(inode);
964 int need_trunc;
965
966 spin_lock(&fc->lock);
967 need_trunc = inode->i_size > offset;
968 i_size_write(inode, offset);
969 spin_unlock(&fc->lock);
970
971 if (need_trunc) {
972 struct address_space *mapping = inode->i_mapping;
973 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
974 truncate_inode_pages(mapping, offset);
975 }
976}
977
938/* 978/*
939 * Set attributes, and at the same time refresh them. 979 * Set attributes, and at the same time refresh them.
940 * 980 *
941 * Truncation is slightly complicated, because the 'truncate' request 981 * Truncation is slightly complicated, because the 'truncate' request
942 * may fail, in which case we don't want to touch the mapping. 982 * may fail, in which case we don't want to touch the mapping.
943 * vmtruncate() doesn't allow for this case. So do the rlimit 983 * vmtruncate() doesn't allow for this case, so do the rlimit checking
944 * checking by hand and call vmtruncate() only after the file has 984 * and the actual truncation by hand.
945 * actually been truncated.
946 */ 985 */
947static int fuse_setattr(struct dentry *entry, struct iattr *attr) 986static int fuse_setattr(struct dentry *entry, struct iattr *attr)
948{ 987{
@@ -993,12 +1032,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
993 make_bad_inode(inode); 1032 make_bad_inode(inode);
994 err = -EIO; 1033 err = -EIO;
995 } else { 1034 } else {
996 if (is_truncate) { 1035 if (is_truncate)
997 loff_t origsize = i_size_read(inode); 1036 fuse_vmtruncate(inode, outarg.attr.size);
998 i_size_write(inode, outarg.attr.size);
999 if (origsize > outarg.attr.size)
1000 vmtruncate(inode, outarg.attr.size);
1001 }
1002 fuse_change_attributes(inode, &outarg.attr); 1037 fuse_change_attributes(inode, &outarg.attr);
1003 fi->i_time = time_to_jiffies(outarg.attr_valid, 1038 fi->i_time = time_to_jiffies(outarg.attr_valid,
1004 outarg.attr_valid_nsec); 1039 outarg.attr_valid_nsec);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 183626868eea..2bb5ace3882d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -481,8 +481,10 @@ static int fuse_commit_write(struct file *file, struct page *page,
481 err = -EIO; 481 err = -EIO;
482 if (!err) { 482 if (!err) {
483 pos += count; 483 pos += count;
484 if (pos > i_size_read(inode)) 484 spin_lock(&fc->lock);
485 if (pos > inode->i_size)
485 i_size_write(inode, pos); 486 i_size_write(inode, pos);
487 spin_unlock(&fc->lock);
486 488
487 if (offset == 0 && to == PAGE_CACHE_SIZE) { 489 if (offset == 0 && to == PAGE_CACHE_SIZE) {
488 clear_page_dirty(page); 490 clear_page_dirty(page);
@@ -586,8 +588,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
586 } 588 }
587 fuse_put_request(fc, req); 589 fuse_put_request(fc, req);
588 if (res > 0) { 590 if (res > 0) {
589 if (write && pos > i_size_read(inode)) 591 if (write) {
590 i_size_write(inode, pos); 592 spin_lock(&fc->lock);
593 if (pos > inode->i_size)
594 i_size_write(inode, pos);
595 spin_unlock(&fc->lock);
596 }
591 *ppos = pos; 597 *ppos = pos;
592 } 598 }
593 fuse_invalidate_attr(inode); 599 fuse_invalidate_attr(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 69c7750d55b8..91edb8932d90 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -239,6 +239,9 @@ struct fuse_conn {
239 /** Lock protecting accesses to members of this structure */ 239 /** Lock protecting accesses to members of this structure */
240 spinlock_t lock; 240 spinlock_t lock;
241 241
242 /** Mutex protecting against directory alias creation */
243 struct mutex inst_mutex;
244
242 /** Refcount */ 245 /** Refcount */
243 atomic_t count; 246 atomic_t count;
244 247
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d0a9aee01f2..fc4203570370 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -109,6 +109,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
109 109
110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) 110void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
111{ 111{
112 struct fuse_conn *fc = get_fuse_conn(inode);
112 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size) 113 if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
113 invalidate_inode_pages(inode->i_mapping); 114 invalidate_inode_pages(inode->i_mapping);
114 115
@@ -117,7 +118,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
117 inode->i_nlink = attr->nlink; 118 inode->i_nlink = attr->nlink;
118 inode->i_uid = attr->uid; 119 inode->i_uid = attr->uid;
119 inode->i_gid = attr->gid; 120 inode->i_gid = attr->gid;
121 spin_lock(&fc->lock);
120 i_size_write(inode, attr->size); 122 i_size_write(inode, attr->size);
123 spin_unlock(&fc->lock);
121 inode->i_blocks = attr->blocks; 124 inode->i_blocks = attr->blocks;
122 inode->i_atime.tv_sec = attr->atime; 125 inode->i_atime.tv_sec = attr->atime;
123 inode->i_atime.tv_nsec = attr->atimensec; 126 inode->i_atime.tv_nsec = attr->atimensec;
@@ -130,7 +133,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
130static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) 133static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
131{ 134{
132 inode->i_mode = attr->mode & S_IFMT; 135 inode->i_mode = attr->mode & S_IFMT;
133 i_size_write(inode, attr->size); 136 inode->i_size = attr->size;
134 if (S_ISREG(inode->i_mode)) { 137 if (S_ISREG(inode->i_mode)) {
135 fuse_init_common(inode); 138 fuse_init_common(inode);
136 fuse_init_file_inode(inode); 139 fuse_init_file_inode(inode);
@@ -169,7 +172,6 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
169 struct inode *inode; 172 struct inode *inode;
170 struct fuse_inode *fi; 173 struct fuse_inode *fi;
171 struct fuse_conn *fc = get_fuse_conn_super(sb); 174 struct fuse_conn *fc = get_fuse_conn_super(sb);
172 int retried = 0;
173 175
174 retry: 176 retry:
175 inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); 177 inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
@@ -183,16 +185,16 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
183 fuse_init_inode(inode, attr); 185 fuse_init_inode(inode, attr);
184 unlock_new_inode(inode); 186 unlock_new_inode(inode);
185 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 187 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
186 BUG_ON(retried);
187 /* Inode has changed type, any I/O on the old should fail */ 188 /* Inode has changed type, any I/O on the old should fail */
188 make_bad_inode(inode); 189 make_bad_inode(inode);
189 iput(inode); 190 iput(inode);
190 retried = 1;
191 goto retry; 191 goto retry;
192 } 192 }
193 193
194 fi = get_fuse_inode(inode); 194 fi = get_fuse_inode(inode);
195 spin_lock(&fc->lock);
195 fi->nlookup ++; 196 fi->nlookup ++;
197 spin_unlock(&fc->lock);
196 fuse_change_attributes(inode, attr); 198 fuse_change_attributes(inode, attr);
197 return inode; 199 return inode;
198} 200}
@@ -377,6 +379,7 @@ static struct fuse_conn *new_conn(void)
377 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 379 fc = kzalloc(sizeof(*fc), GFP_KERNEL);
378 if (fc) { 380 if (fc) {
379 spin_lock_init(&fc->lock); 381 spin_lock_init(&fc->lock);
382 mutex_init(&fc->inst_mutex);
380 atomic_set(&fc->count, 1); 383 atomic_set(&fc->count, 1);
381 init_waitqueue_head(&fc->waitq); 384 init_waitqueue_head(&fc->waitq);
382 init_waitqueue_head(&fc->blocked_waitq); 385 init_waitqueue_head(&fc->blocked_waitq);
@@ -396,8 +399,10 @@ static struct fuse_conn *new_conn(void)
396 399
397void fuse_conn_put(struct fuse_conn *fc) 400void fuse_conn_put(struct fuse_conn *fc)
398{ 401{
399 if (atomic_dec_and_test(&fc->count)) 402 if (atomic_dec_and_test(&fc->count)) {
403 mutex_destroy(&fc->inst_mutex);
400 kfree(fc); 404 kfree(fc);
405 }
401} 406}
402 407
403struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) 408struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9 that is shared between them (with FC, iSCSI, NBD, etc.). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11 a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
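# Example fragment (sketch): a single-node test configuration would
# typically set
#	CONFIG_GFS2_FS=m
#	CONFIG_GFS2_FS_LOCKING_NOLOCK=m
# while a cluster build enables CONFIG_GFS2_FS_LOCKING_DLM=m instead
# (which pulls in the DLM via "select DLM").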
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68out:
69 posix_acl_release(acl);
70 return error;
71}
72
73int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
74{
75 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
76 return -EOPNOTSUPP;
77 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
78 return -EPERM;
79 if (S_ISLNK(ip->i_di.di_mode))
80 return -EOPNOTSUPP;
81 if (!access && !S_ISDIR(ip->i_di.di_mode))
82 return -EACCES;
83
84 return 0;
85}
86
87static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
88 struct gfs2_ea_location *el, char **data, unsigned int *len)
89{
90 struct gfs2_ea_request er;
91 struct gfs2_ea_location el_this;
92 int error;
93
94 if (!ip->i_di.di_eattr)
95 return 0;
96
97 memset(&er, 0, sizeof(struct gfs2_ea_request));
98 if (access) {
99 er.er_name = GFS2_POSIX_ACL_ACCESS;
100 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
101 } else {
102 er.er_name = GFS2_POSIX_ACL_DEFAULT;
103 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
104 }
105 er.er_type = GFS2_EATYPE_SYS;
106
107 if (!el)
108 el = &el_this;
109
110 error = gfs2_ea_find(ip, &er, el);
111 if (error)
112 return error;
113 if (!el->el_ea)
114 return 0;
115 if (!GFS2_EA_DATA_LEN(el->el_ea))
116 goto out;
117
118 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
119 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
120 error = -ENOMEM;
121 if (!er.er_data)
122 goto out;
123
124 error = gfs2_ea_get_copy(ip, el, er.er_data);
125 if (error)
126 goto out_kfree;
127
128 if (acl) {
129 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
130 if (IS_ERR(*acl))
131 error = PTR_ERR(*acl);
132 }
133
134out_kfree:
135 if (error || !data)
136 kfree(er.er_data);
137 else {
138 *data = er.er_data;
139 *len = er.er_data_len;
140 }
141out:
142 if (error || el == &el_this)
143 brelse(el->el_bh);
144 return error;
145}
146
147/**
148 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
149 * @inode: the file we want to do something to
150 * @mask: what we want to do
151 *
152 * Returns: errno
153 */
154
155int gfs2_check_acl_locked(struct inode *inode, int mask)
156{
157 struct posix_acl *acl = NULL;
158 int error;
159
160 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
161 if (error)
162 return error;
163
164 if (acl) {
165 error = posix_acl_permission(inode, acl, mask);
166 posix_acl_release(acl);
167 return error;
168 }
169
170 return -EAGAIN;
171}
172
173int gfs2_check_acl(struct inode *inode, int mask)
174{
175 struct gfs2_inode *ip = GFS2_I(inode);
176 struct gfs2_holder i_gh;
177 int error;
178
179 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
180 if (!error) {
181 error = gfs2_check_acl_locked(inode, mask);
182 gfs2_glock_dq_uninit(&i_gh);
183 }
184
185 return error;
186}
187
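/*
 * A minimal sketch of how a ->permission handler can consume the
 * -EAGAIN convention above (-EAGAIN means "no access ACL present", so
 * the caller falls back to the classic mode bits). The function name is
 * illustrative only, and the three-argument generic_permission() of
 * this kernel era is assumed:
 */
static inline int example_permission(struct inode *inode, int mask)
{
	int error = gfs2_check_acl(inode, mask);

	if (error != -EAGAIN)
		return error;
	return generic_permission(inode, mask, NULL);
}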
188static int munge_mode(struct gfs2_inode *ip, mode_t mode)
189{
190 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
191 struct buffer_head *dibh;
192 int error;
193
194 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
195 if (error)
196 return error;
197
198 error = gfs2_meta_inode_buffer(ip, &dibh);
199 if (!error) {
200 gfs2_assert_withdraw(sdp,
201 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
202 ip->i_di.di_mode = mode;
203 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
204 gfs2_dinode_out(&ip->i_di, dibh->b_data);
205 brelse(dibh);
206 }
207
208 gfs2_trans_end(sdp);
209
210 return 0;
211}
212
213int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
214{
215 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
216 struct posix_acl *acl = NULL, *clone;
217 struct gfs2_ea_request er;
218 mode_t mode = ip->i_di.di_mode;
219 int error;
220
221 if (!sdp->sd_args.ar_posix_acl)
222 return 0;
223 if (S_ISLNK(ip->i_di.di_mode))
224 return 0;
225
226 memset(&er, 0, sizeof(struct gfs2_ea_request));
227 er.er_type = GFS2_EATYPE_SYS;
228
229 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
230 &er.er_data, &er.er_data_len);
231 if (error)
232 return error;
233 if (!acl) {
234 mode &= ~current->fs->umask;
235 if (mode != ip->i_di.di_mode)
236 error = munge_mode(ip, mode);
237 return error;
238 }
239
240 clone = posix_acl_clone(acl, GFP_KERNEL);
241 error = -ENOMEM;
242 if (!clone)
243 goto out;
244 posix_acl_release(acl);
245 acl = clone;
246
247 if (S_ISDIR(ip->i_di.di_mode)) {
248 er.er_name = GFS2_POSIX_ACL_DEFAULT;
249 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
250 error = gfs2_system_eaops.eo_set(ip, &er);
251 if (error)
252 goto out;
253 }
254
255 error = posix_acl_create_masq(acl, &mode);
256 if (error < 0)
257 goto out;
258 if (error > 0) {
259 er.er_name = GFS2_POSIX_ACL_ACCESS;
260 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
261 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
262 er.er_mode = mode;
263 er.er_flags = GFS2_ERF_MODE;
264 error = gfs2_system_eaops.eo_set(ip, &er);
265 if (error)
266 goto out;
267 } else
268 munge_mode(ip, mode);
269
270out:
271 posix_acl_release(acl);
272 kfree(er.er_data);
273 return error;
274}
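/*
 * Worked illustration of the posix_acl_create_masq() branch above (the
 * ACL text and numbers are assumed, not taken from this filesystem): a
 * parent default ACL of "u::rwx,g::rwx,o::r-x,u:1000:rw-,m::rwx" masked
 * against a requested mode of 0644 returns > 0, because the named entry
 * "u:1000:rw-" cannot be expressed in mode bits, so the masked ACL is
 * written out as the new file's access ACL. A default ACL holding only
 * owner/group/other entries would return 0, and only the mode bits are
 * then updated via munge_mode().
 */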
275
276int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
277{
278 struct posix_acl *acl = NULL, *clone;
279 struct gfs2_ea_location el;
280 char *data;
281 unsigned int len;
282 int error;
283
284 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
285 if (error)
286 return error;
287 if (!acl)
288 return gfs2_setattr_simple(ip, attr);
289
290 clone = posix_acl_clone(acl, GFP_KERNEL);
291 error = -ENOMEM;
292 if (!clone)
293 goto out;
294 posix_acl_release(acl);
295 acl = clone;
296
297 error = posix_acl_chmod_masq(acl, attr->ia_mode);
298 if (!error) {
299 posix_acl_to_xattr(acl, data, len);
300 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
301 }
302
303out:
304 posix_acl_release(acl);
305 brelse(el.el_bh);
306 kfree(data);
307 return error;
308}
309
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#include "incore.h"
14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_ACCESS_LEN 16
17#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
18#define GFS2_POSIX_ACL_DEFAULT_LEN 17
19
20#define GFS2_ACL_IS_ACCESS(name, len) \
21 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
22 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
23
24#define GFS2_ACL_IS_DEFAULT(name, len) \
25 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
26 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
27
28struct gfs2_ea_request;
29
30int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
31 struct gfs2_ea_request *er,
32 int *remove, mode_t *mode);
33int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
34int gfs2_check_acl_locked(struct inode *inode, int mask);
35int gfs2_check_acl(struct inode *inode, int mask);
36int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
37int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
38
39#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..06e9a8cb45e9
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1222 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "dir.h"
29#include "util.h"
30#include "ops_address.h"
31
32/* This doesn't need to be that large, as the maximum number of 64-bit
33 * pointers in a 4k block is 512, so a __u16 index is fine. Keeping it
34 * small saves stack space.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, u64 *top,
42 u64 *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
52 * @ip: the inode
53 * @dibh: the dinode buffer
54 * @block: the block number that was allocated
55 * @page: any locked page held by the caller process
56 *
57 * Returns: errno
58 */
59
60static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
61 u64 block, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 struct inode *inode = &ip->i_inode;
65 struct buffer_head *bh;
66 int release = 0;
67
68 if (!page || page->index) {
69 page = grab_cache_page(inode->i_mapping, 0);
70 if (!page)
71 return -ENOMEM;
72 release = 1;
73 }
74
75 if (!PageUptodate(page)) {
76 void *kaddr = kmap(page);
77
78 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
79 ip->i_di.di_size);
80 memset(kaddr + ip->i_di.di_size, 0,
81 PAGE_CACHE_SIZE - ip->i_di.di_size);
82 kunmap(page);
83
84 SetPageUptodate(page);
85 }
86
87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate));
90
91 bh = page_buffers(page);
92
93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block);
95
96 set_buffer_uptodate(bh);
97 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
99 mark_buffer_dirty(bh);
100
101 if (release) {
102 unlock_page(page);
103 page_cache_release(page);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff
112 * @page: a locked page held by the caller, or NULL (in which case one
113 * is grabbed from the page cache as needed)
114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way.
117 *
118 * Returns: errno
119 */
120
121int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122{
123 struct buffer_head *bh, *dibh;
124 struct gfs2_dinode *di;
125 u64 block = 0;
126 int isdir = gfs2_is_dir(ip);
127 int error;
128
129 down_write(&ip->i_rw_mutex);
130
131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error)
133 goto out;
134
135 if (ip->i_di.di_size) {
136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */
138
139 if (isdir) {
140 block = gfs2_alloc_meta(ip);
141
142 error = gfs2_dir_get_new_buffer(ip, block, &bh);
143 if (error)
144 goto out_brelse;
145 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
146 dibh, sizeof(struct gfs2_dinode));
147 brelse(bh);
148 } else {
149 block = gfs2_alloc_data(ip);
150
151 error = gfs2_unstuffer_page(ip, dibh, block, page);
152 if (error)
153 goto out_brelse;
154 }
155 }
156
157 /* Set up the pointer to the new block */
158
159 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
160 di = (struct gfs2_dinode *)dibh->b_data;
161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
162
163 if (ip->i_di.di_size) {
164 *(__be64 *)(di + 1) = cpu_to_be64(block);
165 ip->i_di.di_blocks++;
166 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
167 }
168
169 ip->i_di.di_height = 1;
170 di->di_height = cpu_to_be16(1);
171
172out_brelse:
173 brelse(dibh);
174out:
175 up_write(&ip->i_rw_mutex);
176 return error;
177}
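/*
 * Sizing note for the unstuff path above (a sketch; 232 bytes is the
 * assumed sizeof(struct gfs2_dinode) for a 4096-byte block): a stuffed
 * inode holds at most 4096 - 232 = 3864 bytes of data directly in the
 * dinode block, so any file growing past that is unstuffed into its own
 * data (or, for directories, metadata) block, and di_height goes from
 * 0 to 1.
 */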
178
179/**
180 * calc_tree_height - Calculate the height of a metadata tree
181 * @ip: The GFS2 inode
182 * @size: The proposed size of the file
183 *
184 * Work out how tall a metadata tree needs to be in order to accommodate a
185 * file of a particular size. If size is less than the current size of
186 * the inode, then the current size of the inode is used instead of the
187 * supplied one.
188 *
189 * Returns: the height the tree should be
190 */
191
192static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
193{
194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
195 u64 *arr;
196 unsigned int max, height;
197
198 if (ip->i_di.di_size > size)
199 size = ip->i_di.di_size;
200
201 if (gfs2_is_dir(ip)) {
202 arr = sdp->sd_jheightsize;
203 max = sdp->sd_max_jheight;
204 } else {
205 arr = sdp->sd_heightsize;
206 max = sdp->sd_max_height;
207 }
208
209 for (height = 0; height < max; height++)
210 if (arr[height] >= size)
211 break;
212
213 return height;
214}
215
216/**
217 * build_height - Build a metadata tree of the requested height
218 * @inode: The inode to build the tree for
219 * @height: The height to build to
220 *
221 *
222 * Returns: errno
223 */
224
225static int build_height(struct inode *inode, unsigned height)
226{
227 struct gfs2_inode *ip = GFS2_I(inode);
228 unsigned new_height = height - ip->i_di.di_height;
229 struct buffer_head *dibh;
230 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
231 struct gfs2_dinode *di;
232 int error;
233 u64 *bp;
234 u64 bn;
235 unsigned n;
236
237 if (height <= ip->i_di.di_height)
238 return 0;
239
240 error = gfs2_meta_inode_buffer(ip, &dibh);
241 if (error)
242 return error;
243
244 for(n = 0; n < new_height; n++) {
245 bn = gfs2_alloc_meta(ip);
246 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
247 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
248 }
249
250 n = 0;
251 bn = blocks[0]->b_blocknr;
252 if (new_height > 1) {
253 for(; n < new_height-1; n++) {
254 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
255 GFS2_FORMAT_IN);
256 gfs2_buffer_clear_tail(blocks[n],
257 sizeof(struct gfs2_meta_header));
258 bp = (u64 *)(blocks[n]->b_data +
259 sizeof(struct gfs2_meta_header));
260 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
261 brelse(blocks[n]);
262 blocks[n] = NULL;
263 }
264 }
265 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
266 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
267 dibh, sizeof(struct gfs2_dinode));
268 brelse(blocks[n]);
269 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
270 di = (struct gfs2_dinode *)dibh->b_data;
271 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
272 *(__be64 *)(di + 1) = cpu_to_be64(bn);
273 ip->i_di.di_height += new_height;
274 ip->i_di.di_blocks += new_height;
275 di->di_height = cpu_to_be16(ip->i_di.di_height);
276 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
277 brelse(dibh);
278 return error;
279}
280
281/**
282 * find_metapath - Find path through the metadata tree
283 * @ip: The inode pointer
284 * @mp: The metapath to return the result in
285 * @block: The disk block to look up
286 *
287 * This routine returns a struct metapath structure that defines a path
288 * through the metadata of inode "ip" to get to block "block".
289 *
290 * Example:
291 * Given: "ip" is a height 3 file, "block" is the logical block that
292 * holds byte offset 101342453, and this is a filesystem with a
293 * blocksize of 4096,
294 *
295 * find_metapath() would return a struct metapath structure with
296 * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
297 *
298 * That means that in order to get to the block containing the byte at
299 * offset 101342453, we would load the indirect block pointed to by pointer
300 * 0 in the dinode. We would then load the indirect block pointed to by
301 * pointer 48 in that indirect block. We would then load the data block
302 * pointed to by pointer 165 in that indirect block.
303 *
304 * ----------------------------------------
305 * | Dinode | |
306 * | | 4|
307 * | |0 1 2 3 4 5 9|
308 * | | 6|
309 * ----------------------------------------
310 * |
311 * |
312 * V
313 * ----------------------------------------
314 * | Indirect Block |
315 * | 5|
316 * | 4 4 4 4 4 5 5 1|
317 * |0 5 6 7 8 9 0 1 2|
318 * ----------------------------------------
319 * |
320 * |
321 * V
322 * ----------------------------------------
323 * | Indirect Block |
324 * | 1 1 1 1 1 5|
325 * | 6 6 6 6 6 1|
326 * |0 3 4 5 6 7 2|
327 * ----------------------------------------
328 * |
329 * |
330 * V
331 * ----------------------------------------
332 * | Data block containing offset |
333 * | 101342453 |
334 * | |
335 * | |
336 * ----------------------------------------
337 *
338 */
339
340static void find_metapath(struct gfs2_inode *ip, u64 block,
341 struct metapath *mp)
342{
343 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
344 u64 b = block;
345 unsigned int i;
346
347 for (i = ip->i_di.di_height; i--;)
348 mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
349
350}
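/*
 * Worked check of the example in the comment above (a sketch, assuming
 * a 4096-byte block size and 512 pointers per indirect block, i.e.
 * sdp->sd_inptrs == 512):
 *
 *   101342453 / 4096 = 24741  (the logical block holding that offset)
 *   24741 % 512 = 165  ->  mp_list[2] = 165,  24741 / 512 = 48
 *      48 % 512 =  48  ->  mp_list[1] =  48,     48 / 512 =  0
 *       0 % 512 =   0  ->  mp_list[0] =   0
 *
 * which reproduces the mp_list values quoted above.
 */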
351
352/**
353 * metapointer - Return pointer to start of metadata in a buffer
354 * @bh: The buffer
355 * @height: The metadata height (0 = dinode)
356 * @mp: The metapath
357 *
358 * Return a pointer to the block number of the next height of the metadata
359 * tree given a buffer containing the pointer to the current height of the
360 * metadata tree.
361 */
362
363static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
364 unsigned int height, const struct metapath *mp)
365{
366 unsigned int head_size = (height > 0) ?
367 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
368 u64 *ptr;
369 *boundary = 0;
370 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
371 if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
372 *boundary = 1;
373 return ptr;
374}
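/*
 * Boundary arithmetic for metapointer() above, as a worked example
 * (assuming 4096-byte blocks and a 24-byte gfs2_meta_header, so 509
 * pointers fit in an indirect block): the pointers occupy bytes
 * 24..4095, and *boundary becomes 1 only when mp_list[height] == 508,
 * i.e. the returned pointer is the last slot before
 * bh->b_data + bh->b_size.
 */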
375
376/**
377 * lookup_block - Get the next metadata block in metadata tree
378 * @ip: The GFS2 inode
379 * @bh: Buffer containing the pointers to metadata blocks
380 * @height: The height of the tree (0 = dinode)
381 * @mp: The metapath
382 * @create: Non-zero if we may create a new metadata block
383 * @new: Used to indicate if we did create a new metadata block
384 * @block: the returned disk block number
385 *
386 * Given a metatree, complete to a particular height, checks to see if the next
387 * height of the tree exists. If not, and @create is set, it is created.
388 * The block number of the next height of the metadata tree is returned.
389 *
390 */
391
392static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
393 unsigned int height, struct metapath *mp, int create,
394 int *new, u64 *block)
395{
396 int boundary;
397 u64 *ptr = metapointer(bh, &boundary, height, mp);
398
399 if (*ptr) {
400 *block = be64_to_cpu(*ptr);
401 return boundary;
402 }
403
404 *block = 0;
405
406 if (!create)
407 return 0;
408
409 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
410 *block = gfs2_alloc_data(ip);
411 else
412 *block = gfs2_alloc_meta(ip);
413
414 gfs2_trans_add_bh(ip->i_gl, bh, 1);
415
416 *ptr = cpu_to_be64(*block);
417 ip->i_di.di_blocks++;
418
419 *new = 1;
420 return 0;
421}
422
423/**
424 * gfs2_block_pointers - Map a block from an inode to a disk block
425 * @inode: The inode
426 * @lblock: The logical block number
427 * @bh_map: The bh to be mapped
428 * @mp: metapath to use
429 *
430 * Find the block number on the current device which corresponds to an
431 * inode's block. With @create set, a missing block is allocated and @bh_map is marked new.
432 *
433 * Returns: errno
434 */
435
436static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
437 struct buffer_head *bh_map, struct metapath *mp)
438{
439 struct gfs2_inode *ip = GFS2_I(inode);
440 struct gfs2_sbd *sdp = GFS2_SB(inode);
441 struct buffer_head *bh;
442 unsigned int bsize;
443 unsigned int height;
444 unsigned int end_of_metadata;
445 unsigned int x;
446 int error = 0;
447 int new = 0;
448 u64 dblock = 0;
449 int boundary;
450 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
451
452 BUG_ON(maxlen == 0);
453
454 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
455 return 0;
456
457 bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
458
459 height = calc_tree_height(ip, (lblock + 1) * bsize);
460 if (ip->i_di.di_height < height) {
461 if (!create)
462 return 0;
463
464 error = build_height(inode, height);
465 if (error)
466 return error;
467 }
468
469 find_metapath(ip, lblock, mp);
470 end_of_metadata = ip->i_di.di_height - 1;
471
472 error = gfs2_meta_inode_buffer(ip, &bh);
473 if (error)
474 return error;
475
476 for (x = 0; x < end_of_metadata; x++) {
477 lookup_block(ip, bh, x, mp, create, &new, &dblock);
478 brelse(bh);
479 if (!dblock)
480 return 0;
481
482 error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
483 if (error)
484 return error;
485 }
486
487 boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
488 clear_buffer_mapped(bh_map);
489 clear_buffer_new(bh_map);
490 clear_buffer_boundary(bh_map);
491
492 if (dblock) {
493 map_bh(bh_map, inode->i_sb, dblock);
494 if (boundary)
495 set_buffer_boundary(bh_map);
496 if (new) {
497 struct buffer_head *dibh;
498 error = gfs2_meta_inode_buffer(ip, &dibh);
499 if (!error) {
500 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
501 gfs2_dinode_out(&ip->i_di, dibh->b_data);
502 brelse(dibh);
503 }
504 set_buffer_new(bh_map);
505 goto out_brelse;
506 }
507 while(--maxlen && !buffer_boundary(bh_map)) {
508 u64 eblock;
509
510 mp->mp_list[end_of_metadata]++;
511 boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
512 if (eblock != ++dblock)
513 break;
514 bh_map->b_size += (1 << inode->i_blkbits);
515 if (boundary)
516 set_buffer_boundary(bh_map);
517 }
518 }
519out_brelse:
520 brelse(bh);
521 return 0;
522}
523
524
525static inline void bmap_lock(struct inode *inode, int create)
526{
527 struct gfs2_inode *ip = GFS2_I(inode);
528 if (create)
529 down_write(&ip->i_rw_mutex);
530 else
531 down_read(&ip->i_rw_mutex);
532}
533
534static inline void bmap_unlock(struct inode *inode, int create)
535{
536 struct gfs2_inode *ip = GFS2_I(inode);
537 if (create)
538 up_write(&ip->i_rw_mutex);
539 else
540 up_read(&ip->i_rw_mutex);
541}
542
543int gfs2_block_map(struct inode *inode, u64 lblock, int create,
544 struct buffer_head *bh)
545{
546 struct metapath mp;
547 int ret;
548
549 bmap_lock(inode, create);
550 ret = gfs2_block_pointers(inode, lblock, create, bh, &mp);
551 bmap_unlock(inode, create);
552 return ret;
553}
554
555int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
556{
557 struct metapath mp;
558 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
559 int ret;
560 int create = *new;
561
562 BUG_ON(!extlen);
563 BUG_ON(!dblock);
564 BUG_ON(!new);
565
566 bh.b_size = 1 << (inode->i_blkbits + 5);
567 bmap_lock(inode, create);
568 ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp);
569 bmap_unlock(inode, create);
570 *extlen = bh.b_size >> inode->i_blkbits;
571 *dblock = bh.b_blocknr;
572 if (buffer_new(&bh))
573 *new = 1;
574 else
575 *new = 0;
576 return ret;
577}
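/*
 * The b_size seeding in gfs2_extent_map() above caps how far the extent
 * probe runs: b_size = 1 << (i_blkbits + 5) is exactly 32 blocks, so
 * *extlen is at most 32. As a worked example (assuming 4096-byte
 * blocks, i_blkbits == 12): b_size = 1 << 17 = 131072 bytes = 32 blocks.
 */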
578
579/**
580 * recursive_scan - recursively scan through the metadata tree of a file
581 * @ip: the inode
582 * @dibh: the dinode buffer
583 * @mp: the path through the metadata to the point to start
584 * @height: the height the recursion is at
585 * @block: the indirect block to look at
586 * @first: 1 if this is the first block
587 * @bc: the call to make for each piece of metadata
588 * @data: data opaque to this function to pass to @bc
589 *
590 * When this is first called @height and @block should be zero and
591 * @first should be 1.
592 *
593 * Returns: errno
594 */
595
596static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
597 struct metapath *mp, unsigned int height,
598 u64 block, int first, block_call_t bc,
599 void *data)
600{
601 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
602 struct buffer_head *bh = NULL;
603 u64 *top, *bottom;
604 u64 bn;
605 int error;
606 int mh_size = sizeof(struct gfs2_meta_header);
607
608 if (!height) {
609 error = gfs2_meta_inode_buffer(ip, &bh);
610 if (error)
611 return error;
612 dibh = bh;
613
614 top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
615 bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
616 } else {
617 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
618 if (error)
619 return error;
620
621 top = (u64 *)(bh->b_data + mh_size) +
622 (first ? mp->mp_list[height] : 0);
623
624 bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
625 }
626
627 error = bc(ip, dibh, bh, top, bottom, height, data);
628 if (error)
629 goto out;
630
631 if (height < ip->i_di.di_height - 1)
632 for (; top < bottom; top++, first = 0) {
633 if (!*top)
634 continue;
635
636 bn = be64_to_cpu(*top);
637
638 error = recursive_scan(ip, dibh, mp, height + 1, bn,
639 first, bc, data);
640 if (error)
641 break;
642 }
643
644out:
645 brelse(bh);
646 return error;
647}
648
649/**
650 * do_strip - Look for a particular layer of the file and strip it off
651 * @ip: the inode
652 * @dibh: the dinode buffer
653 * @bh: A buffer of pointers
654 * @top: The first pointer in the buffer
655 * @bottom: One more than the last pointer
656 * @height: the height this buffer is at
657 * @data: a pointer to a struct strip_mine
658 *
659 * Returns: errno
660 */
661
662static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
663 struct buffer_head *bh, u64 *top, u64 *bottom,
664 unsigned int height, void *data)
665{
666 struct strip_mine *sm = data;
667 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
668 struct gfs2_rgrp_list rlist;
669 u64 bn, bstart;
670 u32 blen;
671 u64 *p;
672 unsigned int rg_blocks = 0;
673 int metadata;
674 unsigned int revokes = 0;
675 int x;
676 int error;
677
678 if (!*top)
679 sm->sm_first = 0;
680
681 if (height != sm->sm_height)
682 return 0;
683
684 if (sm->sm_first) {
685 top++;
686 sm->sm_first = 0;
687 }
688
689 metadata = (height != ip->i_di.di_height - 1);
690 if (metadata)
691 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
692
693 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
694 if (error)
695 return error;
696
697 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
698 bstart = 0;
699 blen = 0;
700
701 for (p = top; p < bottom; p++) {
702 if (!*p)
703 continue;
704
705 bn = be64_to_cpu(*p);
706
707 if (bstart + blen == bn)
708 blen++;
709 else {
710 if (bstart)
711 gfs2_rlist_add(sdp, &rlist, bstart);
712
713 bstart = bn;
714 blen = 1;
715 }
716 }
717
718 if (bstart)
719 gfs2_rlist_add(sdp, &rlist, bstart);
720 else
721 goto out; /* Nothing to do */
722
723 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
724
725 for (x = 0; x < rlist.rl_rgrps; x++) {
726 struct gfs2_rgrpd *rgd;
727 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
728 rg_blocks += rgd->rd_ri.ri_length;
729 }
730
731 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
732 if (error)
733 goto out_rlist;
734
735 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
736 RES_INDIRECT + RES_STATFS + RES_QUOTA,
737 revokes);
738 if (error)
739 goto out_rg_gunlock;
740
741 down_write(&ip->i_rw_mutex);
742
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_trans_add_bh(ip->i_gl, bh, 1);
745
746 bstart = 0;
747 blen = 0;
748
749 for (p = top; p < bottom; p++) {
750 if (!*p)
751 continue;
752
753 bn = be64_to_cpu(*p);
754
755 if (bstart + blen == bn)
756 blen++;
757 else {
758 if (bstart) {
759 if (metadata)
760 gfs2_free_meta(ip, bstart, blen);
761 else
762 gfs2_free_data(ip, bstart, blen);
763 }
764
765 bstart = bn;
766 blen = 1;
767 }
768
769 *p = 0;
770 if (!ip->i_di.di_blocks)
771 gfs2_consist_inode(ip);
772 ip->i_di.di_blocks--;
773 }
774 if (bstart) {
775 if (metadata)
776 gfs2_free_meta(ip, bstart, blen);
777 else
778 gfs2_free_data(ip, bstart, blen);
779 }
780
781 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
782
783 gfs2_dinode_out(&ip->i_di, dibh->b_data);
784
785 up_write(&ip->i_rw_mutex);
786
787 gfs2_trans_end(sdp);
788
789out_rg_gunlock:
790 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
791out_rlist:
792 gfs2_rlist_free(&rlist);
793out:
794 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
795 return error;
796}
797
798/**
799 * do_grow - Make a file look bigger than it is
800 * @ip: the inode
801 * @size: the size to set the file to
802 *
803 * Called with an exclusive lock on @ip.
804 *
805 * Returns: errno
806 */
807
808static int do_grow(struct gfs2_inode *ip, u64 size)
809{
810 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
811 struct gfs2_alloc *al;
812 struct buffer_head *dibh;
813 unsigned int h;
814 int error;
815
816 al = gfs2_alloc_get(ip);
817
818 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
819 if (error)
820 goto out;
821
822 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
823 if (error)
824 goto out_gunlock_q;
825
826 al->al_requested = sdp->sd_max_height + RES_DATA;
827
828 error = gfs2_inplace_reserve(ip);
829 if (error)
830 goto out_gunlock_q;
831
832 error = gfs2_trans_begin(sdp,
833 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
834 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
835 if (error)
836 goto out_ipres;
837
838 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
839 if (gfs2_is_stuffed(ip)) {
840 error = gfs2_unstuff_dinode(ip, NULL);
841 if (error)
842 goto out_end_trans;
843 }
844
845 h = calc_tree_height(ip, size);
846 if (ip->i_di.di_height < h) {
847 down_write(&ip->i_rw_mutex);
848 error = build_height(&ip->i_inode, h);
849 up_write(&ip->i_rw_mutex);
850 if (error)
851 goto out_end_trans;
852 }
853 }
854
855 ip->i_di.di_size = size;
856 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
857
858 error = gfs2_meta_inode_buffer(ip, &dibh);
859 if (error)
860 goto out_end_trans;
861
862 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
863 gfs2_dinode_out(&ip->i_di, dibh->b_data);
864 brelse(dibh);
865
866out_end_trans:
867 gfs2_trans_end(sdp);
868out_ipres:
869 gfs2_inplace_release(ip);
870out_gunlock_q:
871 gfs2_quota_unlock(ip);
872out:
873 gfs2_alloc_put(ip);
874 return error;
875}
876
877
878/**
879 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
880 *
881 * This is partly borrowed from ext3.
882 */
883static int gfs2_block_truncate_page(struct address_space *mapping)
884{
885 struct inode *inode = mapping->host;
886 struct gfs2_inode *ip = GFS2_I(inode);
887 struct gfs2_sbd *sdp = GFS2_SB(inode);
888 loff_t from = inode->i_size;
889 unsigned long index = from >> PAGE_CACHE_SHIFT;
890 unsigned offset = from & (PAGE_CACHE_SIZE-1);
891 unsigned blocksize, iblock, length, pos;
892 struct buffer_head *bh;
893 struct page *page;
894 void *kaddr;
895 int err;
896
897 page = grab_cache_page(mapping, index);
898 if (!page)
899 return 0;
900
901 blocksize = inode->i_sb->s_blocksize;
902 length = blocksize - (offset & (blocksize - 1));
903 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
904
905 if (!page_has_buffers(page))
906 create_empty_buffers(page, blocksize, 0);
907
908 /* Find the buffer that contains "offset" */
909 bh = page_buffers(page);
910 pos = blocksize;
911 while (offset >= pos) {
912 bh = bh->b_this_page;
913 iblock++;
914 pos += blocksize;
915 }
916
917 err = 0;
918
919 if (!buffer_mapped(bh)) {
920 gfs2_get_block(inode, iblock, bh, 0);
921 /* unmapped? It's a hole - nothing to do */
922 if (!buffer_mapped(bh))
923 goto unlock;
924 }
925
926 /* Ok, it's mapped. Make sure it's up-to-date */
927 if (PageUptodate(page))
928 set_buffer_uptodate(bh);
929
930 if (!buffer_uptodate(bh)) {
931 err = -EIO;
932 ll_rw_block(READ, 1, &bh);
933 wait_on_buffer(bh);
934 /* Uhhuh. Read error. Complain and punt. */
935 if (!buffer_uptodate(bh))
936 goto unlock;
937 }
938
939 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
940 gfs2_trans_add_bh(ip->i_gl, bh, 0);
941
942 kaddr = kmap_atomic(page, KM_USER0);
943 memset(kaddr + offset, 0, length);
944 flush_dcache_page(page);
945 kunmap_atomic(kaddr, KM_USER0);
946
947unlock:
948 unlock_page(page);
949 page_cache_release(page);
950 return err;
951}
952
953static int trunc_start(struct gfs2_inode *ip, u64 size)
954{
955 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
956 struct buffer_head *dibh;
957 int journaled = gfs2_is_jdata(ip);
958 int error;
959
960 error = gfs2_trans_begin(sdp,
961 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
962 if (error)
963 return error;
964
965 error = gfs2_meta_inode_buffer(ip, &dibh);
966 if (error)
967 goto out;
968
969 if (gfs2_is_stuffed(ip)) {
970 ip->i_di.di_size = size;
971 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
972 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
973 gfs2_dinode_out(&ip->i_di, dibh->b_data);
974 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
975 error = 1;
976
977 } else {
978 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
979 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
980
981 if (!error) {
982 ip->i_di.di_size = size;
983 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
984 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
985 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
986 gfs2_dinode_out(&ip->i_di, dibh->b_data);
987 }
988 }
989
990 brelse(dibh);
991
992out:
993 gfs2_trans_end(sdp);
994 return error;
995}
996
997static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
998{
999 unsigned int height = ip->i_di.di_height;
1000 u64 lblock;
1001 struct metapath mp;
1002 int error;
1003
1004 if (!size)
1005 lblock = 0;
1006 else
1007 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
1008
1009 find_metapath(ip, lblock, &mp);
1010 gfs2_alloc_get(ip);
1011
1012 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1013 if (error)
1014 goto out;
1015
1016 while (height--) {
1017 struct strip_mine sm;
1018 sm.sm_first = !!size;
1019 sm.sm_height = height;
1020
1021 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1022 if (error)
1023 break;
1024 }
1025
1026 gfs2_quota_unhold(ip);
1027
1028out:
1029 gfs2_alloc_put(ip);
1030 return error;
1031}
1032
1033static int trunc_end(struct gfs2_inode *ip)
1034{
1035 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1036 struct buffer_head *dibh;
1037 int error;
1038
1039 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1040 if (error)
1041 return error;
1042
1043 down_write(&ip->i_rw_mutex);
1044
1045 error = gfs2_meta_inode_buffer(ip, &dibh);
1046 if (error)
1047 goto out;
1048
1049 if (!ip->i_di.di_size) {
1050 ip->i_di.di_height = 0;
1051 ip->i_di.di_goal_meta =
1052 ip->i_di.di_goal_data =
1053 ip->i_num.no_addr;
1054 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1055 }
1056 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1057 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1058
1059 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1060 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1061 brelse(dibh);
1062
1063out:
1064 up_write(&ip->i_rw_mutex);
1065 gfs2_trans_end(sdp);
1066 return error;
1067}
1068
1069/**
1070 * do_shrink - make a file smaller
1071 * @ip: the inode
1072 * @size: the size to make the file
1074 *
1075 * Called with an exclusive lock on @ip.
1076 *
1077 * Returns: errno
1078 */
1079
1080static int do_shrink(struct gfs2_inode *ip, u64 size)
1081{
1082 int error;
1083
1084 error = trunc_start(ip, size);
1085 if (error < 0)
1086 return error;
1087 if (error > 0)
1088 return 0;
1089
1090 error = trunc_dealloc(ip, size);
1091 if (!error)
1092 error = trunc_end(ip);
1093
1094 return error;
1095}
1096
1097/**
1098 * gfs2_truncatei - make a file a given size
1099 * @ip: the inode
1100 * @size: the size to make the file
1102 *
1103 * The file size can grow, shrink, or stay the same size.
1104 *
1105 * Returns: errno
1106 */
1107
1108int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1109{
1110 int error;
1111
1112 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
1113 return -EINVAL;
1114
1115 if (size > ip->i_di.di_size)
1116 error = do_grow(ip, size);
1117 else
1118 error = do_shrink(ip, size);
1119
1120 return error;
1121}
1122
1123int gfs2_truncatei_resume(struct gfs2_inode *ip)
1124{
1125 int error;
1126 error = trunc_dealloc(ip, ip->i_di.di_size);
1127 if (!error)
1128 error = trunc_end(ip);
1129 return error;
1130}
1131
1132int gfs2_file_dealloc(struct gfs2_inode *ip)
1133{
1134 return trunc_dealloc(ip, 0);
1135}
1136
1137/**
1138 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1139 * @ip: the file
1140 * @len: the number of bytes to be written to the file
1141 * @data_blocks: returns the number of data blocks required
1142 * @ind_blocks: returns the number of indirect blocks required
1143 *
1144 */
1145
1146void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1147 unsigned int *data_blocks, unsigned int *ind_blocks)
1148{
1149 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1150 unsigned int tmp;
1151
1152 if (gfs2_is_dir(ip)) {
1153 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1154 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1155 } else {
1156 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1157 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1158 }
1159
1160 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1161 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1162 *ind_blocks += tmp;
1163 }
1164}
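/*
 * Worked example for the reservation above (a sketch; the pointer
 * counts are assumed values for a 4096-byte block size, roughly
 * sd_diptrs ~ 483 and sd_inptrs ~ 509): a 16 MiB write to a regular
 * file gives *data_blocks = (16777216 >> 12) + 3 = 4099. Since
 * 4099 > 483, one loop pass adds DIV_ROUND_UP(4099, 509) = 9 extra
 * indirect blocks on top of the 3 * (sd_max_height - 1) baseline.
 */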
1165
1166/**
1167 * gfs2_write_alloc_required - figure out if a write will require an allocation
1168 * @ip: the file being written to
1169 * @offset: the offset to write to
1170 * @len: the number of bytes being written
1171 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1172 *
1173 * Returns: errno
1174 */
1175
1176int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1177 unsigned int len, int *alloc_required)
1178{
1179 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1180 u64 lblock, lblock_stop, dblock;
1181 u32 extlen;
1182 int new = 0;
1183 int error = 0;
1184
1185 *alloc_required = 0;
1186
1187 if (!len)
1188 return 0;
1189
1190 if (gfs2_is_stuffed(ip)) {
1191 if (offset + len >
1192 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1193 *alloc_required = 1;
1194 return 0;
1195 }
1196
1197 if (gfs2_is_dir(ip)) {
1198 unsigned int bsize = sdp->sd_jbsize;
1199 lblock = offset;
1200 do_div(lblock, bsize);
1201 lblock_stop = offset + len + bsize - 1;
1202 do_div(lblock_stop, bsize);
1203 } else {
1204 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1205 lblock = offset >> shift;
1206 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1207 }
1208
1209 for (; lblock < lblock_stop; lblock += extlen) {
1210 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1211 if (error)
1212 return error;
1213
1214 if (!dblock) {
1215 *alloc_required = 1;
1216 return 0;
1217 }
1218 }
1219
1220 return 0;
1221}
1222
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ac2fd04370dc
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13struct inode;
14struct gfs2_inode;
15struct page;
16
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
20
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required);
30
31#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() because it's good for
31 the daemons to wake up more often than the timeout when unmounting so
32 the user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * Number of daemons can be set by user, with num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..e24af28b1a12
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1957 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_ino is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled, unless it is already at the maximum size, which is hard coded
52 * into GFS2_DIR_MAX_DEPTH; only then are full leaves chained together in
53 * a linked list. (A walk over dirents via de_rec_len is sketched below.)
54 */
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64#include <linux/lm_interface.h>
65
66#include "gfs2.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
82#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
83
84typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
85 u64 leaf_no, void *data);
86typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
87 const struct qstr *name, void *opaque);
88
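/*
 * A minimal sketch of the de_rec_len chaining described at the top of
 * this file. It assumes "buf"/"len" describe one stuffed dinode block,
 * omits the size validation that gfs2_check_dirent() performs, and is
 * illustrative only (not called anywhere below):
 */
static inline unsigned int example_count_dirents(void *buf, unsigned int len)
{
	unsigned int offset = sizeof(struct gfs2_dinode);
	unsigned int count = 0;

	while (offset < len) {
		struct gfs2_dirent *dent = buf + offset;

		/* de_inum.no_addr == 0 marks a deleted (skipped) entry */
		if (dent->de_inum.no_addr)
			count++;
		offset += be16_to_cpu(dent->de_rec_len);
	}
	return count;
}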
89
90int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
91 struct buffer_head **bhp)
92{
93 struct buffer_head *bh;
94
95 bh = gfs2_meta_new(ip->i_gl, block);
96 gfs2_trans_add_bh(ip->i_gl, bh, 1);
97 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
98 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
99 *bhp = bh;
100 return 0;
101}
102
103static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
104 struct buffer_head **bhp)
105{
106 struct buffer_head *bh;
107 int error;
108
109 error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
110 if (error)
111 return error;
112 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
113 brelse(bh);
114 return -EIO;
115 }
116 *bhp = bh;
117 return 0;
118}
119
120static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
121 unsigned int offset, unsigned int size)
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 u64 offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 u64 lblock, dblock;
159 u32 extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new = 0;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215
216 buf += amount;
217 copied += amount;
218 lblock++;
219 dblock++;
220 extlen--;
221
222 o = sizeof(struct gfs2_meta_header);
223 }
224
225out:
226 error = gfs2_meta_inode_buffer(ip, &dibh);
227 if (error)
228 return error;
229
230 if (ip->i_di.di_size < offset + copied)
231 ip->i_di.di_size = offset + copied;
232 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
233
234 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
235 gfs2_dinode_out(&ip->i_di, dibh->b_data);
236 brelse(dibh);
237
238 return copied;
239fail:
240 if (copied)
241 goto out;
242 return error;
243}
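/*
 * Worked example of the offset split above (assuming a 4096-byte block
 * and a 24-byte gfs2_meta_header, so sdp->sd_jbsize == 4072): for
 * offset 10000, do_div() leaves lblock = 2 and returns
 * 10000 - 2 * 4072 = 1856, so the copy starts at byte
 * o = 1856 + 24 = 1880 of journaled data block 2.
 */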
244
245static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
246 u64 offset, unsigned int size)
247{
248 struct buffer_head *dibh;
249 int error;
250
251 error = gfs2_meta_inode_buffer(ip, &dibh);
252 if (!error) {
253 offset += sizeof(struct gfs2_dinode);
254 memcpy(buf, dibh->b_data + offset, size);
255 brelse(dibh);
256 }
257
258 return (error) ? error : size;
259}
260
261
262/**
263 * gfs2_dir_read_data - Read data from a directory inode
264 * @ip: The GFS2 Inode
265 * @buf: The buffer to place result into
266 * @offset: File offset to begin reading from
267 * @size: Amount of data to transfer
268 *
269 * Returns: The amount of data actually copied or the error
270 */
271static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
272 unsigned int size, unsigned ra)
273{
274 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
275 u64 lblock, dblock;
276 u32 extlen = 0;
277 unsigned int o;
278 int copied = 0;
279 int error = 0;
280
281 if (offset >= ip->i_di.di_size)
282 return 0;
283
284 if (offset + size > ip->i_di.di_size)
285 size = ip->i_di.di_size - offset;
286
287 if (!size)
288 return 0;
289
290 if (gfs2_is_stuffed(ip))
291 return gfs2_dir_read_stuffed(ip, buf, offset, size);
292
293 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
294 return -EINVAL;
295
296 lblock = offset;
297 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
298
299 while (copied < size) {
300 unsigned int amount;
301 struct buffer_head *bh;
302 int new;
303
304 amount = size - copied;
305 if (amount > sdp->sd_sb.sb_bsize - o)
306 amount = sdp->sd_sb.sb_bsize - o;
307
308 if (!extlen) {
309 new = 0;
310 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
311 &dblock, &extlen);
312 if (error || !dblock)
313 goto fail;
314 BUG_ON(extlen < 1);
315 if (!ra)
316 extlen = 1;
317 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
318 } else {
319 error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
320 if (error)
321 goto fail;
322 }
323 error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
324 if (error) {
325 brelse(bh);
326 goto fail;
327 }
328 dblock++;
329 extlen--;
330 memcpy(buf, bh->b_data + o, amount);
331 brelse(bh);
332 buf += amount;
333 copied += amount;
334 lblock++;
335 o = sizeof(struct gfs2_meta_header);
336 }
337
338 return copied;
339fail:
340 return (copied) ? copied : error;
341}
342
343static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
344 const struct qstr *name, int ret)
345{
346 if (dent->de_inum.no_addr != 0 &&
347 be32_to_cpu(dent->de_hash) == name->hash &&
348 be16_to_cpu(dent->de_name_len) == name->len &&
349 memcmp(dent+1, name->name, name->len) == 0)
350 return ret;
351 return 0;
352}
353
354static int gfs2_dirent_find(const struct gfs2_dirent *dent,
355 const struct qstr *name,
356 void *opaque)
357{
358 return __gfs2_dirent_find(dent, name, 1);
359}
360
361static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
362 const struct qstr *name,
363 void *opaque)
364{
365 return __gfs2_dirent_find(dent, name, 2);
366}
367
368/*
369 * name->name holds ptr to start of block.
370 * name->len holds size of block.
371 */
372static int gfs2_dirent_last(const struct gfs2_dirent *dent,
373 const struct qstr *name,
374 void *opaque)
375{
376 const char *start = name->name;
377 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
378 if (name->len == (end - start))
379 return 1;
380 return 0;
381}
382
383static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
384 const struct qstr *name,
385 void *opaque)
386{
387 unsigned required = GFS2_DIRENT_SIZE(name->len);
388 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
389 unsigned totlen = be16_to_cpu(dent->de_rec_len);
390
391 if (!dent->de_inum.no_addr)
392 actual = GFS2_DIRENT_SIZE(0);
393 if (totlen - actual >= required)
394 return 1;
395 return 0;
396}
397
398struct dirent_gather {
399 const struct gfs2_dirent **pdent;
400 unsigned offset;
401};
402
403static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
404 const struct qstr *name,
405 void *opaque)
406{
407 struct dirent_gather *g = opaque;
408 if (dent->de_inum.no_addr) {
409 g->pdent[g->offset++] = dent;
410 }
411 return 0;
412}
413
414/*
415 * Other possible things to check:
416 * - Inode located within filesystem size (and on valid block)
417 * - Valid directory entry type
418 * Not sure how heavy-weight we want to make this... could also check
419 * that the hash is correct, for example, but that would take a lot of extra time.
420 * For now the most important thing is to check that the various sizes
421 * are correct.
422 */
423static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
424 unsigned int size, unsigned int len, int first)
425{
426 const char *msg = "gfs2_dirent too small";
427 if (unlikely(size < sizeof(struct gfs2_dirent)))
428 goto error;
429 msg = "gfs2_dirent misaligned";
430 if (unlikely(offset & 0x7))
431 goto error;
432 msg = "gfs2_dirent points beyond end of block";
433 if (unlikely(offset + size > len))
434 goto error;
435 msg = "zero inode number";
436 if (unlikely(!first && !dent->de_inum.no_addr))
437 goto error;
438 msg = "name length is greater than space in dirent";
439 if (dent->de_inum.no_addr &&
440 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
441 size))
442 goto error;
443 return 0;
444error:
445 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
446 first ? "first in block" : "not first in block");
447 return -EIO;
448}
449
450static int gfs2_dirent_offset(const void *buf)
451{
452 const struct gfs2_meta_header *h = buf;
453 int offset;
454
455 BUG_ON(buf == NULL);
456
457 switch(be32_to_cpu(h->mh_type)) {
458 case GFS2_METATYPE_LF:
459 offset = sizeof(struct gfs2_leaf);
460 break;
461 case GFS2_METATYPE_DI:
462 offset = sizeof(struct gfs2_dinode);
463 break;
464 default:
465 goto wrong_type;
466 }
467 return offset;
468wrong_type:
469 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
470 be32_to_cpu(h->mh_type));
471 return -1;
472}
473
474static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
475 unsigned int len, gfs2_dscan_t scan,
476 const struct qstr *name,
477 void *opaque)
478{
479 struct gfs2_dirent *dent, *prev;
480 unsigned offset;
481 unsigned size;
482 int ret = 0;
483
484 ret = gfs2_dirent_offset(buf);
485 if (ret < 0)
486 goto consist_inode;
487
488 offset = ret;
489 prev = NULL;
490 dent = buf + offset;
491 size = be16_to_cpu(dent->de_rec_len);
492 if (gfs2_check_dirent(dent, offset, size, len, 1))
493 goto consist_inode;
494 do {
495 ret = scan(dent, name, opaque);
496 if (ret)
497 break;
498 offset += size;
499 if (offset == len)
500 break;
501 prev = dent;
502 dent = buf + offset;
503 size = be16_to_cpu(dent->de_rec_len);
504 if (gfs2_check_dirent(dent, offset, size, len, 0))
505 goto consist_inode;
506 } while(1);
507
508 switch(ret) {
509 case 0:
510 return NULL;
511 case 1:
512 return dent;
513 case 2:
514 return prev ? prev : dent;
515 default:
516 BUG_ON(ret > 0);
517 return ERR_PTR(ret);
518 }
519
520consist_inode:
521 gfs2_consist_inode(GFS2_I(inode));
522 return ERR_PTR(-EIO);
523}
524
525
526/**
527 * dirent_first - Return the first dirent
528 * @dip: the directory
529 * @bh: The buffer
530 * @dent: Pointer to list of dirents
531 *
532 * Return the first dirent, whether bh points to a leaf or a stuffed dinode
533 *
534 * Returns: IS_LEAF, IS_DINODE, or -errno
535 */
536
537static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
538 struct gfs2_dirent **dent)
539{
540 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
541
542 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
543 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
544 return -EIO;
545 *dent = (struct gfs2_dirent *)(bh->b_data +
546 sizeof(struct gfs2_leaf));
547 return IS_LEAF;
548 } else {
549 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
550 return -EIO;
551 *dent = (struct gfs2_dirent *)(bh->b_data +
552 sizeof(struct gfs2_dinode));
553 return IS_DINODE;
554 }
555}
556
557static int dirent_check_reclen(struct gfs2_inode *dip,
558 const struct gfs2_dirent *d, const void *end_p)
559{
560 const void *ptr = d;
561 u16 rec_len = be16_to_cpu(d->de_rec_len);
562
563 if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
564 goto broken;
565 ptr += rec_len;
566 if (ptr < end_p)
567 return rec_len;
568 if (ptr == end_p)
569 return -ENOENT;
570broken:
571 gfs2_consist_inode(dip);
572 return -EIO;
573}
574
575/**
576 * dirent_next - Next dirent
577 * @dip: the directory
578 * @bh: The buffer
579 * @dent: Pointer to list of dirents
580 *
581 * Returns: 0 on success, error code otherwise
582 */
583
584static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
585 struct gfs2_dirent **dent)
586{
587 struct gfs2_dirent *cur = *dent, *tmp;
588 char *bh_end = bh->b_data + bh->b_size;
589 int ret;
590
591 ret = dirent_check_reclen(dip, cur, bh_end);
592 if (ret < 0)
593 return ret;
594
595 tmp = (void *)cur + ret;
596 ret = dirent_check_reclen(dip, tmp, bh_end);
597 if (ret == -EIO)
598 return ret;
599
600 /* Only the first dent could ever have de_inum.no_addr == 0 */
601 if (!tmp->de_inum.no_addr) {
602 gfs2_consist_inode(dip);
603 return -EIO;
604 }
605
606 *dent = tmp;
607 return 0;
608}
609
610/**
611 * dirent_del - Delete a dirent
612 * @dip: The GFS2 inode
613 * @bh: The buffer
614 * @prev: The previous dirent
615 * @cur: The current dirent
616 *
617 */
618
619static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
620 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
621{
622 u16 cur_rec_len, prev_rec_len;
623
624 if (!cur->de_inum.no_addr) {
625 gfs2_consist_inode(dip);
626 return;
627 }
628
629 gfs2_trans_add_bh(dip->i_gl, bh, 1);
630
631 /* If there is no prev entry, this is the first entry in the block.
632 The de_rec_len is already as big as it needs to be. Just zero
633 out the inode number and return. */
634
635 if (!prev) {
636 cur->de_inum.no_addr = 0; /* No endianness worries */
637 return;
638 }
639
640 /* Combine this dirent with the previous one. */
641
642 prev_rec_len = be16_to_cpu(prev->de_rec_len);
643 cur_rec_len = be16_to_cpu(cur->de_rec_len);
644
645 if ((char *)prev + prev_rec_len != (char *)cur)
646 gfs2_consist_inode(dip);
647 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
648 gfs2_consist_inode(dip);
649
650 prev_rec_len += cur_rec_len;
651 prev->de_rec_len = cpu_to_be16(prev_rec_len);
652}
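
A worked example of the coalescing path above, with illustrative numbers: if prev starts at byte 0 of the block with de_rec_len = 48 and cur immediately follows at byte 48 with de_rec_len = 72, deleting cur just sets prev->de_rec_len = cpu_to_be16(48 + 72) = 120, so prev's record now spans the dead space and no bytes are moved. Had cur been first in its block, only its inode number would have been zeroed, leaving a reusable hole for gfs2_dirent_find_space() to find.
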
653
654/*
655 * Takes as an argument the dent from which to grab space. Returns the
656 * newly created dent.
657 */
658static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
659 struct gfs2_dirent *dent,
660 const struct qstr *name,
661 struct buffer_head *bh)
662{
663 struct gfs2_inode *ip = GFS2_I(inode);
664 struct gfs2_dirent *ndent;
665 unsigned offset = 0, totlen;
666
667 if (dent->de_inum.no_addr)
668 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
669 totlen = be16_to_cpu(dent->de_rec_len);
670 BUG_ON(offset + name->len > totlen);
671 gfs2_trans_add_bh(ip->i_gl, bh, 1);
672 ndent = (struct gfs2_dirent *)((char *)dent + offset);
673 dent->de_rec_len = cpu_to_be16(offset);
674 gfs2_qstr2dirent(name, totlen - offset, ndent);
675 return ndent;
676}
677
678static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
679 struct buffer_head *bh,
680 const struct qstr *name)
681{
682 struct gfs2_dirent *dent;
683 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
684 gfs2_dirent_find_space, name, NULL);
685 if (!dent || IS_ERR(dent))
686 return dent;
687 return gfs2_init_dirent(inode, dent, name, bh);
688}
689
690static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
691 struct buffer_head **bhp)
692{
693 int error;
694
695 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
696 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
697 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
698 error = -EIO;
699 }
700
701 return error;
702}
703
704/**
705 * get_leaf_nr - Get a leaf number associated with the index
706 * @dip: The GFS2 inode
707 * @index: hash table index of the leaf pointer to look up
708 * @leaf_out: filled in with the leaf block number
709 *
710 * Returns: 0 on success, error code otherwise
711 */
712
713static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
714 u64 *leaf_out)
715{
716 u64 leaf_no;
717 int error;
718
719 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
720 index * sizeof(u64),
721 sizeof(u64), 0);
722 if (error != sizeof(u64))
723 return (error < 0) ? error : -EIO;
724
725 *leaf_out = be64_to_cpu(leaf_no);
726
727 return 0;
728}
729
730static int get_first_leaf(struct gfs2_inode *dip, u32 index,
731 struct buffer_head **bh_out)
732{
733 u64 leaf_no;
734 int error;
735
736 error = get_leaf_nr(dip, index, &leaf_no);
737 if (!error)
738 error = get_leaf(dip, leaf_no, bh_out);
739
740 return error;
741}
742
743static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
744 const struct qstr *name,
745 gfs2_dscan_t scan,
746 struct buffer_head **pbh)
747{
748 struct buffer_head *bh;
749 struct gfs2_dirent *dent;
750 struct gfs2_inode *ip = GFS2_I(inode);
751 int error;
752
753 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
754 struct gfs2_leaf *leaf;
755 unsigned hsize = 1 << ip->i_di.di_depth;
756 unsigned index;
757 u64 ln;
758 if (hsize * sizeof(u64) != ip->i_di.di_size) {
759 gfs2_consist_inode(ip);
760 return ERR_PTR(-EIO);
761 }
762
763 index = name->hash >> (32 - ip->i_di.di_depth);
764 error = get_first_leaf(ip, index, &bh);
765 if (error)
766 return ERR_PTR(error);
767 do {
768 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
769 scan, name, NULL);
770 if (dent)
771 goto got_dent;
772 leaf = (struct gfs2_leaf *)bh->b_data;
773 ln = be64_to_cpu(leaf->lf_next);
774 brelse(bh);
775 if (!ln)
776 break;
777
778 error = get_leaf(ip, ln, &bh);
779 } while(!error);
780
781 return error ? ERR_PTR(error) : NULL;
782 }
783
784
785 error = gfs2_meta_inode_buffer(ip, &bh);
786 if (error)
787 return ERR_PTR(error);
788 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
789got_dent:
790 if (unlikely(dent == NULL || IS_ERR(dent))) {
791 brelse(bh);
792 bh = NULL;
793 }
794 *pbh = bh;
795 return dent;
796}
797
798static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
799{
800 struct gfs2_inode *ip = GFS2_I(inode);
801 u64 bn = gfs2_alloc_meta(ip);
802 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
803 struct gfs2_leaf *leaf;
804 struct gfs2_dirent *dent;
805 struct qstr name = { .name = "", .len = 0, .hash = 0 };
806 if (!bh)
807 return NULL;
808
809 gfs2_trans_add_bh(ip->i_gl, bh, 1);
810 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
811 leaf = (struct gfs2_leaf *)bh->b_data;
812 leaf->lf_depth = cpu_to_be16(depth);
813 leaf->lf_entries = 0;
814 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
815 leaf->lf_next = 0;
816 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
817 dent = (struct gfs2_dirent *)(leaf+1);
818 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
819 *pbh = bh;
820 return leaf;
821}
822
823/**
824 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
825 * @inode: The directory inode to be converted
826 *
827 * Returns: 0 on success, error code otherwise
828 */
829
830static int dir_make_exhash(struct inode *inode)
831{
832 struct gfs2_inode *dip = GFS2_I(inode);
833 struct gfs2_sbd *sdp = GFS2_SB(inode);
834 struct gfs2_dirent *dent;
835 struct qstr args;
836 struct buffer_head *bh, *dibh;
837 struct gfs2_leaf *leaf;
838 int y;
839 u32 x;
840 u64 *lp, bn;
841 int error;
842
843 error = gfs2_meta_inode_buffer(dip, &dibh);
844 if (error)
845 return error;
846
847 /* Turn over a new leaf */
848
849 leaf = new_leaf(inode, &bh, 0);
850 if (!leaf) {	/* release dibh on this error path too */
851 brelse(dibh); return -ENOSPC; }
852 bn = bh->b_blocknr;
853
854 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
855 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
856
857 /* Copy dirents */
858
859 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
860 sizeof(struct gfs2_dinode));
861
862 /* Find last entry */
863
864 x = 0;
865 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
866 sizeof(struct gfs2_leaf);
867 args.name = bh->b_data;
868 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
869 gfs2_dirent_last, &args, NULL);
870 if (!dent) {
871 brelse(bh);
872 brelse(dibh);
873 return -EIO;
874 }
875 if (IS_ERR(dent)) {
876 brelse(bh);
877 brelse(dibh);
878 return PTR_ERR(dent);
879 }
880
881 /* Adjust the last dirent's record length
882 (Remember that dent still points to the last entry.) */
883
884 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
885 sizeof(struct gfs2_dinode) -
886 sizeof(struct gfs2_leaf));
887
888 brelse(bh);
889
890 /* We're done with the new leaf block, now setup the new
891 hash table. */
892
893 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
894 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
895
896 lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
897
898 for (x = sdp->sd_hash_ptrs; x--; lp++)
899 *lp = cpu_to_be64(bn);
900
901 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
902 dip->i_di.di_blocks++;
903 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
904 dip->i_di.di_payload_format = 0;
905
906 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
907 dip->i_di.di_depth = y;
908
909 gfs2_dinode_out(&dip->i_di, dibh->b_data);
910
911 brelse(dibh);
912
913 return 0;
914}
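
To make the sizing above concrete, a worked example assuming 4096-byte blocks: sd_hash_ptrs is half a block's worth of u64 pointers, 4096 / 2 / 8 = 256, so the fresh hash table occupies di_size = 256 * 8 = sb_bsize / 2 = 2048 bytes, with every slot initially pointing at the single new leaf. The depth loop then yields di_depth = log2(256) = 8 (x halves nine times from 256 down to 0 while y climbs from -1 to 8), meaning the top 8 bits of a name hash select a table slot.
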
915
916/**
917 * dir_split_leaf - Split a leaf block into two
918 * @inode: The directory inode to work on
919 * @name: The new entry's name; its hash selects the leaf to be split
920 *
921 *
922 * Returns: 0 on success, error code on failure
923 */
924
925static int dir_split_leaf(struct inode *inode, const struct qstr *name)
926{
927 struct gfs2_inode *dip = GFS2_I(inode);
928 struct buffer_head *nbh, *obh, *dibh;
929 struct gfs2_leaf *nleaf, *oleaf;
930 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
931 u32 start, len, half_len, divider;
932 u64 bn, *lp, leaf_no;
933 u32 index;
934 int x, moved = 0;
935 int error;
936
937 index = name->hash >> (32 - dip->i_di.di_depth);
938 error = get_leaf_nr(dip, index, &leaf_no);
939 if (error)
940 return error;
941
942 /* Get the old leaf block */
943 error = get_leaf(dip, leaf_no, &obh);
944 if (error)
945 return error;
946
947 oleaf = (struct gfs2_leaf *)obh->b_data;
948 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
949 brelse(obh);
950 return 1; /* can't split */
951 }
952
953 gfs2_trans_add_bh(dip->i_gl, obh, 1);
954
955 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
956 if (!nleaf) {
957 brelse(obh);
958 return -ENOSPC;
959 }
960 bn = nbh->b_blocknr;
961
962 /* Compute the start and len of leaf pointers in the hash table. */
963 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
964 half_len = len >> 1;
965 if (!half_len) {
966 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
967 gfs2_consist_inode(dip);
968 error = -EIO;
969 goto fail_brelse;
970 }
971
972 start = (index & ~(len - 1));
973
974 /* Change the pointers.
975 Don't bother distinguishing stuffed from non-stuffed.
976 This code is complicated enough already. */
977 lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
978
979 for (x = 0; x < half_len; x++)
980 lp[x] = cpu_to_be64(bn);
981
982 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
983 half_len * sizeof(u64));
984 if (error != half_len * sizeof(u64)) {
985 if (error >= 0)
986 error = -EIO;
987 goto fail_lpfree;
988 }
989
990 kfree(lp);
991
992 /* Compute the divider */
993 divider = (start + half_len) << (32 - dip->i_di.di_depth);
994
995 /* Copy the entries */
996 dirent_first(dip, obh, &dent);
997
998 do {
999 next = dent;
1000 if (dirent_next(dip, obh, &next))
1001 next = NULL;
1002
1003 if (dent->de_inum.no_addr &&
1004 be32_to_cpu(dent->de_hash) < divider) {
1005 struct qstr str;
1006 str.name = (char*)(dent+1);
1007 str.len = be16_to_cpu(dent->de_name_len);
1008 str.hash = be32_to_cpu(dent->de_hash);
1009 new = gfs2_dirent_alloc(inode, nbh, &str);
1010 if (IS_ERR(new)) {
1011 error = PTR_ERR(new);
1012 break;
1013 }
1014
1015 new->de_inum = dent->de_inum; /* No endian worries */
1016 new->de_type = dent->de_type; /* No endian worries */
1017 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1018
1019 dirent_del(dip, obh, prev, dent);
1020
1021 if (!oleaf->lf_entries)
1022 gfs2_consist_inode(dip);
1023 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1024
1025 if (!prev)
1026 prev = dent;
1027
1028 moved = 1;
1029 } else {
1030 prev = dent;
1031 }
1032 dent = next;
1033 } while (dent);
1034
1035 oleaf->lf_depth = nleaf->lf_depth;
1036
1037 error = gfs2_meta_inode_buffer(dip, &dibh);
1038 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1039 dip->i_di.di_blocks++;
1040 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1041 brelse(dibh);
1042 }
1043
1044 brelse(obh);
1045 brelse(nbh);
1046
1047 return error;
1048
1049fail_lpfree:
1050 kfree(lp);
1051
1052fail_brelse:
1053 brelse(obh);
1054 brelse(nbh);
1055 return error;
1056}
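
A worked example of the split arithmetic, with illustrative numbers: suppose di_depth = 8 and the old leaf sits at lf_depth = 6, so len = 1 << (8 - 6) = 4 hash-table slots point at it and half_len = 2. For index = 13 the run begins at start = 13 & ~3 = 12; slots 12 and 13 are rewritten to point at the new leaf while 14 and 15 keep the old one. The divider, (12 + 2) << (32 - 8) = 14 << 24, then decides migration: entries whose full 32-bit hash falls below it belong to slots 12 and 13 and move to the new leaf; the rest stay put.
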
1057
1058/**
1059 * dir_double_exhash - Double size of ExHash table
1060 * @dip: The GFS2 dinode
1061 *
1062 * Returns: 0 on success, error code on failure
1063 */
1064
1065static int dir_double_exhash(struct gfs2_inode *dip)
1066{
1067 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1068 struct buffer_head *dibh;
1069 u32 hsize;
1070 u64 *buf;
1071 u64 *from, *to;
1072 u64 block;
1073 int x;
1074 int error = 0;
1075
1076 hsize = 1 << dip->i_di.di_depth;
1077 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1078 gfs2_consist_inode(dip);
1079 return -EIO;
1080 }
1081
1082 /* Allocate both the "from" and "to" buffers in one big chunk */
1083
1084 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1085
1086 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1087 error = gfs2_dir_read_data(dip, (char *)buf,
1088 block * sdp->sd_hash_bsize,
1089 sdp->sd_hash_bsize, 1);
1090 if (error != sdp->sd_hash_bsize) {
1091 if (error >= 0)
1092 error = -EIO;
1093 goto fail;
1094 }
1095
1096 from = buf;
1097 to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
1098
1099 for (x = sdp->sd_hash_ptrs; x--; from++) {
1100 *to++ = *from; /* No endianness worries */
1101 *to++ = *from;
1102 }
1103
1104 error = gfs2_dir_write_data(dip,
1105 (char *)buf + sdp->sd_hash_bsize,
1106 block * sdp->sd_sb.sb_bsize,
1107 sdp->sd_sb.sb_bsize);
1108 if (error != sdp->sd_sb.sb_bsize) {
1109 if (error >= 0)
1110 error = -EIO;
1111 goto fail;
1112 }
1113 }
1114
1115 kfree(buf);
1116
1117 error = gfs2_meta_inode_buffer(dip, &dibh);
1118 if (!gfs2_assert_withdraw(sdp, !error)) {
1119 dip->i_di.di_depth++;
1120 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1121 brelse(dibh);
1122 }
1123
1124 return error;
1125
1126fail:
1127 kfree(buf);
1128 return error;
1129}
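
The inner copy loop above writes each old pointer out twice, which is exactly what one extra depth bit requires: a table [A, B] at depth 1 becomes [A, A, B, B] at depth 2, so a hash whose top bit selected A now selects one of the two A slots with its top two bits. A minimal sketch of that step:

#include <stdint.h>

/* Double a table of n pointers from src into dst (sized 2 * n). */
static void ex_double_table(const uint64_t *src, uint64_t *dst, unsigned n)
{
	unsigned i;
	for (i = 0; i < n; i++) {
		dst[2 * i] = src[i];		/* each old slot... */
		dst[2 * i + 1] = src[i];	/* ...now covers two new ones */
	}
}
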
1130
1131/**
1132 * compare_dents - compare directory entries by hash value
1133 * @a: first dent
1134 * @b: second dent
1135 *
1136 * When comparing the hash entries of @a to @b:
1137 * gt: returns 1
1138 * lt: returns -1
1139 * eq: returns 0
1140 */
1141
1142static int compare_dents(const void *a, const void *b)
1143{
1144 const struct gfs2_dirent *dent_a, *dent_b;
1145 u32 hash_a, hash_b;
1146 int ret = 0;
1147
1148 dent_a = *(const struct gfs2_dirent **)a;
1149 hash_a = be32_to_cpu(dent_a->de_hash);
1150
1151 dent_b = *(const struct gfs2_dirent **)b;
1152 hash_b = be32_to_cpu(dent_b->de_hash);
1153
1154 if (hash_a > hash_b)
1155 ret = 1;
1156 else if (hash_a < hash_b)
1157 ret = -1;
1158 else {
1159 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1160 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1161
1162 if (len_a > len_b)
1163 ret = 1;
1164 else if (len_a < len_b)
1165 ret = -1;
1166 else
1167 ret = memcmp(dent_a + 1, dent_b + 1, len_a);
1168 }
1169
1170 return ret;
1171}
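
compare_dents() is shaped for the kernel's sort() over an array of dirent pointers; the same ordering (hash, then name length, then the name bytes) in a hedged userspace form, using a simplified entry type and qsort(3):

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct ex_ent { uint32_t hash; uint16_t name_len; const char *name; };

static int ex_cmp(const void *a, const void *b)
{
	const struct ex_ent *ea = *(const struct ex_ent * const *)a;
	const struct ex_ent *eb = *(const struct ex_ent * const *)b;

	if (ea->hash != eb->hash)
		return ea->hash > eb->hash ? 1 : -1;
	if (ea->name_len != eb->name_len)
		return ea->name_len > eb->name_len ? 1 : -1;
	return memcmp(ea->name, eb->name, ea->name_len);
}

/* usage: qsort(ptrs, nr, sizeof(struct ex_ent *), ex_cmp); */
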
1172
1173/**
1174 * do_filldir_main - read out directory entries
1175 * @dip: The GFS2 inode
1176 * @offset: The offset in the file to read from
1177 * @opaque: opaque data to pass to filldir
1178 * @filldir: The function to pass entries to
1179 * @darr: an array of struct gfs2_dirent pointers to read
1180 * @entries: the number of entries in darr
1181 * @copied: pointer to int that's non-zero if an entry has been copied out
1182 *
1183 * Jump through some hoops to make sure that if there are hash collisions,
1184 * they are read out at the beginning of a buffer. We want to minimize
1185 * the possibility that they will fall into different readdir buffers or
1186 * that someone will want to seek to that location.
1187 *
1188 * Returns: errno, >0 on exception from filldir
1189 */
1190
1191static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1192 void *opaque, gfs2_filldir_t filldir,
1193 const struct gfs2_dirent **darr, u32 entries,
1194 int *copied)
1195{
1196 const struct gfs2_dirent *dent, *dent_next;
1197 struct gfs2_inum inum;
1198 u64 off, off_next;
1199 unsigned int x, y;
1200 int run = 0;
1201 int error = 0;
1202
1203 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1204
1205 dent_next = darr[0];
1206 off_next = be32_to_cpu(dent_next->de_hash);
1207 off_next = gfs2_disk_hash2offset(off_next);
1208
1209 for (x = 0, y = 1; x < entries; x++, y++) {
1210 dent = dent_next;
1211 off = off_next;
1212
1213 if (y < entries) {
1214 dent_next = darr[y];
1215 off_next = be32_to_cpu(dent_next->de_hash);
1216 off_next = gfs2_disk_hash2offset(off_next);
1217
1218 if (off < *offset)
1219 continue;
1220 *offset = off;
1221
1222 if (off_next == off) {
1223 if (*copied && !run)
1224 return 1;
1225 run = 1;
1226 } else
1227 run = 0;
1228 } else {
1229 if (off < *offset)
1230 continue;
1231 *offset = off;
1232 }
1233
1234 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1235
1236 error = filldir(opaque, (const char *)(dent + 1),
1237 be16_to_cpu(dent->de_name_len),
1238 off, &inum,
1239 be16_to_cpu(dent->de_type));
1240 if (error)
1241 return 1;
1242
1243 *copied = 1;
1244 }
1245
1246 /* Increment *offset by one, so that the next time we come into this
1247 function we get the next entry instead of the last one in the
1248 current leaf */
1249
1250 (*offset)++;
1251
1252 return 0;
1253}
1254
1255static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1256 gfs2_filldir_t filldir, int *copied,
1257 unsigned *depth, u64 leaf_no)
1258{
1259 struct gfs2_inode *ip = GFS2_I(inode);
1260 struct buffer_head *bh;
1261 struct gfs2_leaf *lf;
1262 unsigned entries = 0;
1263 unsigned leaves = 0;
1264 const struct gfs2_dirent **darr, *dent;
1265 struct dirent_gather g;
1266 struct buffer_head **larr;
1267 int leaf = 0;
1268 int error, i;
1269 u64 lfn = leaf_no;
1270
1271 do {
1272 error = get_leaf(ip, lfn, &bh);
1273 if (error)
1274 goto out;
1275 lf = (struct gfs2_leaf *)bh->b_data;
1276 if (leaves == 0)
1277 *depth = be16_to_cpu(lf->lf_depth);
1278 entries += be16_to_cpu(lf->lf_entries);
1279 leaves++;
1280 lfn = be64_to_cpu(lf->lf_next);
1281 brelse(bh);
1282 } while(lfn);
1283
1284 if (!entries)
1285 return 0;
1286
1287 error = -ENOMEM;
1288 larr = vmalloc((leaves + entries) * sizeof(void *));
1289 if (!larr)
1290 goto out;
1291 darr = (const struct gfs2_dirent **)(larr + leaves);
1292 g.pdent = darr;
1293 g.offset = 0;
1294 lfn = leaf_no;
1295
1296 do {
1297 error = get_leaf(ip, lfn, &bh);
1298 if (error)
1299 goto out_kfree;
1300 lf = (struct gfs2_leaf *)bh->b_data;
1301 lfn = be64_to_cpu(lf->lf_next);
1302 if (lf->lf_entries) {
1303 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1304 gfs2_dirent_gather, NULL, &g);
1305 error = PTR_ERR(dent);
1306 if (IS_ERR(dent)) {
1307 goto out_kfree;
1308 }
1309 error = 0;
1310 larr[leaf++] = bh;
1311 } else {
1312 brelse(bh);
1313 }
1314 } while(lfn);
1315
1316 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1317 entries, copied);
1318out_kfree:
1319 for(i = 0; i < leaf; i++)
1320 brelse(larr[i]);
1321 vfree(larr);
1322out:
1323 return error;
1324}
1325
1326/**
1327 * dir_e_read - Reads the entries from a directory into a filldir buffer
1328 * @inode: the directory inode
1329 * @offset: the hash of the last entry read shifted to the right once
1330 * @opaque: opaque data to pass through to the filldir function
1331 * @filldir: the filldir function to use
1332 *
1333 * Returns: errno
1334 */
1335
1336static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1337 gfs2_filldir_t filldir)
1338{
1339 struct gfs2_inode *dip = GFS2_I(inode);
1340 struct gfs2_sbd *sdp = GFS2_SB(inode);
1341 u32 hsize, len = 0;
1342 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1343 u32 hash, index;
1344 u64 *lp;
1345 int copied = 0;
1346 int error = 0;
1347 unsigned depth = 0;
1348
1349 hsize = 1 << dip->i_di.di_depth;
1350 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1351 gfs2_consist_inode(dip);
1352 return -EIO;
1353 }
1354
1355 hash = gfs2_dir_offset2hash(*offset);
1356 index = hash >> (32 - dip->i_di.di_depth);
1357
1358 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1359 if (!lp)
1360 return -ENOMEM;
1361
1362 while (index < hsize) {
1363 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1364 ht_offset = index - lp_offset;
1365
1366 if (ht_offset_cur != ht_offset) {
1367 error = gfs2_dir_read_data(dip, (char *)lp,
1368 ht_offset * sizeof(u64),
1369 sdp->sd_hash_bsize, 1);
1370 if (error != sdp->sd_hash_bsize) {
1371 if (error >= 0)
1372 error = -EIO;
1373 goto out;
1374 }
1375 ht_offset_cur = ht_offset;
1376 }
1377
1378 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1379 &copied, &depth,
1380 be64_to_cpu(lp[lp_offset]));
1381 if (error)
1382 break;
1383
1384 len = 1 << (dip->i_di.di_depth - depth);
1385 index = (index & ~(len - 1)) + len;
1386 }
1387
1388out:
1389 kfree(lp);
1390 if (error > 0)
1391 error = 0;
1392 return error;
1393}
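
dir_e_read() depends on the readdir offset being the 32-bit entry hash shifted right once (see the @offset note above), with the inverse used when resuming a scan. A sketch of the assumed shape of the two helpers:

#include <stdint.h>

/* assumed shape of gfs2_disk_hash2offset()/gfs2_dir_offset2hash() */
static uint64_t ex_hash2offset(uint32_t hash) { return (uint64_t)hash >> 1; }
static uint32_t ex_offset2hash(uint64_t off)  { return (uint32_t)(off << 1); }
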
1394
1395int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1396 gfs2_filldir_t filldir)
1397{
1398 struct gfs2_inode *dip = GFS2_I(inode);
1399 struct dirent_gather g;
1400 const struct gfs2_dirent **darr, *dent;
1401 struct buffer_head *dibh;
1402 int copied = 0;
1403 int error;
1404
1405 if (!dip->i_di.di_entries)
1406 return 0;
1407
1408 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1409 return dir_e_read(inode, offset, opaque, filldir);
1410
1411 if (!gfs2_is_stuffed(dip)) {
1412 gfs2_consist_inode(dip);
1413 return -EIO;
1414 }
1415
1416 error = gfs2_meta_inode_buffer(dip, &dibh);
1417 if (error)
1418 return error;
1419
1420 error = -ENOMEM;
1421 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1422 GFP_KERNEL);
1423 if (darr) {
1424 g.pdent = darr;
1425 g.offset = 0;
1426 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1427 gfs2_dirent_gather, NULL, &g);
1428 if (IS_ERR(dent)) {
1429 error = PTR_ERR(dent);
1430 goto out;
1431 }
1432 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1433 dip->i_di.di_entries, &copied);
1434out:
1435 kfree(darr);
1436 }
1437
1438 if (error > 0)
1439 error = 0;
1440
1441 brelse(dibh);
1442
1443 return error;
1444}
1445
1446/**
1447 * gfs2_dir_search - Search a directory
1448 * @dir: The directory inode to search
1449 * @name: The name to look up
1450 * @inum: (and @type) filled in if the entry is found
1451 *
1452 * This routine searches a directory for a file or another directory.
1453 * Assumes a glock is held on @dir.
1454 *
1455 * Returns: errno
1456 */
1457
1458int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1459 struct gfs2_inum *inum, unsigned int *type)
1460{
1461 struct buffer_head *bh;
1462 struct gfs2_dirent *dent;
1463
1464 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1465 if (dent) {
1466 if (IS_ERR(dent))
1467 return PTR_ERR(dent);
1468 if (inum)
1469 gfs2_inum_in(inum, (char *)&dent->de_inum);
1470 if (type)
1471 *type = be16_to_cpu(dent->de_type);
1472 brelse(bh);
1473 return 0;
1474 }
1475 return -ENOENT;
1476}
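
A hedged usage sketch for gfs2_dir_search(), pairing it with the gfs2_str2qstr() helper from dir.h below; the caller is assumed to hold the directory glock, as the comment above requires:

static int ex_lookup(struct inode *dir, const char *fname,
		     struct gfs2_inum *inum)
{
	struct qstr q;
	unsigned int type;

	gfs2_str2qstr(&q, fname);	/* fills in name, len and hash */
	return gfs2_dir_search(dir, &q, inum, &type); /* 0 or -ENOENT */
}
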
1477
1478static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1479{
1480 struct buffer_head *bh, *obh;
1481 struct gfs2_inode *ip = GFS2_I(inode);
1482 struct gfs2_leaf *leaf, *oleaf;
1483 int error;
1484 u32 index;
1485 u64 bn;
1486
1487 index = name->hash >> (32 - ip->i_di.di_depth);
1488 error = get_first_leaf(ip, index, &obh);
1489 if (error)
1490 return error;
1491 do {
1492 oleaf = (struct gfs2_leaf *)obh->b_data;
1493 bn = be64_to_cpu(oleaf->lf_next);
1494 if (!bn)
1495 break;
1496 brelse(obh);
1497 error = get_leaf(ip, bn, &obh);
1498 if (error)
1499 return error;
1500 } while(1);
1501
1502 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1503
1504 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1505 if (!leaf) {
1506 brelse(obh);
1507 return -ENOSPC;
1508 }
1509 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1510 brelse(bh);
1511 brelse(obh);
1512
1513 error = gfs2_meta_inode_buffer(ip, &bh);
1514 if (error)
1515 return error;
1516 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1517 ip->i_di.di_blocks++;
1518 gfs2_dinode_out(&ip->i_di, bh->b_data);
1519 brelse(bh);
1520 return 0;
1521}
1522
1523/**
1524 * gfs2_dir_add - Add new filename into directory
1525 * @inode: The directory inode
1526 * @name: The new name
1527 * @inum: The inode number of the new entry
1528 * @type: The type of the new entry
1529 *
1530 * Returns: 0 on success, error code on failure
1531 */
1532
1533int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1534 const struct gfs2_inum *inum, unsigned type)
1535{
1536 struct gfs2_inode *ip = GFS2_I(inode);
1537 struct buffer_head *bh;
1538 struct gfs2_dirent *dent;
1539 struct gfs2_leaf *leaf;
1540 int error;
1541
1542 while(1) {
1543 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1544 &bh);
1545 if (dent) {
1546 if (IS_ERR(dent))
1547 return PTR_ERR(dent);
1548 dent = gfs2_init_dirent(inode, dent, name, bh);
1549 gfs2_inum_out(inum, (char *)&dent->de_inum);
1550 dent->de_type = cpu_to_be16(type);
1551 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1552 leaf = (struct gfs2_leaf *)bh->b_data;
1553 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1554 }
1555 brelse(bh);
1556 error = gfs2_meta_inode_buffer(ip, &bh);
1557 if (error)
1558 break;
1559 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1560 ip->i_di.di_entries++;
1561 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1562 gfs2_dinode_out(&ip->i_di, bh->b_data);
1563 brelse(bh);
1564 error = 0;
1565 break;
1566 }
1567 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1568 error = dir_make_exhash(inode);
1569 if (error)
1570 break;
1571 continue;
1572 }
1573 error = dir_split_leaf(inode, name);
1574 if (error == 0)
1575 continue;
1576 if (error < 0)
1577 break;
1578 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1579 error = dir_double_exhash(ip);
1580 if (error)
1581 break;
1582 error = dir_split_leaf(inode, name);
1583 if (error < 0)
1584 break;
1585 if (error == 0)
1586 continue;
1587 }
1588 error = dir_new_leaf(inode, name);
1589 if (!error)
1590 continue;
1591 error = -ENOSPC;
1592 break;
1593 }
1594 return error;
1595}
1596
1597
1598/**
1599 * gfs2_dir_del - Delete a directory entry
1600 * @dip: The GFS2 directory inode
1601 * @name: The name of the entry to delete
1602 *
1603 * Returns: 0 on success, error code on failure
1604 */
1605
1606int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1607{
1608 struct gfs2_dirent *dent, *prev = NULL;
1609 struct buffer_head *bh;
1610 int error;
1611
1612 /* Returns _either_ the entry (if it's first in the block) or the
1613 previous entry otherwise */
1614 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1615 if (!dent) {
1616 gfs2_consist_inode(dip);
1617 return -EIO;
1618 }
1619 if (IS_ERR(dent)) {
1620 gfs2_consist_inode(dip);
1621 return PTR_ERR(dent);
1622 }
1623 /* If not first in block, adjust pointers accordingly */
1624 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1625 prev = dent;
1626 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1627 }
1628
1629 dirent_del(dip, bh, prev, dent);
1630 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1631 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1632 u16 entries = be16_to_cpu(leaf->lf_entries);
1633 if (!entries)
1634 gfs2_consist_inode(dip);
1635 leaf->lf_entries = cpu_to_be16(--entries);
1636 }
1637 brelse(bh);
1638
1639 error = gfs2_meta_inode_buffer(dip, &bh);
1640 if (error)
1641 return error;
1642
1643 if (!dip->i_di.di_entries)
1644 gfs2_consist_inode(dip);
1645 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1646 dip->i_di.di_entries--;
1647 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1648 gfs2_dinode_out(&dip->i_di, bh->b_data);
1649 brelse(bh);
1650 mark_inode_dirty(&dip->i_inode);
1651
1652 return error;
1653}
1654
1655/**
1656 * gfs2_dir_mvino - Change inode number of directory entry
1657 * @dip: The GFS2 directory inode
1658 * @filename: The name of the entry to change
1659 * @inum: The new inode number for the entry
1660 *
1661 * This routine changes the inode number of a directory entry. It's used
1662 * by rename to change ".." when a directory is moved.
1663 * Assumes a glock is held on @dip.
1664 *
1665 * Returns: errno
1666 */
1667
1668int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1669 struct gfs2_inum *inum, unsigned int new_type)
1670{
1671 struct buffer_head *bh;
1672 struct gfs2_dirent *dent;
1673 int error;
1674
1675 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1676 if (!dent) {
1677 gfs2_consist_inode(dip);
1678 return -EIO;
1679 }
1680 if (IS_ERR(dent))
1681 return PTR_ERR(dent);
1682
1683 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1684 gfs2_inum_out(inum, (char *)&dent->de_inum);
1685 dent->de_type = cpu_to_be16(new_type);
1686
1687 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1688 brelse(bh);
1689 error = gfs2_meta_inode_buffer(dip, &bh);
1690 if (error)
1691 return error;
1692 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1693 }
1694
1695 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1696 gfs2_dinode_out(&dip->i_di, bh->b_data);
1697 brelse(bh);
1698 return 0;
1699}
1700
1701/**
1702 * foreach_leaf - call a function for each leaf in a directory
1703 * @dip: the directory
1704 * @lc: the function to call for each leaf
1705 * @data: private data to pass to it
1706 *
1707 * Returns: errno
1708 */
1709
1710static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1711{
1712 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1713 struct buffer_head *bh;
1714 struct gfs2_leaf *leaf;
1715 u32 hsize, len;
1716 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1717 u32 index = 0;
1718 u64 *lp;
1719 u64 leaf_no;
1720 int error = 0;
1721
1722 hsize = 1 << dip->i_di.di_depth;
1723 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1724 gfs2_consist_inode(dip);
1725 return -EIO;
1726 }
1727
1728 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1729 if (!lp)
1730 return -ENOMEM;
1731
1732 while (index < hsize) {
1733 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1734 ht_offset = index - lp_offset;
1735
1736 if (ht_offset_cur != ht_offset) {
1737 error = gfs2_dir_read_data(dip, (char *)lp,
1738 ht_offset * sizeof(u64),
1739 sdp->sd_hash_bsize, 1);
1740 if (error != sdp->sd_hash_bsize) {
1741 if (error >= 0)
1742 error = -EIO;
1743 goto out;
1744 }
1745 ht_offset_cur = ht_offset;
1746 }
1747
1748 leaf_no = be64_to_cpu(lp[lp_offset]);
1749 if (leaf_no) {
1750 error = get_leaf(dip, leaf_no, &bh);
1751 if (error)
1752 goto out;
1753 leaf = (struct gfs2_leaf *)bh->b_data;
1754 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1755 brelse(bh);
1756
1757 error = lc(dip, index, len, leaf_no, data);
1758 if (error)
1759 goto out;
1760
1761 index = (index & ~(len - 1)) + len;
1762 } else
1763 index++;
1764 }
1765
1766 if (index != hsize) {
1767 gfs2_consist_inode(dip);
1768 error = -EIO;
1769 }
1770
1771out:
1772 kfree(lp);
1773
1774 return error;
1775}
1776
1777/**
1778 * leaf_dealloc - Deallocate a directory leaf
1779 * @dip: the directory
1780 * @index: the hash table offset in the directory
1781 * @len: the number of pointers to this leaf
1782 * @leaf_no: the leaf number
1783 * @data: not used
1784 *
1785 * Returns: errno
1786 */
1787
1788static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1789 u64 leaf_no, void *data)
1790{
1791 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1792 struct gfs2_leaf *tmp_leaf;
1793 struct gfs2_rgrp_list rlist;
1794 struct buffer_head *bh, *dibh;
1795 u64 blk, nblk;
1796 unsigned int rg_blocks = 0, l_blocks = 0;
1797 char *ht;
1798 unsigned int x, size = len * sizeof(u64);
1799 int error;
1800
1801 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1802
1803 ht = kzalloc(size, GFP_KERNEL);
1804 if (!ht)
1805 return -ENOMEM;
1806
1807 gfs2_alloc_get(dip);
1808
1809 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1810 if (error)
1811 goto out;
1812
1813 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1814 if (error)
1815 goto out_qs;
1816
1817 /* Count the number of leaves */
1818
1819 for (blk = leaf_no; blk; blk = nblk) {
1820 error = get_leaf(dip, blk, &bh);
1821 if (error)
1822 goto out_rlist;
1823 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1824 nblk = be64_to_cpu(tmp_leaf->lf_next);
1825 brelse(bh);
1826
1827 gfs2_rlist_add(sdp, &rlist, blk);
1828 l_blocks++;
1829 }
1830
1831 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1832
1833 for (x = 0; x < rlist.rl_rgrps; x++) {
1834 struct gfs2_rgrpd *rgd;
1835 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1836 rg_blocks += rgd->rd_ri.ri_length;
1837 }
1838
1839 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1840 if (error)
1841 goto out_rlist;
1842
1843 error = gfs2_trans_begin(sdp,
1844 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1845 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1846 if (error)
1847 goto out_rg_gunlock;
1848
1849 for (blk = leaf_no; blk; blk = nblk) {
1850 error = get_leaf(dip, blk, &bh);
1851 if (error)
1852 goto out_end_trans;
1853 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1854 nblk = be64_to_cpu(tmp_leaf->lf_next);
1855 brelse(bh);
1856
1857 gfs2_free_meta(dip, blk, 1);
1858
1859 if (!dip->i_di.di_blocks)
1860 gfs2_consist_inode(dip);
1861 dip->i_di.di_blocks--;
1862 }
1863
1864 error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
1865 if (error != size) {
1866 if (error >= 0)
1867 error = -EIO;
1868 goto out_end_trans;
1869 }
1870
1871 error = gfs2_meta_inode_buffer(dip, &dibh);
1872 if (error)
1873 goto out_end_trans;
1874
1875 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1876 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1877 brelse(dibh);
1878
1879out_end_trans:
1880 gfs2_trans_end(sdp);
1881out_rg_gunlock:
1882 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1883out_rlist:
1884 gfs2_rlist_free(&rlist);
1885 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1886out_qs:
1887 gfs2_quota_unhold(dip);
1888out:
1889 gfs2_alloc_put(dip);
1890 kfree(ht);
1891 return error;
1892}
1893
1894/**
1895 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1896 * @dip: the directory
1897 *
1898 * Dealloc all on-disk directory leaves to FREEMETA state
1899 * Change on-disk inode type to "regular file"
1900 *
1901 * Returns: errno
1902 */
1903
1904int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1905{
1906 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1907 struct buffer_head *bh;
1908 int error;
1909
1910 /* Dealloc on-disk leaves to FREEMETA state */
1911 error = foreach_leaf(dip, leaf_dealloc, NULL);
1912 if (error)
1913 return error;
1914
1915 /* Make this a regular file in case we crash.
1916 (We don't want to free these blocks a second time.) */
1917
1918 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1919 if (error)
1920 return error;
1921
1922 error = gfs2_meta_inode_buffer(dip, &bh);
1923 if (!error) {
1924 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1925 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1926 cpu_to_be32(S_IFREG);
1927 brelse(bh);
1928 }
1929
1930 gfs2_trans_end(sdp);
1931
1932 return error;
1933}
1934
1935/**
1936 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1937 * @inode: the directory being added to
1938 * @name: the filename that's going to be added
1939 *
1940 * Returns: 1 if alloc required, 0 if not, -ve on error
1941 */
1942
1943int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1944{
1945 struct gfs2_dirent *dent;
1946 struct buffer_head *bh;
1947
1948 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1949 if (!dent) {
1950 return 1;
1951 }
1952 if (IS_ERR(dent))
1953 return PTR_ERR(dent);
1954 brelse(bh);
1955 return 0;
1956}
1957
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13#include <linux/dcache.h>
14
15struct inode;
16struct gfs2_inode;
17struct gfs2_inum;
18
19/**
20 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
21 * @opaque: opaque data used by the function
22 * @name: the name of the directory entry
23 * @length: the length of the name
24 * @offset: the entry's offset in the directory
25 * @inum: the inode number the entry points to
26 * @type: the type of inode the entry points to
27 *
28 * Returns: 0 on success, 1 if buffer full
29 */
30
31typedef int (*gfs2_filldir_t) (void *opaque,
32 const char *name, unsigned int length,
33 u64 offset,
34 struct gfs2_inum *inum, unsigned int type);
35
36int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
37 struct gfs2_inum *inum, unsigned int *type);
38int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
39 const struct gfs2_inum *inum, unsigned int type);
40int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
41int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
42 gfs2_filldir_t filldir);
43int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
44 struct gfs2_inum *new_inum, unsigned int new_type);
45
46int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
47
48int gfs2_diradd_alloc_required(struct inode *dir,
49 const struct qstr *filename);
50int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
51 struct buffer_head **bhp);
52
53static inline u32 gfs2_disk_hash(const char *data, int len)
54{
55 return crc32_le((u32)~0, data, len) ^ (u32)~0;
56}
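
A short worked example of how this hash feeds the exhash lookup in dir.c: the name hash is the bit-inverted little-endian CRC32 of the name bytes, and at a given di_depth the top bits select a hash-table slot:

/* index derivation, as used by gfs2_dirent_search() in dir.c */
static inline u32 ex_hash_index(u32 hash, unsigned int depth)
{
	return hash >> (32 - depth);	/* top 'depth' bits pick the slot */
}
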
57
58
59static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
60{
61 name->name = fname;
62 name->len = strlen(fname);
63 name->hash = gfs2_disk_hash(name->name, name->len);
64}
65
66/* N.B. This probably ought to take inum & type as args as well */
67static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
68{
69 dent->de_inum.no_addr = cpu_to_be64(0);
70 dent->de_inum.no_formal_ino = cpu_to_be64(0);
71 dent->de_hash = cpu_to_be32(name->hash);
72 dent->de_rec_len = cpu_to_be16(reclen);
73 dent->de_name_len = cpu_to_be16(name->len);
74 dent->de_type = cpu_to_be16(0);
75 memset(dent->__pad, 0, sizeof(dent->__pad));
76 memcpy(dent + 1, name->name, name->len);
77}
78
79#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and strip the type prefix
29 * @name: ea name, possibly with a type prefix ("user.", "system.", "security.")
30 * @truncated_name: if non-NULL, set to the name with the prefix stripped
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = name + sizeof("system.") - 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = name + sizeof("user.") - 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = name + sizeof("security.") - 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
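
A usage sketch for gfs2_ea_name2type(); the results follow directly from the string comparisons above:

static void ex_name2type(void)
{
	const char *stripped;
	unsigned int type;

	type = gfs2_ea_name2type("user.mime_type", &stripped);
	/* type == GFS2_EATYPE_USR, stripped points at "mime_type" */

	type = gfs2_ea_name2type("comment", &stripped);
	/* no recognized prefix: GFS2_EATYPE_UNUSED, stripped == NULL */
}
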
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp: the filesystem
37 * @er: the extended attribute request
38 * @size: filled in with the computed size
39 *
40 * Returns: 1 if the EA should be stuffed, 0 if it must be unstuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
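
A worked example of the stuffed/unstuffed decision above, on stated assumptions: with 4096-byte blocks, sd_jbsize is the block size minus the metadata header (assumed 24 bytes), i.e. 4072 bytes. A request whose stuffed size (EA header plus name plus value, rounded up for alignment) fits in those 4072 bytes keeps the value inside the EA block itself and ea_calc_size() returns 1; a larger request is unstuffed, its value going to separate data blocks addressed by pointers after the header, and *size is recomputed accordingly.
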
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
71typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
72 struct gfs2_ea_header *ea,
73 struct gfs2_ea_header *prev, void *private);
74
75static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
76 ea_call_t ea_call, void *data)
77{
78 struct gfs2_ea_header *ea, *prev = NULL;
79 int error = 0;
80
81 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
82 return -EIO;
83
84 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
85 if (!GFS2_EA_REC_LEN(ea))
86 goto fail;
87 if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
88 bh->b_data + bh->b_size))
89 goto fail;
90 if (!GFS2_EATYPE_VALID(ea->ea_type))
91 goto fail;
92
93 error = ea_call(ip, bh, ea, prev, data);
94 if (error)
95 return error;
96
97 if (GFS2_EA_IS_LAST(ea)) {
98 if ((char *)GFS2_EA2NEXT(ea) !=
99 bh->b_data + bh->b_size)
100 goto fail;
101 break;
102 }
103 }
104
105 return error;
106
107fail:
108 gfs2_consist_inode(ip);
109 return -EIO;
110}
111
112static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
113{
114 struct buffer_head *bh, *eabh;
115 u64 *eablk, *end;
116 int error;
117
118 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
119 if (error)
120 return error;
121
122 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
123 error = ea_foreach_i(ip, bh, ea_call, data);
124 goto out;
125 }
126
127 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
128 error = -EIO;
129 goto out;
130 }
131
132 eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
133 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
134
135 for (; eablk < end; eablk++) {
136 u64 bn;
137
138 if (!*eablk)
139 break;
140 bn = be64_to_cpu(*eablk);
141
142 error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
143 if (error)
144 break;
145 error = ea_foreach_i(ip, eabh, ea_call, data);
146 brelse(eabh);
147 if (error)
148 break;
149 }
150out:
151 brelse(bh);
152 return error;
153}
154
155struct ea_find {
156 struct gfs2_ea_request *ef_er;
157 struct gfs2_ea_location *ef_el;
158};
159
160static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
161 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
162 void *private)
163{
164 struct ea_find *ef = private;
165 struct gfs2_ea_request *er = ef->ef_er;
166
167 if (ea->ea_type == GFS2_EATYPE_UNUSED)
168 return 0;
169
170 if (ea->ea_type == er->er_type) {
171 if (ea->ea_name_len == er->er_name_len &&
172 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
173 struct gfs2_ea_location *el = ef->ef_el;
174 get_bh(bh);
175 el->el_bh = bh;
176 el->el_ea = ea;
177 el->el_prev = prev;
178 return 1;
179 }
180 }
181
182 return 0;
183}
184
185int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
186 struct gfs2_ea_location *el)
187{
188 struct ea_find ef;
189 int error;
190
191 ef.ef_er = er;
192 ef.ef_el = el;
193
194 memset(el, 0, sizeof(struct gfs2_ea_location));
195
196 error = ea_foreach(ip, ea_find_i, &ef);
197 if (error > 0)
198 return 0;
199
200 return error;
201}
202
203/**
204 * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
205 * @ip: the inode owning the EA
206 * @bh: the buffer holding the EA block
207 * @ea: the EA header whose data blocks are to be freed
208 * @prev: the previous EA header in the block, if any
209 * @private: if non-NULL, leave the header itself in place
210 *
211 * Take advantage of the fact that all unstuffed blocks are
212 * allocated from the same RG. But watch, this may not always
213 * be true.
214 *
215 * Returns: errno
216 */
217
218static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
219 struct gfs2_ea_header *ea,
220 struct gfs2_ea_header *prev, void *private)
221{
222 int *leave = private;
223 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
224 struct gfs2_rgrpd *rgd;
225 struct gfs2_holder rg_gh;
226 struct buffer_head *dibh;
227 u64 *dataptrs, bn = 0;
228 u64 bstart = 0;
229 unsigned int blen = 0;
230 unsigned int blks = 0;
231 unsigned int x;
232 int error;
233
234 if (GFS2_EA_IS_STUFFED(ea))
235 return 0;
236
237 dataptrs = GFS2_EA2DATAPTRS(ea);
238 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
239 if (*dataptrs) {
240 blks++;
241 bn = be64_to_cpu(*dataptrs);
242 }
243 }
244 if (!blks)
245 return 0;
246
247 rgd = gfs2_blk2rgrpd(sdp, bn);
248 if (!rgd) {
249 gfs2_consist_inode(ip);
250 return -EIO;
251 }
252
253 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
254 if (error)
255 return error;
256
257 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
258 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
259 if (error)
260 goto out_gunlock;
261
262 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263
264 dataptrs = GFS2_EA2DATAPTRS(ea);
265 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
266 if (!*dataptrs)
267 break;
268 bn = be64_to_cpu(*dataptrs);
269
270 if (bstart + blen == bn)
271 blen++;
272 else {
273 if (bstart)
274 gfs2_free_meta(ip, bstart, blen);
275 bstart = bn;
276 blen = 1;
277 }
278
279 *dataptrs = 0;
280 if (!ip->i_di.di_blocks)
281 gfs2_consist_inode(ip);
282 ip->i_di.di_blocks--;
283 }
284 if (bstart)
285 gfs2_free_meta(ip, bstart, blen);
286
287 if (prev && !leave) {
288 u32 len;
289
290 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
291 prev->ea_rec_len = cpu_to_be32(len);
292
293 if (GFS2_EA_IS_LAST(ea))
294 prev->ea_flags |= GFS2_EAFLAG_LAST;
295 } else {
296 ea->ea_type = GFS2_EATYPE_UNUSED;
297 ea->ea_num_ptrs = 0;
298 }
299
300 error = gfs2_meta_inode_buffer(ip, &dibh);
301 if (!error) {
302 ip->i_di.di_ctime = get_seconds();
303 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
304 gfs2_dinode_out(&ip->i_di, dibh->b_data);
305 brelse(dibh);
306 }
307
308 gfs2_trans_end(sdp);
309
310out_gunlock:
311 gfs2_glock_dq_uninit(&rg_gh);
312 return error;
313}
314
315static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
316 struct gfs2_ea_header *ea,
317 struct gfs2_ea_header *prev, int leave)
318{
319 struct gfs2_alloc *al;
320 int error;
321
322 al = gfs2_alloc_get(ip);
323
324 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
325 if (error)
326 goto out_alloc;
327
328 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
329 if (error)
330 goto out_quota;
331
332 error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
333
334 gfs2_glock_dq_uninit(&al->al_ri_gh);
335
336out_quota:
337 gfs2_quota_unhold(ip);
338out_alloc:
339 gfs2_alloc_put(ip);
340 return error;
341}
342
343struct ea_list {
344 struct gfs2_ea_request *ei_er;
345 unsigned int ei_size;
346};
347
348static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
349 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
350 void *private)
351{
352 struct ea_list *ei = private;
353 struct gfs2_ea_request *er = ei->ei_er;
354 unsigned int ea_size = gfs2_ea_strlen(ea);
355
356 if (ea->ea_type == GFS2_EATYPE_UNUSED)
357 return 0;
358
359 if (er->er_data_len) {
360 char *prefix = NULL;
361 unsigned int l = 0;
362 char c = 0;
363
364 if (ei->ei_size + ea_size > er->er_data_len)
365 return -ERANGE;
366
367 switch (ea->ea_type) {
368 case GFS2_EATYPE_USR:
369 prefix = "user.";
370 l = 5;
371 break;
372 case GFS2_EATYPE_SYS:
373 prefix = "system.";
374 l = 7;
375 break;
376 case GFS2_EATYPE_SECURITY:
377 prefix = "security.";
378 l = 9;
379 break;
380 }
381
382 BUG_ON(l == 0);
383
384 memcpy(er->er_data + ei->ei_size, prefix, l);
385 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
386 ea->ea_name_len);
387 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
388 }
389
390 ei->ei_size += ea_size;
391
392 return 0;
393}
394
395/**
396 * gfs2_ea_list - list the names of the extended attributes on an inode
397 * @ip: The GFS2 inode
398 * @er: The request structure; er_data, if non-NULL, receives the names
399 *
400 * Returns: actual size of data on success, -errno on error
401 */
402
403int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
404{
405 struct gfs2_holder i_gh;
406 int error;
407
408 if (!er->er_data || !er->er_data_len) {
409 er->er_data = NULL;
410 er->er_data_len = 0;
411 }
412
413 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
414 if (error)
415 return error;
416
417 if (ip->i_di.di_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419
420 error = ea_foreach(ip, ea_list_i, &ei);
421 if (!error)
422 error = ei.ei_size;
423 }
424
425 gfs2_glock_dq_uninit(&i_gh);
426
427 return error;
428}
429
430/**
431 * ea_get_unstuffed - actually copies the unstuffed data into the
432 * request buffer
433 * @ip: The GFS2 inode
434 * @ea: The extended attribute header structure
435 * @data: The buffer the data is copied into
436 *
437 * Returns: errno
438 */
439
440static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
441 char *data)
442{
443 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
444 struct buffer_head **bh;
445 unsigned int amount = GFS2_EA_DATA_LEN(ea);
446 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
447 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
448 unsigned int x;
449 int error = 0;
450
451 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
452 if (!bh)
453 return -ENOMEM;
454
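	/* First pass: submit all the reads without waiting, so the I/O can overlap */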
455 for (x = 0; x < nptrs; x++) {
456 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
457 bh + x);
458 if (error) {
459 while (x--)
460 brelse(bh[x]);
461 goto out;
462 }
463 dataptrs++;
464 }
465
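	/* Second pass: wait for each read, verify the metatype, and copy
	   out up to one journal block's worth of data per buffer */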
466 for (x = 0; x < nptrs; x++) {
467 error = gfs2_meta_wait(sdp, bh[x]);
468 if (error) {
469 for (; x < nptrs; x++)
470 brelse(bh[x]);
471 goto out;
472 }
473 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
474 for (; x < nptrs; x++)
475 brelse(bh[x]);
476 error = -EIO;
477 goto out;
478 }
479
480 memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
481 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
482
483 amount -= sdp->sd_jbsize;
484 data += sdp->sd_jbsize;
485
486 brelse(bh[x]);
487 }
488
489out:
490 kfree(bh);
491 return error;
492}
493
494int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
495 char *data)
496{
497 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
498 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
499 return 0;
500 } else
501 return ea_get_unstuffed(ip, el->el_ea, data);
502}
503
504/**
505 * gfs2_ea_get_i - find an extended attribute and copy its data out
506 * @ip: The GFS2 inode
507 * @er: The request structure
508 *
509 * Returns: actual size of data on success, -errno on error
510 */
511
512int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
513{
514 struct gfs2_ea_location el;
515 int error;
516
517 if (!ip->i_di.di_eattr)
518 return -ENODATA;
519
520 error = gfs2_ea_find(ip, er, &el);
521 if (error)
522 return error;
523 if (!el.el_ea)
524 return -ENODATA;
525
526 if (er->er_data_len) {
527 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
528 error = -ERANGE;
529 else
530 error = gfs2_ea_get_copy(ip, &el, er->er_data);
531 }
532 if (!error)
533 error = GFS2_EA_DATA_LEN(el.el_ea);
534
535 brelse(el.el_bh);
536
537 return error;
538}
539
540/**
541 * gfs2_ea_get - read an extended attribute under a shared inode glock
542 * @ip: The GFS2 inode
543 * @er: The request structure
544 *
545 * Returns: actual size of data on success, -errno on error
546 */
547
548int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
549{
550 struct gfs2_holder i_gh;
551 int error;
552
553 if (!er->er_name_len ||
554 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
555 return -EINVAL;
556 if (!er->er_data || !er->er_data_len) {
557 er->er_data = NULL;
558 er->er_data_len = 0;
559 }
560
561 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
562 if (error)
563 return error;
564
565 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
566
567 gfs2_glock_dq_uninit(&i_gh);
568
569 return error;
570}
571
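/*
 * An illustrative usage sketch of the call above: a request with
 * er_data == NULL returns just the attribute size, so a caller can size
 * its buffer before the copying call.  The function name and attribute
 * name below are hypothetical; kmalloc/kfree and GFP_KERNEL are the
 * standard kernel allocator API.  (A racing setxattr could still change
 * the size between the two calls, so -ERANGE remains possible.)
 */
static int example_read_user_ea(struct gfs2_inode *ip, char **bufp)
{
	struct gfs2_ea_request er = {
		.er_name = "example",		/* hypothetical name */
		.er_name_len = 7,
		.er_type = GFS2_EATYPE_USR,
	};
	int size = gfs2_ea_get(ip, &er);	/* size query only */

	if (size < 0)
		return size;
	er.er_data = kmalloc(size, GFP_KERNEL);
	if (!er.er_data)
		return -ENOMEM;
	er.er_data_len = size;
	size = gfs2_ea_get(ip, &er);		/* copies the data */
	if (size < 0)
		kfree(er.er_data);
	else
		*bufp = er.er_data;
	return size;
}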
572/**
573 * ea_alloc_blk - allocates a new block for extended attributes.
574 * @ip: A pointer to the inode that's getting extended attributes
575 * @bhp: Pointer to pointer to a struct buffer_head
576 *
577 * Returns: errno
578 */
579
580static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 struct gfs2_ea_header *ea;
584 u64 block;
585
586 block = gfs2_alloc_meta(ip);
587
588 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
590 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
591 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
592
593 ea = GFS2_EA_BH2FIRST(*bhp);
594 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
595 ea->ea_type = GFS2_EATYPE_UNUSED;
596 ea->ea_flags = GFS2_EAFLAG_LAST;
597 ea->ea_num_ptrs = 0;
598
599 ip->i_di.di_blocks++;
600
601 return 0;
602}
603
604/**
605 * ea_write - writes the request info to an ea, creating new blocks if
606 * necessary
607 * @ip: inode that is being modified
608 * @ea: the location of the new ea in a block
609 * @er: the write request
610 *
611 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
612 *
613 * Returns: errno
614 */
615
616static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er)
618{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
620
621 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len;
623 ea->ea_type = er->er_type;
624 ea->__pad = 0;
625
626 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
627
628 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
629 ea->ea_num_ptrs = 0;
630 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
631 } else {
632 u64 *dataptr = GFS2_EA2DATAPTRS(ea);
633 const char *data = er->er_data;
634 unsigned int data_len = er->er_data_len;
635 unsigned int copy;
636 unsigned int x;
637
638 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
639 for (x = 0; x < ea->ea_num_ptrs; x++) {
640 struct buffer_head *bh;
641 u64 block;
642 int mh_size = sizeof(struct gfs2_meta_header);
643
644 block = gfs2_alloc_meta(ip);
645
646 bh = gfs2_meta_new(ip->i_gl, block);
647 gfs2_trans_add_bh(ip->i_gl, bh, 1);
648 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
649
650 ip->i_di.di_blocks++;
651
652 copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
653 data_len;
654 memcpy(bh->b_data + mh_size, data, copy);
655 if (copy < sdp->sd_jbsize)
656 memset(bh->b_data + mh_size + copy, 0,
657 sdp->sd_jbsize - copy);
658
659 *dataptr++ = cpu_to_be64(bh->b_blocknr);
660 data += copy;
661 data_len -= copy;
662
663 brelse(bh);
664 }
665
666 gfs2_assert_withdraw(sdp, !data_len);
667 }
668
669 return 0;
670}
671
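/*
 * Worked example for ea_write() above, assuming 4096-byte blocks and a
 * 24-byte gfs2_meta_header (so sd_jbsize = 4072): a 10000-byte value
 * does not stuff, so ea_num_ptrs = DIV_ROUND_UP(10000, 4072) = 3; the
 * first two data blocks take 4072 bytes each and the third takes the
 * remaining 1856 bytes, with its trailing 2216 bytes zeroed.
 */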
672typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
673 struct gfs2_ea_request *er, void *private);
674
675static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
676 unsigned int blks,
677 ea_skeleton_call_t skeleton_call, void *private)
678{
679 struct gfs2_alloc *al;
680 struct buffer_head *dibh;
681 int error;
682
683 al = gfs2_alloc_get(ip);
684
685 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
686 if (error)
687 goto out;
688
689 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
690 if (error)
691 goto out_gunlock_q;
692
693 al->al_requested = blks;
694
695 error = gfs2_inplace_reserve(ip);
696 if (error)
697 goto out_gunlock_q;
698
699 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
700 blks + al->al_rgd->rd_ri.ri_length +
701 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
702 if (error)
703 goto out_ipres;
704
705 error = skeleton_call(ip, er, private);
706 if (error)
707 goto out_end_trans;
708
709 error = gfs2_meta_inode_buffer(ip, &dibh);
710 if (!error) {
711 if (er->er_flags & GFS2_ERF_MODE) {
712 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
713 (ip->i_di.di_mode & S_IFMT) ==
714 (er->er_mode & S_IFMT));
715 ip->i_di.di_mode = er->er_mode;
716 }
717 ip->i_di.di_ctime = get_seconds();
718 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
719 gfs2_dinode_out(&ip->i_di, dibh->b_data);
720 brelse(dibh);
721 }
722
723out_end_trans:
724 gfs2_trans_end(GFS2_SB(&ip->i_inode));
725out_ipres:
726 gfs2_inplace_release(ip);
727out_gunlock_q:
728 gfs2_quota_unlock(ip);
729out:
730 gfs2_alloc_put(ip);
731 return error;
732}
733
734static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
735 void *private)
736{
737 struct buffer_head *bh;
738 int error;
739
740 error = ea_alloc_blk(ip, &bh);
741 if (error)
742 return error;
743
744 ip->i_di.di_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746
747 brelse(bh);
748
749 return error;
750}
751
752/**
753 * ea_init - initializes a new eattr block
754 * @ip: the inode the new eattr block will belong to
755 * @er: the first write request to store in the new block
756 *
757 * Returns: errno
758 */
759
760static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
761{
762 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
763 unsigned int blks = 1;
764
765 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
766 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
767
768 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
769}
770
771static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
772{
773 u32 ea_size = GFS2_EA_SIZE(ea);
774 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
775 ea_size);
776 u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
777 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
778
779 ea->ea_rec_len = cpu_to_be32(ea_size);
780 ea->ea_flags ^= last;
781
782 new->ea_rec_len = cpu_to_be32(new_size);
783 new->ea_flags = last;
784
785 return new;
786}
787
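/*
 * Example of the split above: a record with ea_rec_len 256 whose live
 * size GFS2_EA_SIZE(ea) is 64 shrinks to a 64-byte record, and a new
 * 192-byte record is carved out immediately after it; if the original
 * carried GFS2_EAFLAG_LAST, that bit moves to the new record.
 */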
788static void ea_set_remove_stuffed(struct gfs2_inode *ip,
789 struct gfs2_ea_location *el)
790{
791 struct gfs2_ea_header *ea = el->el_ea;
792 struct gfs2_ea_header *prev = el->el_prev;
793 u32 len;
794
795 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
796
797 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
798 ea->ea_type = GFS2_EATYPE_UNUSED;
799 return;
800 } else if (GFS2_EA2NEXT(prev) != ea) {
801 prev = GFS2_EA2NEXT(prev);
802 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
803 }
804
805 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
806 prev->ea_rec_len = cpu_to_be32(len);
807
808 if (GFS2_EA_IS_LAST(ea))
809 prev->ea_flags |= GFS2_EAFLAG_LAST;
810}
811
812struct ea_set {
813 int ea_split;
814
815 struct gfs2_ea_request *es_er;
816 struct gfs2_ea_location *es_el;
817
818 struct buffer_head *es_bh;
819 struct gfs2_ea_header *es_ea;
820};
821
822static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
823 struct gfs2_ea_header *ea, struct ea_set *es)
824{
825 struct gfs2_ea_request *er = es->es_er;
826 struct buffer_head *dibh;
827 int error;
828
829 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
830 if (error)
831 return error;
832
833 gfs2_trans_add_bh(ip->i_gl, bh, 1);
834
835 if (es->ea_split)
836 ea = ea_split_ea(ea);
837
838 ea_write(ip, ea, er);
839
840 if (es->es_el)
841 ea_set_remove_stuffed(ip, es->es_el);
842
843 error = gfs2_meta_inode_buffer(ip, &dibh);
844 if (error)
845 goto out;
846
847 if (er->er_flags & GFS2_ERF_MODE) {
848 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
849 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
850 ip->i_di.di_mode = er->er_mode;
851 }
852 ip->i_di.di_ctime = get_seconds();
853 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
854 gfs2_dinode_out(&ip->i_di, dibh->b_data);
855 brelse(dibh);
856out:
857 gfs2_trans_end(GFS2_SB(&ip->i_inode));
858 return error;
859}
860
861static int ea_set_simple_alloc(struct gfs2_inode *ip,
862 struct gfs2_ea_request *er, void *private)
863{
864 struct ea_set *es = private;
865 struct gfs2_ea_header *ea = es->es_ea;
866 int error;
867
868 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
869
870 if (es->ea_split)
871 ea = ea_split_ea(ea);
872
873 error = ea_write(ip, ea, er);
874 if (error)
875 return error;
876
877 if (es->es_el)
878 ea_set_remove_stuffed(ip, es->es_el);
879
880 return 0;
881}
882
883static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
884 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
885 void *private)
886{
887 struct ea_set *es = private;
888 unsigned int size;
889 int stuffed;
890 int error;
891
892 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
893
894 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
895 if (GFS2_EA_REC_LEN(ea) < size)
896 return 0;
897 if (!GFS2_EA_IS_STUFFED(ea)) {
898 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
899 if (error)
900 return error;
901 }
902 es->ea_split = 0;
903 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
904 es->ea_split = 1;
905 else
906 return 0;
907
908 if (stuffed) {
909 error = ea_set_simple_noalloc(ip, bh, ea, es);
910 if (error)
911 return error;
912 } else {
913 unsigned int blks;
914
915 es->es_bh = bh;
916 es->es_ea = ea;
917 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
918 GFS2_SB(&ip->i_inode)->sd_jbsize);
919
920 error = ea_alloc_skeleton(ip, es->es_er, blks,
921 ea_set_simple_alloc, es);
922 if (error)
923 return error;
924 }
925
926 return 1;
927}
928
929static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
930 void *private)
931{
932 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
933 struct buffer_head *indbh, *newbh;
934 u64 *eablk;
935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header);
937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
939 u64 *end;
940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
942 &indbh);
943 if (error)
944 return error;
945
946 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
947 error = -EIO;
948 goto out;
949 }
950
951 eablk = (u64 *)(indbh->b_data + mh_size);
952 end = eablk + sdp->sd_inptrs;
953
954 for (; eablk < end; eablk++)
955 if (!*eablk)
956 break;
957
958 if (eablk == end) {
959 error = -ENOSPC;
960 goto out;
961 }
962
963 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
964 } else {
965 u64 blk;
966
967 blk = gfs2_alloc_meta(ip);
968
969 indbh = gfs2_meta_new(ip->i_gl, blk);
970 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
971 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
972 gfs2_buffer_clear_tail(indbh, mh_size);
973
974 eablk = (u64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr);
976 ip->i_di.di_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
978 ip->i_di.di_blocks++;
979
980 eablk++;
981 }
982
983 error = ea_alloc_blk(ip, &newbh);
984 if (error)
985 goto out;
986
987 *eablk = cpu_to_be64((u64)newbh->b_blocknr);
988 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
989 brelse(newbh);
990 if (error)
991 goto out;
992
993 if (private)
994 ea_set_remove_stuffed(ip, private);
995
996out:
997 brelse(indbh);
998 return error;
999}
1000
1001static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1002 struct gfs2_ea_location *el)
1003{
1004 struct ea_set es;
1005 unsigned int blks = 2;
1006 int error;
1007
1008 memset(&es, 0, sizeof(struct ea_set));
1009 es.es_er = er;
1010 es.es_el = el;
1011
1012 error = ea_foreach(ip, ea_set_simple, &es);
1013 if (error > 0)
1014 return 0;
1015 if (error)
1016 return error;
1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1022
1023 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1024}
1025
1026static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1027 struct gfs2_ea_location *el)
1028{
1029 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1030 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1031 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1032 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1033 }
1034
1035	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1036}
1037
1038int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1039{
1040 struct gfs2_ea_location el;
1041 int error;
1042
1043 if (!ip->i_di.di_eattr) {
1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA;
1046 return ea_init(ip, er);
1047 }
1048
1049 error = gfs2_ea_find(ip, er, &el);
1050 if (error)
1051 return error;
1052
1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh);
1056 return -EPERM;
1057 }
1058
1059 error = -EEXIST;
1060 if (!(er->er_flags & XATTR_CREATE)) {
1061 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1062 error = ea_set_i(ip, er, &el);
1063 if (!error && unstuffed)
1064 ea_set_remove_unstuffed(ip, &el);
1065 }
1066
1067 brelse(el.el_bh);
1068 } else {
1069 error = -ENODATA;
1070 if (!(er->er_flags & XATTR_REPLACE))
1071 error = ea_set_i(ip, er, NULL);
1072 }
1073
1074 return error;
1075}
1076
1077int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1078{
1079 struct gfs2_holder i_gh;
1080 int error;
1081
1082 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1083 return -EINVAL;
1084 if (!er->er_data || !er->er_data_len) {
1085 er->er_data = NULL;
1086 er->er_data_len = 0;
1087 }
1088 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1089 if (error)
1090 return error;
1091
1092 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1093 if (error)
1094 return error;
1095
1096 if (IS_IMMUTABLE(&ip->i_inode))
1097 error = -EPERM;
1098 else
1099 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1100
1101 gfs2_glock_dq_uninit(&i_gh);
1102
1103 return error;
1104}
1105
1106static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1107{
1108 struct gfs2_ea_header *ea = el->el_ea;
1109 struct gfs2_ea_header *prev = el->el_prev;
1110 struct buffer_head *dibh;
1111 int error;
1112
1113 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1114 if (error)
1115 return error;
1116
1117 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1118
1119 if (prev) {
1120 u32 len;
1121
1122 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1123 prev->ea_rec_len = cpu_to_be32(len);
1124
1125 if (GFS2_EA_IS_LAST(ea))
1126 prev->ea_flags |= GFS2_EAFLAG_LAST;
1127 } else
1128 ea->ea_type = GFS2_EATYPE_UNUSED;
1129
1130 error = gfs2_meta_inode_buffer(ip, &dibh);
1131 if (!error) {
1132 ip->i_di.di_ctime = get_seconds();
1133 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1134 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1135 brelse(dibh);
1136 }
1137
1138 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1139
1140 return error;
1141}
1142
1143int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1144{
1145 struct gfs2_ea_location el;
1146 int error;
1147
1148 if (!ip->i_di.di_eattr)
1149 return -ENODATA;
1150
1151 error = gfs2_ea_find(ip, er, &el);
1152 if (error)
1153 return error;
1154 if (!el.el_ea)
1155 return -ENODATA;
1156
1157 if (GFS2_EA_IS_STUFFED(el.el_ea))
1158 error = ea_remove_stuffed(ip, &el);
1159 else
1160 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1161 0);
1162
1163 brelse(el.el_bh);
1164
1165 return error;
1166}
1167
1168/**
1169 * gfs2_ea_remove - remove an extended attribute from an inode
1170 * @ip: pointer to the inode of the target file
1171 * @er: request information
1172 *
1173 * Returns: errno
1174 */
1175
1176int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1177{
1178 struct gfs2_holder i_gh;
1179 int error;
1180
1181 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1182 return -EINVAL;
1183
1184 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1185 if (error)
1186 return error;
1187
1188 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1189 error = -EPERM;
1190 else
1191 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1192
1193 gfs2_glock_dq_uninit(&i_gh);
1194
1195 return error;
1196}
1197
1198static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1199 struct gfs2_ea_header *ea, char *data)
1200{
1201 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1202 struct buffer_head **bh;
1203 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1204 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1205 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
1206 unsigned int x;
1207 int error;
1208
1209 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1210 if (!bh)
1211 return -ENOMEM;
1212
1213 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1214 if (error)
1215 goto out;
1216
1217 for (x = 0; x < nptrs; x++) {
1218 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
1219 bh + x);
1220 if (error) {
1221 while (x--)
1222 brelse(bh[x]);
1223 goto fail;
1224 }
1225 dataptrs++;
1226 }
1227
1228 for (x = 0; x < nptrs; x++) {
1229 error = gfs2_meta_wait(sdp, bh[x]);
1230 if (error) {
1231 for (; x < nptrs; x++)
1232 brelse(bh[x]);
1233 goto fail;
1234 }
1235 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1236 for (; x < nptrs; x++)
1237 brelse(bh[x]);
1238 error = -EIO;
1239 goto fail;
1240 }
1241
1242 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1243
1244 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
1245 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1246
1247 amount -= sdp->sd_jbsize;
1248 data += sdp->sd_jbsize;
1249
1250 brelse(bh[x]);
1251 }
1252
1253out:
1254 kfree(bh);
1255 return error;
1256
1257fail:
1258 gfs2_trans_end(sdp);
1259 kfree(bh);
1260 return error;
1261}
1262
1263int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1264 struct iattr *attr, char *data)
1265{
1266 struct buffer_head *dibh;
1267 int error;
1268
1269 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1270 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1271 if (error)
1272 return error;
1273
1274 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1275 memcpy(GFS2_EA2DATA(el->el_ea), data,
1276 GFS2_EA_DATA_LEN(el->el_ea));
1277 } else
1278 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1279
1280 if (error)
1281 return error;
1282
1283 error = gfs2_meta_inode_buffer(ip, &dibh);
1284 if (!error) {
1285 error = inode_setattr(&ip->i_inode, attr);
1286 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1287 gfs2_inode_attr_out(ip);
1288 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1289 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1290 brelse(dibh);
1291 }
1292
1293 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1294
1295 return error;
1296}
1297
1298static int ea_dealloc_indirect(struct gfs2_inode *ip)
1299{
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_rgrp_list rlist;
1302 struct buffer_head *indbh, *dibh;
1303 u64 *eablk, *end;
1304 unsigned int rg_blocks = 0;
1305 u64 bstart = 0;
1306 unsigned int blen = 0;
1307 unsigned int blks = 0;
1308 unsigned int x;
1309 int error;
1310
1311 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1312
1313 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
1314 if (error)
1315 return error;
1316
1317 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1318 error = -EIO;
1319 goto out;
1320 }
1321
1322 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1323 end = eablk + sdp->sd_inptrs;
1324
1325 for (; eablk < end; eablk++) {
1326 u64 bn;
1327
1328 if (!*eablk)
1329 break;
1330 bn = be64_to_cpu(*eablk);
1331
1332 if (bstart + blen == bn)
1333 blen++;
1334 else {
1335 if (bstart)
1336 gfs2_rlist_add(sdp, &rlist, bstart);
1337 bstart = bn;
1338 blen = 1;
1339 }
1340 blks++;
1341 }
1342 if (bstart)
1343 gfs2_rlist_add(sdp, &rlist, bstart);
1344 else
1345 goto out;
1346
1347 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1348
1349 for (x = 0; x < rlist.rl_rgrps; x++) {
1350 struct gfs2_rgrpd *rgd;
1351 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1352 rg_blocks += rgd->rd_ri.ri_length;
1353 }
1354
1355 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1356 if (error)
1357 goto out_rlist_free;
1358
1359 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
1360 RES_STATFS + RES_QUOTA, blks);
1361 if (error)
1362 goto out_gunlock;
1363
1364 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1365
1366 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1367 bstart = 0;
1368 blen = 0;
1369
1370 for (; eablk < end; eablk++) {
1371 u64 bn;
1372
1373 if (!*eablk)
1374 break;
1375 bn = be64_to_cpu(*eablk);
1376
1377 if (bstart + blen == bn)
1378 blen++;
1379 else {
1380 if (bstart)
1381 gfs2_free_meta(ip, bstart, blen);
1382 bstart = bn;
1383 blen = 1;
1384 }
1385
1386 *eablk = 0;
1387 if (!ip->i_di.di_blocks)
1388 gfs2_consist_inode(ip);
1389 ip->i_di.di_blocks--;
1390 }
1391 if (bstart)
1392 gfs2_free_meta(ip, bstart, blen);
1393
1394 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1395
1396 error = gfs2_meta_inode_buffer(ip, &dibh);
1397 if (!error) {
1398 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1399 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1400 brelse(dibh);
1401 }
1402
1403 gfs2_trans_end(sdp);
1404
1405out_gunlock:
1406 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1407out_rlist_free:
1408 gfs2_rlist_free(&rlist);
1409out:
1410 brelse(indbh);
1411 return error;
1412}
1413
1414static int ea_dealloc_block(struct gfs2_inode *ip)
1415{
1416 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1417 struct gfs2_alloc *al = &ip->i_alloc;
1418 struct gfs2_rgrpd *rgd;
1419 struct buffer_head *dibh;
1420 int error;
1421
1422 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1423 if (!rgd) {
1424 gfs2_consist_inode(ip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1429 &al->al_rgd_gh);
1430 if (error)
1431 return error;
1432
1433 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
1434 RES_QUOTA, 1);
1435 if (error)
1436 goto out_gunlock;
1437
1438 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1439
1440 ip->i_di.di_eattr = 0;
1441 if (!ip->i_di.di_blocks)
1442 gfs2_consist_inode(ip);
1443 ip->i_di.di_blocks--;
1444
1445 error = gfs2_meta_inode_buffer(ip, &dibh);
1446 if (!error) {
1447 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1448 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1449 brelse(dibh);
1450 }
1451
1452 gfs2_trans_end(sdp);
1453
1454out_gunlock:
1455 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1456 return error;
1457}
1458
1459/**
1460 * gfs2_ea_dealloc - deallocate the extended attribute fork
1461 * @ip: the inode
1462 *
1463 * Returns: errno
1464 */
1465
1466int gfs2_ea_dealloc(struct gfs2_inode *ip)
1467{
1468 struct gfs2_alloc *al;
1469 int error;
1470
1471 al = gfs2_alloc_get(ip);
1472
1473 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1474 if (error)
1475 goto out_alloc;
1476
1477 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1478 if (error)
1479 goto out_quota;
1480
1481 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1482 if (error)
1483 goto out_rindex;
1484
1485 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1486 error = ea_dealloc_indirect(ip);
1487 if (error)
1488 goto out_rindex;
1489 }
1490
1491 error = ea_dealloc_block(ip);
1492
1493out_rindex:
1494 gfs2_glock_dq_uninit(&al->al_ri_gh);
1495out_quota:
1496 gfs2_quota_unhold(ip);
1497out_alloc:
1498 gfs2_alloc_put(ip);
1499 return error;
1500}
1501
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13struct gfs2_inode;
14struct iattr;
15
16#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
17#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
18
19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
26
27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36
37#define GFS2_EA2DATAPTRS(ea) \
38((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
39
40#define GFS2_EA2NEXT(ea) \
41((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
42
43#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request {
49 const char *er_name;
50 char *er_data;
51 unsigned int er_name_len;
52 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56};
57
58struct gfs2_ea_location {
59 struct buffer_head *el_bh;
60 struct gfs2_ea_header *el_ea;
61 struct gfs2_ea_header *el_prev;
62};
63
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74
75/* Exported to acl.c */
76
77int gfs2_ea_find(struct gfs2_inode *ip,
78 struct gfs2_ea_request *er,
79 struct gfs2_ea_location *el);
80int gfs2_ea_get_copy(struct gfs2_inode *ip,
81 struct gfs2_ea_location *el,
82 char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99
100#endif /* __EATTR_DOT_H__ */
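/*
 * An illustrative sketch of the accessor macros above (the helper is
 * hypothetical and assumes the usual buffer_head/printk declarations):
 * the macros are enough to walk every record in one ea block, which is
 * essentially what ea_foreach_i() in eattr.c does.
 */
static void example_walk_ea_block(struct buffer_head *bh)
{
	struct gfs2_ea_header *ea;

	for (ea = GFS2_EA_BH2FIRST(bh); ; ea = GFS2_EA2NEXT(ea)) {
		if (ea->ea_type != GFS2_EATYPE_UNUSED)
			printk(KERN_DEBUG "ea: %.*s\n",
			       ea->ea_name_len, GFS2_EA2NAME(ea));
		if (GFS2_EA_IS_LAST(ea))
			break;
	}
}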
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kallsyms.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/list.h>
21#include <linux/lm_interface.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36struct greedy {
37 struct gfs2_holder gr_gh;
38 struct work_struct gr_work;
39};
40
41struct gfs2_gl_hash_bucket {
42 struct hlist_head hb_list;
43};
44
45typedef void (*glock_examiner) (struct gfs2_glock * gl);
46
47static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
48static int dump_glock(struct gfs2_glock *gl);
49static int dump_inode(struct gfs2_inode *ip);
50
51#define GFS2_GL_HASH_SHIFT 15
52#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
53#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
54
55static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
56
57/*
58 * Despite what you might think, the numbers below are not arbitrary :-)
59 * They are taken from the ipv4 routing hash code, which is well tested
60 * and thus should be nearly optimal. Later on we might tweak the numbers
61 * but for now this should be fine.
62 *
63 * The reason for putting the locks in a separate array from the list heads
64 * is that we can have fewer locks than list heads and save memory. We use
65 * the same hash function for both, but with a different hash mask.
66 */
67#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
68 defined(CONFIG_PROVE_LOCKING)
69
70#ifdef CONFIG_LOCKDEP
71# define GL_HASH_LOCK_SZ 256
72#else
73# if NR_CPUS >= 32
74# define GL_HASH_LOCK_SZ 4096
75# elif NR_CPUS >= 16
76# define GL_HASH_LOCK_SZ 2048
77# elif NR_CPUS >= 8
78# define GL_HASH_LOCK_SZ 1024
79# elif NR_CPUS >= 4
80# define GL_HASH_LOCK_SZ 512
81# else
82# define GL_HASH_LOCK_SZ 256
83# endif
84#endif
85
86/* We never want more locks than chains */
87#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
88# undef GL_HASH_LOCK_SZ
89# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
90#endif
91
92static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
93
94static inline rwlock_t *gl_lock_addr(unsigned int x)
95{
96 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
97}
98#else /* not SMP, so no spinlocks required */
99static inline rwlock_t *gl_lock_addr(unsigned int x)
100{
101 return NULL;
102}
103#endif
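/*
 * Example of the striping above: with GFS2_GL_HASH_SIZE = 32768 and,
 * say, GL_HASH_LOCK_SZ = 256, gl_lock_addr() masks the bucket number
 * with 255, so each rwlock guards the 128 buckets whose low eight bits
 * match -- far fewer locks than chains, as intended.
 */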
104
105/**
106 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
107 * @actual: the current state of the lock
108 * @requested: the lock state that was requested by the caller
109 * @flags: the modifier flags passed in by the caller
110 *
111 * Returns: 1 if the locks are compatible, 0 otherwise
112 */
113
114static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
115 int flags)
116{
117 if (actual == requested)
118 return 1;
119
120 if (flags & GL_EXACT)
121 return 0;
122
123 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
124 return 1;
125
126 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
127 return 1;
128
129 return 0;
130}
131
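/*
 * For example, relaxed_state_ok() above lets a glock already held in
 * LM_ST_EXCLUSIVE satisfy a new LM_ST_SHARED request unless the caller
 * passed GL_EXACT, and lets any state other than LM_ST_UNLOCKED satisfy
 * a LM_FLAG_ANY request.
 */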
132/**
133 * gl_hash() - Turn glock number into hash bucket number
134 * @sdp: The GFS2 superblock
135 * @name: The lock name
136 * Returns: The number of the corresponding hash bucket
137 */
138
139static unsigned int gl_hash(const struct gfs2_sbd *sdp,
140 const struct lm_lockname *name)
141{
142 unsigned int h;
143
144 h = jhash(&name->ln_number, sizeof(u64), 0);
145 h = jhash(&name->ln_type, sizeof(unsigned int), h);
146 h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
147 h &= GFS2_GL_HASH_MASK;
148
149 return h;
150}
151
152/**
153 * glock_free() - Perform a few checks and then release struct gfs2_glock
154 * @gl: The glock to release
155 *
156 * Also calls lock module to release its internal structure for this glock.
157 *
158 */
159
160static void glock_free(struct gfs2_glock *gl)
161{
162 struct gfs2_sbd *sdp = gl->gl_sbd;
163 struct inode *aspace = gl->gl_aspace;
164
165 gfs2_lm_put_lock(sdp, gl->gl_lock);
166
167 if (aspace)
168 gfs2_aspace_put(aspace);
169
170 kmem_cache_free(gfs2_glock_cachep, gl);
171}
172
173/**
174 * gfs2_glock_hold() - increment reference count on glock
175 * @gl: The glock to hold
176 *
177 */
178
179void gfs2_glock_hold(struct gfs2_glock *gl)
180{
181 atomic_inc(&gl->gl_ref);
182}
183
184/**
185 * gfs2_glock_put() - Decrement reference count on glock
186 * @gl: The glock to put
187 *
188 */
189
190int gfs2_glock_put(struct gfs2_glock *gl)
191{
192 int rv = 0;
193 struct gfs2_sbd *sdp = gl->gl_sbd;
194
195 write_lock(gl_lock_addr(gl->gl_hash));
196 if (atomic_dec_and_test(&gl->gl_ref)) {
197 hlist_del(&gl->gl_list);
198 write_unlock(gl_lock_addr(gl->gl_hash));
199 BUG_ON(spin_is_locked(&gl->gl_spin));
200 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
201 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
202 gfs2_assert(sdp, list_empty(&gl->gl_holders));
203 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
204 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
205 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
206 glock_free(gl);
207 rv = 1;
208 goto out;
209 }
210 write_unlock(gl_lock_addr(gl->gl_hash));
211out:
212 return rv;
213}
214
215/**
216 * queue_empty - check to see if a glock's queue is empty
217 * @gl: the glock
218 * @head: the head of the queue to check
219 *
220 * This function protects the list in the event that a process already
221 * has a holder on the list and is adding a second holder for itself.
222 * The glmutex lock is what generally prevents processes from working
223 * on the same glock at once, but the special case of adding a second
224 * holder for yourself ("recursive" locking) doesn't involve locking
225 * glmutex, making the spin lock necessary.
226 *
227 * Returns: 1 if the queue is empty
228 */
229
230static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
231{
232 int empty;
233 spin_lock(&gl->gl_spin);
234 empty = list_empty(head);
235 spin_unlock(&gl->gl_spin);
236 return empty;
237}
238
239/**
240 * search_bucket() - Find struct gfs2_glock by lock number
241 * @hash: the hash bucket number to search
 * @sdp: The GFS2 superblock
242 * @name: The lock name
243 *
244 * Returns: NULL, or the struct gfs2_glock with the requested number
245 */
246
247static struct gfs2_glock *search_bucket(unsigned int hash,
248 const struct gfs2_sbd *sdp,
249 const struct lm_lockname *name)
250{
251 struct gfs2_glock *gl;
252 struct hlist_node *h;
253
254 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
255 if (!lm_name_equal(&gl->gl_name, name))
256 continue;
257 if (gl->gl_sbd != sdp)
258 continue;
259
260 atomic_inc(&gl->gl_ref);
261
262 return gl;
263 }
264
265 return NULL;
266}
267
268/**
269 * gfs2_glock_find() - Find glock by lock number
270 * @sdp: The GFS2 superblock
271 * @name: The lock name
272 *
273 * Returns: NULL, or the struct gfs2_glock with the requested number
274 */
275
276static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
277 const struct lm_lockname *name)
278{
279 unsigned int hash = gl_hash(sdp, name);
280 struct gfs2_glock *gl;
281
282 read_lock(gl_lock_addr(hash));
283 gl = search_bucket(hash, sdp, name);
284 read_unlock(gl_lock_addr(hash));
285
286 return gl;
287}
288
289/**
290 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
291 * @sdp: The GFS2 superblock
292 * @number: the lock number
293 * @glops: The glock_operations to use
294 * @create: If 0, don't create the glock if it doesn't exist
295 * @glp: the glock is returned here
296 *
297 * This does not lock a glock, just finds/creates structures for one.
298 *
299 * Returns: errno
300 */
301
302int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
303 const struct gfs2_glock_operations *glops, int create,
304 struct gfs2_glock **glp)
305{
306 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
307 struct gfs2_glock *gl, *tmp;
308 unsigned int hash = gl_hash(sdp, &name);
309 int error;
310
311 read_lock(gl_lock_addr(hash));
312 gl = search_bucket(hash, sdp, &name);
313 read_unlock(gl_lock_addr(hash));
314
315 if (gl || !create) {
316 *glp = gl;
317 return 0;
318 }
319
320 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
321 if (!gl)
322 return -ENOMEM;
323
324 gl->gl_flags = 0;
325 gl->gl_name = name;
326 atomic_set(&gl->gl_ref, 1);
327 gl->gl_state = LM_ST_UNLOCKED;
328 gl->gl_hash = hash;
329 gl->gl_owner = NULL;
330 gl->gl_ip = 0;
331 gl->gl_ops = glops;
332 gl->gl_req_gh = NULL;
333 gl->gl_req_bh = NULL;
334 gl->gl_vn = 0;
335 gl->gl_stamp = jiffies;
336 gl->gl_object = NULL;
337 gl->gl_sbd = sdp;
338 gl->gl_aspace = NULL;
339 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
340
341 /* If this glock protects actual on-disk data or metadata blocks,
342 create a VFS inode to manage the pages/buffers holding them. */
343 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
344 gl->gl_aspace = gfs2_aspace_get(sdp);
345 if (!gl->gl_aspace) {
346 error = -ENOMEM;
347 goto fail;
348 }
349 }
350
351 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
352 if (error)
353 goto fail_aspace;
354
355 write_lock(gl_lock_addr(hash));
356 tmp = search_bucket(hash, sdp, &name);
357 if (tmp) {
358 write_unlock(gl_lock_addr(hash));
359 glock_free(gl);
360 gl = tmp;
361 } else {
362 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
363 write_unlock(gl_lock_addr(hash));
364 }
365
366 *glp = gl;
367
368 return 0;
369
370fail_aspace:
371 if (gl->gl_aspace)
372 gfs2_aspace_put(gl->gl_aspace);
373fail:
374 kmem_cache_free(gfs2_glock_cachep, gl);
375 return error;
376}
377
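/*
 * gfs2_glock_get() above follows the classic optimistic-creation
 * pattern: search under the read lock, allocate and initialize with no
 * lock held, then re-search under the write lock and free the new
 * glock if another CPU inserted one first.
 */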
378/**
379 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
380 * @gl: the glock
381 * @state: the state we're requesting
382 * @flags: the modifier flags
383 * @gh: the holder structure
384 *
385 */
386
387void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
388 struct gfs2_holder *gh)
389{
390 INIT_LIST_HEAD(&gh->gh_list);
391 gh->gh_gl = gl;
392 gh->gh_ip = (unsigned long)__builtin_return_address(0);
393 gh->gh_owner = current;
394 gh->gh_state = state;
395 gh->gh_flags = flags;
396 gh->gh_error = 0;
397 gh->gh_iflags = 0;
398 init_completion(&gh->gh_wait);
399
400 if (gh->gh_state == LM_ST_EXCLUSIVE)
401 gh->gh_flags |= GL_LOCAL_EXCL;
402
403 gfs2_glock_hold(gl);
404}
405
406/**
407 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
408 * @state: the state we're requesting
409 * @flags: the modifier flags
410 * @gh: the holder structure
411 *
412 * Don't mess with the glock.
413 *
414 */
415
416void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
417{
418 gh->gh_state = state;
419 gh->gh_flags = flags;
420 if (gh->gh_state == LM_ST_EXCLUSIVE)
421 gh->gh_flags |= GL_LOCAL_EXCL;
422
423 gh->gh_iflags &= 1 << HIF_ALLOCED;
424 gh->gh_ip = (unsigned long)__builtin_return_address(0);
425}
426
427/**
428 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
429 * @gh: the holder structure
430 *
431 */
432
433void gfs2_holder_uninit(struct gfs2_holder *gh)
434{
435 gfs2_glock_put(gh->gh_gl);
436 gh->gh_gl = NULL;
437 gh->gh_ip = 0;
438}
439
440/**
441 * gfs2_holder_get - get a struct gfs2_holder structure
442 * @gl: the glock
443 * @state: the state we're requesting
444 * @flags: the modifier flags
445 * @gfp_flags: the memory allocation flags to use
446 *
447 * Figure out how big an impact this function has. Either:
448 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
449 * 2) Leave it like it is
450 *
451 * Returns: the holder structure, NULL on ENOMEM
452 */
453
454static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
455 unsigned int state,
456 int flags, gfp_t gfp_flags)
457{
458 struct gfs2_holder *gh;
459
460 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
461 if (!gh)
462 return NULL;
463
464 gfs2_holder_init(gl, state, flags, gh);
465 set_bit(HIF_ALLOCED, &gh->gh_iflags);
466 gh->gh_ip = (unsigned long)__builtin_return_address(0);
467 return gh;
468}
469
470/**
471 * gfs2_holder_put - get rid of a struct gfs2_holder structure
472 * @gh: the holder structure
473 *
474 */
475
476static void gfs2_holder_put(struct gfs2_holder *gh)
477{
478 gfs2_holder_uninit(gh);
479 kfree(gh);
480}
481
482/**
483 * rq_mutex - process a mutex request in the queue
484 * @gh: the glock holder
485 *
486 * Returns: 1 if the queue is blocked
487 */
488
489static int rq_mutex(struct gfs2_holder *gh)
490{
491 struct gfs2_glock *gl = gh->gh_gl;
492
493 list_del_init(&gh->gh_list);
494 /* gh->gh_error never examined. */
495 set_bit(GLF_LOCK, &gl->gl_flags);
496 complete(&gh->gh_wait);
497
498 return 1;
499}
500
501/**
502 * rq_promote - process a promote request in the queue
503 * @gh: the glock holder
504 *
505 * Acquire a new inter-node lock, or change a lock state to more restrictive.
506 *
507 * Returns: 1 if the queue is blocked
508 */
509
510static int rq_promote(struct gfs2_holder *gh)
511{
512 struct gfs2_glock *gl = gh->gh_gl;
513 struct gfs2_sbd *sdp = gl->gl_sbd;
514 const struct gfs2_glock_operations *glops = gl->gl_ops;
515
516 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
517 if (list_empty(&gl->gl_holders)) {
518 gl->gl_req_gh = gh;
519 set_bit(GLF_LOCK, &gl->gl_flags);
520 spin_unlock(&gl->gl_spin);
521
522 if (atomic_read(&sdp->sd_reclaim_count) >
523 gfs2_tune_get(sdp, gt_reclaim_limit) &&
524 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
525 gfs2_reclaim_glock(sdp);
526 gfs2_reclaim_glock(sdp);
527 }
528
529 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
530 spin_lock(&gl->gl_spin);
531 }
532 return 1;
533 }
534
535 if (list_empty(&gl->gl_holders)) {
536 set_bit(HIF_FIRST, &gh->gh_iflags);
537 set_bit(GLF_LOCK, &gl->gl_flags);
538 } else {
539 struct gfs2_holder *next_gh;
540 if (gh->gh_flags & GL_LOCAL_EXCL)
541 return 1;
542 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
543 gh_list);
544 if (next_gh->gh_flags & GL_LOCAL_EXCL)
545 return 1;
546 }
547
548 list_move_tail(&gh->gh_list, &gl->gl_holders);
549 gh->gh_error = 0;
550 set_bit(HIF_HOLDER, &gh->gh_iflags);
551
552 complete(&gh->gh_wait);
553
554 return 0;
555}
556
557/**
558 * rq_demote - process a demote request in the queue
559 * @gh: the glock holder
560 *
561 * Returns: 1 if the queue is blocked
562 */
563
564static int rq_demote(struct gfs2_holder *gh)
565{
566 struct gfs2_glock *gl = gh->gh_gl;
567 const struct gfs2_glock_operations *glops = gl->gl_ops;
568
569 if (!list_empty(&gl->gl_holders))
570 return 1;
571
572 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
573 list_del_init(&gh->gh_list);
574 gh->gh_error = 0;
575 spin_unlock(&gl->gl_spin);
576 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
577 gfs2_holder_put(gh);
578 else
579 complete(&gh->gh_wait);
580 spin_lock(&gl->gl_spin);
581 } else {
582 gl->gl_req_gh = gh;
583 set_bit(GLF_LOCK, &gl->gl_flags);
584 spin_unlock(&gl->gl_spin);
585
586 if (gh->gh_state == LM_ST_UNLOCKED ||
587 gl->gl_state != LM_ST_EXCLUSIVE)
588 glops->go_drop_th(gl);
589 else
590 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
591
592 spin_lock(&gl->gl_spin);
593 }
594
595 return 0;
596}
597
598/**
599 * rq_greedy - process a queued request to drop greedy status
600 * @gh: the glock holder
601 *
602 * Returns: 0 (a greedy request never blocks the queue)
603 */
604
605static int rq_greedy(struct gfs2_holder *gh)
606{
607 struct gfs2_glock *gl = gh->gh_gl;
608
609 list_del_init(&gh->gh_list);
610 /* gh->gh_error never examined. */
611 clear_bit(GLF_GREEDY, &gl->gl_flags);
612 spin_unlock(&gl->gl_spin);
613
614 gfs2_holder_uninit(gh);
615 kfree(container_of(gh, struct greedy, gr_gh));
616
617 spin_lock(&gl->gl_spin);
618
619 return 0;
620}
621
622/**
623 * run_queue - process holder structures on a glock
624 * @gl: the glock
625 *
626 */
627static void run_queue(struct gfs2_glock *gl)
628{
629 struct gfs2_holder *gh;
630 int blocked = 1;
631
632 for (;;) {
633 if (test_bit(GLF_LOCK, &gl->gl_flags))
634 break;
635
636 if (!list_empty(&gl->gl_waiters1)) {
637 gh = list_entry(gl->gl_waiters1.next,
638 struct gfs2_holder, gh_list);
639
640 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
641 blocked = rq_mutex(gh);
642 else
643 gfs2_assert_warn(gl->gl_sbd, 0);
644
645 } else if (!list_empty(&gl->gl_waiters2) &&
646 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
647 gh = list_entry(gl->gl_waiters2.next,
648 struct gfs2_holder, gh_list);
649
650 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
651 blocked = rq_demote(gh);
652 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
653 blocked = rq_greedy(gh);
654 else
655 gfs2_assert_warn(gl->gl_sbd, 0);
656
657 } else if (!list_empty(&gl->gl_waiters3)) {
658 gh = list_entry(gl->gl_waiters3.next,
659 struct gfs2_holder, gh_list);
660
661 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
662 blocked = rq_promote(gh);
663 else
664 gfs2_assert_warn(gl->gl_sbd, 0);
665
666 } else
667 break;
668
669 if (blocked)
670 break;
671 }
672}
673
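/*
 * run_queue() above services the three waiter lists in strict priority
 * order: gl_waiters1 (glmutex requests), then gl_waiters2 (demote and
 * greedy requests, unless GLF_SKIP_WAITERS2 is set), then gl_waiters3
 * (promote requests), stopping as soon as a request blocks the queue.
 */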
674/**
675 * gfs2_glmutex_lock - acquire a local lock on a glock
676 * @gl: the glock
677 *
678 * Gives caller exclusive access to manipulate a glock structure.
679 */
680
681static void gfs2_glmutex_lock(struct gfs2_glock *gl)
682{
683 struct gfs2_holder gh;
684
685 gfs2_holder_init(gl, 0, 0, &gh);
686 set_bit(HIF_MUTEX, &gh.gh_iflags);
687
688 spin_lock(&gl->gl_spin);
689 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
690 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
691 } else {
692 gl->gl_owner = current;
693 gl->gl_ip = (unsigned long)__builtin_return_address(0);
694 complete(&gh.gh_wait);
695 }
696 spin_unlock(&gl->gl_spin);
697
698 wait_for_completion(&gh.gh_wait);
699 gfs2_holder_uninit(&gh);
700}
701
702/**
703 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
704 * @gl: the glock
705 *
706 * Returns: 1 if the glock is acquired
707 */
708
709static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
710{
711 int acquired = 1;
712
713 spin_lock(&gl->gl_spin);
714 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
715 acquired = 0;
716 } else {
717 gl->gl_owner = current;
718 gl->gl_ip = (unsigned long)__builtin_return_address(0);
719 }
720 spin_unlock(&gl->gl_spin);
721
722 return acquired;
723}
724
725/**
726 * gfs2_glmutex_unlock - release a local lock on a glock
727 * @gl: the glock
728 *
729 */
730
731static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
732{
733 spin_lock(&gl->gl_spin);
734 clear_bit(GLF_LOCK, &gl->gl_flags);
735 gl->gl_owner = NULL;
736 gl->gl_ip = 0;
737 run_queue(gl);
738 BUG_ON(!spin_is_locked(&gl->gl_spin));
739 spin_unlock(&gl->gl_spin);
740}
741
742/**
743 * handle_callback - add a demote request to a lock's queue
744 * @gl: the glock
745 * @state: the state the caller wants us to change to
746 *
747 * Note: This may fail silently if we are out of memory.
748 */
749
750static void handle_callback(struct gfs2_glock *gl, unsigned int state)
751{
752 struct gfs2_holder *gh, *new_gh = NULL;
753
754restart:
755 spin_lock(&gl->gl_spin);
756
757 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
758 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
759 gl->gl_req_gh != gh) {
760 if (gh->gh_state != state)
761 gh->gh_state = LM_ST_UNLOCKED;
762 goto out;
763 }
764 }
765
766 if (new_gh) {
767 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
768 new_gh = NULL;
769 } else {
770 spin_unlock(&gl->gl_spin);
771
772 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
773 if (!new_gh)
774 return;
775 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
776 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
777
778 goto restart;
779 }
780
781out:
782 spin_unlock(&gl->gl_spin);
783
784 if (new_gh)
785 gfs2_holder_put(new_gh);
786}
787
788void gfs2_glock_inode_squish(struct inode *inode)
789{
790 struct gfs2_holder gh;
791 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
792 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
793 set_bit(HIF_DEMOTE, &gh.gh_iflags);
794 spin_lock(&gl->gl_spin);
795 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
796 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
797 run_queue(gl);
798 spin_unlock(&gl->gl_spin);
799 wait_for_completion(&gh.gh_wait);
800 gfs2_holder_uninit(&gh);
801}
802
803/**
804 * state_change - record that the glock is now in a different state
805 * @gl: the glock
806 * @new_state: the new state
807 *
808 */
809
810static void state_change(struct gfs2_glock *gl, unsigned int new_state)
811{
812 int held1, held2;
813
814 held1 = (gl->gl_state != LM_ST_UNLOCKED);
815 held2 = (new_state != LM_ST_UNLOCKED);
816
817 if (held1 != held2) {
818 if (held2)
819 gfs2_glock_hold(gl);
820 else
821 gfs2_glock_put(gl);
822 }
823
824 gl->gl_state = new_state;
825}
826
827/**
828 * xmote_bh - Called after the lock module is done acquiring a lock
829 * @gl: The glock in question
830 * @ret: the int returned from the lock module
831 *
832 */
833
834static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
835{
836 struct gfs2_sbd *sdp = gl->gl_sbd;
837 const struct gfs2_glock_operations *glops = gl->gl_ops;
838 struct gfs2_holder *gh = gl->gl_req_gh;
839 int prev_state = gl->gl_state;
840 int op_done = 1;
841
842 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
843 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
844 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
845
846 state_change(gl, ret & LM_OUT_ST_MASK);
847
848 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
849 if (glops->go_inval)
850 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
851 } else if (gl->gl_state == LM_ST_DEFERRED) {
852 /* We might not want to do this here.
853 Look at moving to the inode glops. */
854 if (glops->go_inval)
855 glops->go_inval(gl, DIO_DATA);
856 }
857
858 /* Deal with each possible exit condition */
859
860 if (!gh)
861 gl->gl_stamp = jiffies;
862 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
863 spin_lock(&gl->gl_spin);
864 list_del_init(&gh->gh_list);
865 gh->gh_error = -EIO;
866 spin_unlock(&gl->gl_spin);
867 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
868 spin_lock(&gl->gl_spin);
869 list_del_init(&gh->gh_list);
870 if (gl->gl_state == gh->gh_state ||
871 gl->gl_state == LM_ST_UNLOCKED) {
872 gh->gh_error = 0;
873 } else {
874 if (gfs2_assert_warn(sdp, gh->gh_flags &
875 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
876 fs_warn(sdp, "ret = 0x%.8X\n", ret);
877 gh->gh_error = GLR_TRYFAILED;
878 }
879 spin_unlock(&gl->gl_spin);
880
881 if (ret & LM_OUT_CANCELED)
882 handle_callback(gl, LM_ST_UNLOCKED);
883
884 } else if (ret & LM_OUT_CANCELED) {
885 spin_lock(&gl->gl_spin);
886 list_del_init(&gh->gh_list);
887 gh->gh_error = GLR_CANCELED;
888 spin_unlock(&gl->gl_spin);
889
890 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
891 spin_lock(&gl->gl_spin);
892 list_move_tail(&gh->gh_list, &gl->gl_holders);
893 gh->gh_error = 0;
894 set_bit(HIF_HOLDER, &gh->gh_iflags);
895 spin_unlock(&gl->gl_spin);
896
897 set_bit(HIF_FIRST, &gh->gh_iflags);
898
899 op_done = 0;
900
901 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
902 spin_lock(&gl->gl_spin);
903 list_del_init(&gh->gh_list);
904 gh->gh_error = GLR_TRYFAILED;
905 spin_unlock(&gl->gl_spin);
906
907 } else {
908 if (gfs2_assert_withdraw(sdp, 0) == -1)
909 fs_err(sdp, "ret = 0x%.8X\n", ret);
910 }
911
912 if (glops->go_xmote_bh)
913 glops->go_xmote_bh(gl);
914
915 if (op_done) {
916 spin_lock(&gl->gl_spin);
917 gl->gl_req_gh = NULL;
918 gl->gl_req_bh = NULL;
919 clear_bit(GLF_LOCK, &gl->gl_flags);
920 run_queue(gl);
921 spin_unlock(&gl->gl_spin);
922 }
923
924 gfs2_glock_put(gl);
925
926 if (gh) {
927 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
928 gfs2_holder_put(gh);
929 else
930 complete(&gh->gh_wait);
931 }
932}
933
934/**
935 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
936 * @gl: The glock in question
937 * @state: the requested state
938 * @flags: modifier flags to the lock call
939 *
940 */
941
942void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
943{
944 struct gfs2_sbd *sdp = gl->gl_sbd;
945 const struct gfs2_glock_operations *glops = gl->gl_ops;
946 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
947 LM_FLAG_NOEXP | LM_FLAG_ANY |
948 LM_FLAG_PRIORITY);
949 unsigned int lck_ret;
950
951 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
952 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
953 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
954 gfs2_assert_warn(sdp, state != gl->gl_state);
955
956 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
957 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
958
959 gfs2_glock_hold(gl);
960 gl->gl_req_bh = xmote_bh;
961
962 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
963
964 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
965 return;
966
967 if (lck_ret & LM_OUT_ASYNC)
968 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
969 else
970 xmote_bh(gl, lck_ret);
971}
972
973/**
974 * drop_bh - Called after a lock module unlock completes
975 * @gl: the glock
976 * @ret: the return status
977 *
978 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
979 * Doesn't drop the reference on the glock the top half took out
980 *
981 */
982
983static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
984{
985 struct gfs2_sbd *sdp = gl->gl_sbd;
986 const struct gfs2_glock_operations *glops = gl->gl_ops;
987 struct gfs2_holder *gh = gl->gl_req_gh;
988
989 clear_bit(GLF_PREFETCH, &gl->gl_flags);
990
991 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
992 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
993 gfs2_assert_warn(sdp, !ret);
994
995 state_change(gl, LM_ST_UNLOCKED);
996
997 if (glops->go_inval)
998 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
999
1000 if (gh) {
1001 spin_lock(&gl->gl_spin);
1002 list_del_init(&gh->gh_list);
1003 gh->gh_error = 0;
1004 spin_unlock(&gl->gl_spin);
1005 }
1006
1007 if (glops->go_drop_bh)
1008 glops->go_drop_bh(gl);
1009
1010 spin_lock(&gl->gl_spin);
1011 gl->gl_req_gh = NULL;
1012 gl->gl_req_bh = NULL;
1013 clear_bit(GLF_LOCK, &gl->gl_flags);
1014 run_queue(gl);
1015 spin_unlock(&gl->gl_spin);
1016
1017 gfs2_glock_put(gl);
1018
1019 if (gh) {
1020 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1021 gfs2_holder_put(gh);
1022 else
1023 complete(&gh->gh_wait);
1024 }
1025}
1026
1027/**
1028 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1029 * @gl: the glock
1030 *
1031 */
1032
1033void gfs2_glock_drop_th(struct gfs2_glock *gl)
1034{
1035 struct gfs2_sbd *sdp = gl->gl_sbd;
1036 const struct gfs2_glock_operations *glops = gl->gl_ops;
1037 unsigned int ret;
1038
1039 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1040 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1041 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1042
1043 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
1044 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
1045
1046 gfs2_glock_hold(gl);
1047 gl->gl_req_bh = drop_bh;
1048
1049 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1050
1051 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1052 return;
1053
1054 if (!ret)
1055 drop_bh(gl, ret);
1056 else
1057 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1058}
1059
1060/**
1061 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1062 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1063 *
1064 * Don't cancel GL_NOCANCEL requests.
1065 */
1066
1067static void do_cancels(struct gfs2_holder *gh)
1068{
1069 struct gfs2_glock *gl = gh->gh_gl;
1070
1071 spin_lock(&gl->gl_spin);
1072
1073 while (gl->gl_req_gh != gh &&
1074 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1075 !list_empty(&gh->gh_list)) {
1076 if (gl->gl_req_bh && !(gl->gl_req_gh &&
1077 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1078 spin_unlock(&gl->gl_spin);
1079 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1080 msleep(100);
1081 spin_lock(&gl->gl_spin);
1082 } else {
1083 spin_unlock(&gl->gl_spin);
1084 msleep(100);
1085 spin_lock(&gl->gl_spin);
1086 }
1087 }
1088
1089 spin_unlock(&gl->gl_spin);
1090}
1091
1092/**
1093 * glock_wait_internal - wait on a glock acquisition
1094 * @gh: the glock holder
1095 *
1096 * Returns: 0 on success
1097 */
1098
1099static int glock_wait_internal(struct gfs2_holder *gh)
1100{
1101 struct gfs2_glock *gl = gh->gh_gl;
1102 struct gfs2_sbd *sdp = gl->gl_sbd;
1103 const struct gfs2_glock_operations *glops = gl->gl_ops;
1104
1105 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1106 return -EIO;
1107
1108 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1109 spin_lock(&gl->gl_spin);
1110 if (gl->gl_req_gh != gh &&
1111 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1112 !list_empty(&gh->gh_list)) {
1113 list_del_init(&gh->gh_list);
1114 gh->gh_error = GLR_TRYFAILED;
1115 run_queue(gl);
1116 spin_unlock(&gl->gl_spin);
1117 return gh->gh_error;
1118 }
1119 spin_unlock(&gl->gl_spin);
1120 }
1121
1122 if (gh->gh_flags & LM_FLAG_PRIORITY)
1123 do_cancels(gh);
1124
1125 wait_for_completion(&gh->gh_wait);
1126
1127 if (gh->gh_error)
1128 return gh->gh_error;
1129
1130 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1131 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
1132 gh->gh_flags));
1133
1134 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1135 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1136
1137 if (glops->go_lock) {
1138 gh->gh_error = glops->go_lock(gh);
1139 if (gh->gh_error) {
1140 spin_lock(&gl->gl_spin);
1141 list_del_init(&gh->gh_list);
1142 spin_unlock(&gl->gl_spin);
1143 }
1144 }
1145
1146 spin_lock(&gl->gl_spin);
1147 gl->gl_req_gh = NULL;
1148 gl->gl_req_bh = NULL;
1149 clear_bit(GLF_LOCK, &gl->gl_flags);
1150 run_queue(gl);
1151 spin_unlock(&gl->gl_spin);
1152 }
1153
1154 return gh->gh_error;
1155}
1156
1157static inline struct gfs2_holder *
1158find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1159{
1160 struct gfs2_holder *gh;
1161
1162 list_for_each_entry(gh, head, gh_list) {
1163 if (gh->gh_owner == owner)
1164 return gh;
1165 }
1166
1167 return NULL;
1168}
1169
1170/**
1171 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1172 * @gh: the holder structure to add
1173 *
1174 */
1175
1176static void add_to_queue(struct gfs2_holder *gh)
1177{
1178 struct gfs2_glock *gl = gh->gh_gl;
1179 struct gfs2_holder *existing;
1180
1181 BUG_ON(!gh->gh_owner);
1182
1183 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1184 if (existing) {
1185 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1186 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
1187 printk(KERN_INFO "lock type : %d lock state : %d\n",
1188 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1189 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1190 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
1191 printk(KERN_INFO "lock type : %d lock state : %d\n",
1192 gl->gl_name.ln_type, gl->gl_state);
1193 BUG();
1194 }
1195
1196 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1197 if (existing) {
1198 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1199 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1200 BUG();
1201 }
1202
1203 if (gh->gh_flags & LM_FLAG_PRIORITY)
1204 list_add(&gh->gh_list, &gl->gl_waiters3);
1205 else
1206 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1207}
1208
1209/**
1210 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1211 * @gh: the holder structure
1212 *
1213 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1214 *
1215 * Returns: 0, GLR_TRYFAILED, or errno on failure
1216 */
1217
1218int gfs2_glock_nq(struct gfs2_holder *gh)
1219{
1220 struct gfs2_glock *gl = gh->gh_gl;
1221 struct gfs2_sbd *sdp = gl->gl_sbd;
1222 int error = 0;
1223
1224restart:
1225 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1226 set_bit(HIF_ABORTED, &gh->gh_iflags);
1227 return -EIO;
1228 }
1229
1230 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1231
1232 spin_lock(&gl->gl_spin);
1233 add_to_queue(gh);
1234 run_queue(gl);
1235 spin_unlock(&gl->gl_spin);
1236
1237 if (!(gh->gh_flags & GL_ASYNC)) {
1238 error = glock_wait_internal(gh);
1239 if (error == GLR_CANCELED) {
1240 msleep(100);
1241 goto restart;
1242 }
1243 }
1244
1245 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1246
1247 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1248 dump_glock(gl);
1249
1250 return error;
1251}
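/*
 * Illustrative usage (annotation, not part of this patch): a
 * synchronous acquire/release cycle with a trylock flag.  The glock
 * pointer "gl" is assumed to come from gfs2_glock_get().
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
 *	error = gfs2_glock_nq(&gh);
 *	if (error) {
 *		gfs2_holder_uninit(&gh);
 *		return error;
 *	}
 *	... the glock is held in LM_ST_SHARED here ...
 *	gfs2_glock_dq_uninit(&gh);
 *
 * With LM_FLAG_TRY the error may be GLR_TRYFAILED, which a real
 * caller would usually retry rather than propagate.
 */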
1252
1253/**
1254 * gfs2_glock_poll - poll to see if an async request has been completed
1255 * @gh: the holder
1256 *
1257 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1258 */
1259
1260int gfs2_glock_poll(struct gfs2_holder *gh)
1261{
1262 struct gfs2_glock *gl = gh->gh_gl;
1263 int ready = 0;
1264
1265 spin_lock(&gl->gl_spin);
1266
1267 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1268 ready = 1;
1269 else if (list_empty(&gh->gh_list)) {
1270 if (gh->gh_error == GLR_CANCELED) {
1271 spin_unlock(&gl->gl_spin);
1272 msleep(100);
1273 if (gfs2_glock_nq(gh))
1274 return 1;
1275 return 0;
1276 } else
1277 ready = 1;
1278 }
1279
1280 spin_unlock(&gl->gl_spin);
1281
1282 return ready;
1283}
1284
1285/**
1286 * gfs2_glock_wait - wait for an asynchronous (GL_ASYNC) lock acquisition
1287 * @gh: the holder structure
1288 *
1289 * Returns: 0, GLR_TRYFAILED, or errno on failure
1290 */
1291
1292int gfs2_glock_wait(struct gfs2_holder *gh)
1293{
1294 int error;
1295
1296 error = glock_wait_internal(gh);
1297 if (error == GLR_CANCELED) {
1298 msleep(100);
1299 gh->gh_flags &= ~GL_ASYNC;
1300 error = gfs2_glock_nq(gh);
1301 }
1302
1303 return error;
1304}
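/*
 * Illustrative sketch (annotation, not part of this patch): the
 * GL_ASYNC pattern.  gfs2_glock_nq() queues the request without
 * blocking (it never fails when GL_ASYNC is set), completion is
 * detected with gfs2_glock_poll() and collected with
 * gfs2_glock_wait().
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, GL_ASYNC, &gh);
 *	gfs2_glock_nq(&gh);
 *	... do other work while the request is in flight ...
 *	while (!gfs2_glock_poll(&gh))
 *		msleep(10);
 *	error = gfs2_glock_wait(&gh);
 *	if (!error) {
 *		... the glock is held here ...
 *		gfs2_glock_dq(&gh);
 *	}
 *	gfs2_holder_uninit(&gh);
 */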
1305
1306/**
1307 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1308 * @gh: the glock holder
1309 *
1310 */
1311
1312void gfs2_glock_dq(struct gfs2_holder *gh)
1313{
1314 struct gfs2_glock *gl = gh->gh_gl;
1315 const struct gfs2_glock_operations *glops = gl->gl_ops;
1316
1317 if (gh->gh_flags & GL_NOCACHE)
1318 handle_callback(gl, LM_ST_UNLOCKED);
1319
1320 gfs2_glmutex_lock(gl);
1321
1322 spin_lock(&gl->gl_spin);
1323 list_del_init(&gh->gh_list);
1324
1325 if (list_empty(&gl->gl_holders)) {
1326 spin_unlock(&gl->gl_spin);
1327
1328 if (glops->go_unlock)
1329 glops->go_unlock(gh);
1330
1331 gl->gl_stamp = jiffies;
1332
1333 spin_lock(&gl->gl_spin);
1334 }
1335
1336 clear_bit(GLF_LOCK, &gl->gl_flags);
1337 run_queue(gl);
1338 spin_unlock(&gl->gl_spin);
1339}
1340
1341/**
1342 * gfs2_glock_prefetch - Try to prefetch a glock
1343 * @gl: the glock
1344 * @state: the state to prefetch in
1345 * @flags: flags passed to go_xmote_th()
1346 *
1347 */
1348
1349static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1350 int flags)
1351{
1352 const struct gfs2_glock_operations *glops = gl->gl_ops;
1353
1354 spin_lock(&gl->gl_spin);
1355
1356 if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
1357 !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
1358 !list_empty(&gl->gl_waiters3) ||
1359 relaxed_state_ok(gl->gl_state, state, flags)) {
1360 spin_unlock(&gl->gl_spin);
1361 return;
1362 }
1363
1364 set_bit(GLF_PREFETCH, &gl->gl_flags);
1365 set_bit(GLF_LOCK, &gl->gl_flags);
1366 spin_unlock(&gl->gl_spin);
1367
1368 glops->go_xmote_th(gl, state, flags);
1369}
1370
1371static void greedy_work(void *data)
1372{
1373 struct greedy *gr = data;
1374 struct gfs2_holder *gh = &gr->gr_gh;
1375 struct gfs2_glock *gl = gh->gh_gl;
1376 const struct gfs2_glock_operations *glops = gl->gl_ops;
1377
1378 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1379
1380 if (glops->go_greedy)
1381 glops->go_greedy(gl);
1382
1383 spin_lock(&gl->gl_spin);
1384
1385 if (list_empty(&gl->gl_waiters2)) {
1386 clear_bit(GLF_GREEDY, &gl->gl_flags);
1387 spin_unlock(&gl->gl_spin);
1388 gfs2_holder_uninit(gh);
1389 kfree(gr);
1390 } else {
1391 gfs2_glock_hold(gl);
1392 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1393 run_queue(gl);
1394 spin_unlock(&gl->gl_spin);
1395 gfs2_glock_put(gl);
1396 }
1397}
1398
1399/**
1400 * gfs2_glock_be_greedy - hold on to a glock greedily for a while
1401 * @gl: the glock
1402 * @time: how long to stay greedy, in jiffies
1403 *
1404 * Returns: 0 if go_greedy will be called, 1 otherwise
1405 */
1406
1407int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1408{
1409 struct greedy *gr;
1410 struct gfs2_holder *gh;
1411
1412 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1413 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1414 return 1;
1415
1416 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1417 if (!gr) {
1418 clear_bit(GLF_GREEDY, &gl->gl_flags);
1419 return 1;
1420 }
1421 gh = &gr->gr_gh;
1422
1423 gfs2_holder_init(gl, 0, 0, gh);
1424 set_bit(HIF_GREEDY, &gh->gh_iflags);
1425 INIT_WORK(&gr->gr_work, greedy_work, gr);
1426
1427 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1428 schedule_delayed_work(&gr->gr_work, time);
1429
1430 return 0;
1431}
1432
1433/**
1434 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
1435 * @gh: the holder structure
1436 *
1437 */
1438
1439void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1440{
1441 gfs2_glock_dq(gh);
1442 gfs2_holder_uninit(gh);
1443}
1444
1445/**
1446 * gfs2_glock_nq_num - acquire a glock based on lock number
1447 * @sdp: the filesystem
1448 * @number: the lock number
1449 * @glops: the glock operations for the type of glock
1450 * @state: the state to acquire the glock in
1451 * @flags: modifier flags for the acquisition
1452 * @gh: the struct gfs2_holder
1453 *
1454 * Returns: errno
1455 */
1456
1457int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
1458 const struct gfs2_glock_operations *glops,
1459 unsigned int state, int flags, struct gfs2_holder *gh)
1460{
1461 struct gfs2_glock *gl;
1462 int error;
1463
1464 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1465 if (!error) {
1466 error = gfs2_glock_nq_init(gl, state, flags, gh);
1467 gfs2_glock_put(gl);
1468 }
1469
1470 return error;
1471}
1472
1473/**
1474 * glock_compare - Compare two struct gfs2_holder structures for sorting
1475 * @arg_a: the first structure
1476 * @arg_b: the second structure
1477 *
1478 */
1479
1480static int glock_compare(const void *arg_a, const void *arg_b)
1481{
1482 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1483 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1484 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1485 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1486
1487 if (a->ln_number > b->ln_number)
1488 return 1;
1489 if (a->ln_number < b->ln_number)
1490 return -1;
1491 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1492 return 1;
1493 if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
1494 return 1;
1495 return 0;
1496}
1497
1498/**
1499 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1500 * @num_gh: the number of structures
1501 * @ghs: an array of struct gfs2_holder structures
1502 *
1503 * Returns: 0 on success (all glocks acquired),
1504 * errno on failure (no glocks acquired)
1505 */
1506
1507static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1508 struct gfs2_holder **p)
1509{
1510 unsigned int x;
1511 int error = 0;
1512
1513 for (x = 0; x < num_gh; x++)
1514 p[x] = &ghs[x];
1515
1516 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1517
1518 for (x = 0; x < num_gh; x++) {
1519 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1520
1521 error = gfs2_glock_nq(p[x]);
1522 if (error) {
1523 while (x--)
1524 gfs2_glock_dq(p[x]);
1525 break;
1526 }
1527 }
1528
1529 return error;
1530}
1531
1532/**
1533 * gfs2_glock_nq_m - acquire multiple glocks
1534 * @num_gh: the number of structures
1535 * @ghs: an array of struct gfs2_holder structures
1536 *
1537 * Figure out how big an impact this function has. Either:
1538 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1539 * 2) Forget async stuff and just call nq_m_sync()
1540 * 3) Leave it like it is
1541 *
1542 * Returns: 0 on success (all glocks acquired),
1543 * errno on failure (no glocks acquired)
1544 */
1545
1546int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1547{
1548 int *e;
1549 unsigned int x;
1550 int borked = 0, serious = 0;
1551 int error = 0;
1552
1553 if (!num_gh)
1554 return 0;
1555
1556 if (num_gh == 1) {
1557 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1558 return gfs2_glock_nq(ghs);
1559 }
1560
1561 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1562 if (!e)
1563 return -ENOMEM;
1564
1565 for (x = 0; x < num_gh; x++) {
1566 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1567 error = gfs2_glock_nq(&ghs[x]);
1568 if (error) {
1569 borked = 1;
1570 serious = error;
1571 num_gh = x;
1572 break;
1573 }
1574 }
1575
1576 for (x = 0; x < num_gh; x++) {
1577 error = e[x] = glock_wait_internal(&ghs[x]);
1578 if (error) {
1579 borked = 1;
1580 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1581 serious = error;
1582 }
1583 }
1584
1585 if (!borked) {
1586 kfree(e);
1587 return 0;
1588 }
1589
1590 for (x = 0; x < num_gh; x++)
1591 if (!e[x])
1592 gfs2_glock_dq(&ghs[x]);
1593
1594 if (serious)
1595 error = serious;
1596 else {
1597 for (x = 0; x < num_gh; x++)
1598 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1599 &ghs[x]);
1600 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1601 }
1602
1603 kfree(e);
1604
1605 return error;
1606}
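/*
 * Illustrative sketch (annotation, not part of this patch): taking
 * two inode glocks deadlock-free, as a rename between two directories
 * would need.  ip1 and ip2 are hypothetical struct gfs2_inode
 * pointers.
 *
 *	struct gfs2_holder ghs[2];
 *	int error;
 *
 *	gfs2_holder_init(ip1->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
 *	gfs2_holder_init(ip2->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[1]);
 *	error = gfs2_glock_nq_m(2, ghs);
 *	if (!error) {
 *		... both glocks held ...
 *		gfs2_glock_dq_uninit_m(2, ghs);
 *	} else {
 *		gfs2_holder_uninit(&ghs[0]);
 *		gfs2_holder_uninit(&ghs[1]);
 *	}
 *
 * On success all of the glocks are held, on failure none are; the
 * internal sort by lock number is what makes the order safe.
 */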
1607
1608/**
1609 * gfs2_glock_dq_m - release multiple glocks
1610 * @num_gh: the number of structures
1611 * @ghs: an array of struct gfs2_holder structures
1612 *
1613 */
1614
1615void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1616{
1617 unsigned int x;
1618
1619 for (x = 0; x < num_gh; x++)
1620 gfs2_glock_dq(&ghs[x]);
1621}
1622
1623/**
1624 * gfs2_glock_dq_uninit_m - release multiple glocks and uninitialize their holders
1625 * @num_gh: the number of structures
1626 * @ghs: an array of struct gfs2_holder structures
1627 *
1628 */
1629
1630void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1631{
1632 unsigned int x;
1633
1634 for (x = 0; x < num_gh; x++)
1635 gfs2_glock_dq_uninit(&ghs[x]);
1636}
1637
1638/**
1639 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1640 * @sdp: the filesystem
1641 * @number: the lock number
1642 * @glops: the glock operations for the type of glock
1643 * @state: the state to acquire the glock in
1644 * @flags: modifier flags for the acquisition
1647 */
1648
1649void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
1650 const struct gfs2_glock_operations *glops,
1651 unsigned int state, int flags)
1652{
1653 struct gfs2_glock *gl;
1654 int error;
1655
1656 if (atomic_read(&sdp->sd_reclaim_count) <
1657 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1658 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1659 if (!error) {
1660 gfs2_glock_prefetch(gl, state, flags);
1661 gfs2_glock_put(gl);
1662 }
1663 }
1664}
1665
1666/**
1667 * gfs2_lvb_hold - attach a LVB to a glock
1668 * @gl: The glock in question
1669 *
1670 */
1671
1672int gfs2_lvb_hold(struct gfs2_glock *gl)
1673{
1674 int error;
1675
1676 gfs2_glmutex_lock(gl);
1677
1678 if (!atomic_read(&gl->gl_lvb_count)) {
1679 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1680 if (error) {
1681 gfs2_glmutex_unlock(gl);
1682 return error;
1683 }
1684 gfs2_glock_hold(gl);
1685 }
1686 atomic_inc(&gl->gl_lvb_count);
1687
1688 gfs2_glmutex_unlock(gl);
1689
1690 return 0;
1691}
1692
1693/**
1694 * gfs2_lvb_unhold - detach a LVB from a glock
1695 * @gl: The glock in question
1696 *
1697 */
1698
1699void gfs2_lvb_unhold(struct gfs2_glock *gl)
1700{
1701 gfs2_glock_hold(gl);
1702 gfs2_glmutex_lock(gl);
1703
1704 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1705 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1706 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1707 gl->gl_lvb = NULL;
1708 gfs2_glock_put(gl);
1709 }
1710
1711 gfs2_glmutex_unlock(gl);
1712 gfs2_glock_put(gl);
1713}
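/*
 * Illustrative sketch (annotation, not part of this patch): holding
 * a Lock Value Block.  The LVB layout itself is whatever the users of
 * the lock agree on.
 *
 *	int error;
 *
 *	error = gfs2_lvb_hold(gl);
 *	if (error)
 *		return error;
 *	... gl->gl_lvb now points at the lock value block ...
 *	gfs2_lvb_unhold(gl);
 */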
1714
1715static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1716 unsigned int state)
1717{
1718 struct gfs2_glock *gl;
1719
1720 gl = gfs2_glock_find(sdp, name);
1721 if (!gl)
1722 return;
1723
1724 if (gl->gl_ops->go_callback)
1725 gl->gl_ops->go_callback(gl, state);
1726 handle_callback(gl, state);
1727
1728 spin_lock(&gl->gl_spin);
1729 run_queue(gl);
1730 spin_unlock(&gl->gl_spin);
1731
1732 gfs2_glock_put(gl);
1733}
1734
1735/**
1736 * gfs2_glock_cb - Callback used by locking module
1737 * @cb_data: Pointer to the superblock
1738 * @type: Type of callback
1739 * @data: Type dependent data pointer
1740 *
1741 * Called by the locking module when it wants to tell us something.
1742 * Either we need to drop a lock, one of our ASYNC requests completed, or
1743 * a journal from another client needs to be recovered.
1744 */
1745
1746void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1747{
1748 struct gfs2_sbd *sdp = cb_data;
1749
1750 switch (type) {
1751 case LM_CB_NEED_E:
1752 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1753 return;
1754
1755 case LM_CB_NEED_D:
1756 blocking_cb(sdp, data, LM_ST_DEFERRED);
1757 return;
1758
1759 case LM_CB_NEED_S:
1760 blocking_cb(sdp, data, LM_ST_SHARED);
1761 return;
1762
1763 case LM_CB_ASYNC: {
1764 struct lm_async_cb *async = data;
1765 struct gfs2_glock *gl;
1766
1767 gl = gfs2_glock_find(sdp, &async->lc_name);
1768 if (gfs2_assert_warn(sdp, gl))
1769 return;
1770 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1771 gl->gl_req_bh(gl, async->lc_ret);
1772 gfs2_glock_put(gl);
1773 return;
1774 }
1775
1776 case LM_CB_NEED_RECOVERY:
1777 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1778 if (sdp->sd_recoverd_process)
1779 wake_up_process(sdp->sd_recoverd_process);
1780 return;
1781
1782 case LM_CB_DROPLOCKS:
1783 gfs2_gl_hash_clear(sdp, NO_WAIT);
1784 gfs2_quota_scan(sdp);
1785 return;
1786
1787 default:
1788 gfs2_assert_warn(sdp, 0);
1789 return;
1790 }
1791}
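/*
 * Illustrative sketch (annotation, not part of this patch): how a
 * lock module might ask this node to demote a lock to SHARED.  The
 * lock name below is hypothetical.
 *
 *	struct lm_lockname name = {
 *		.ln_number = 0x1234,
 *		.ln_type = LM_TYPE_INODE,
 *	};
 *
 *	gfs2_glock_cb(sdp, LM_CB_NEED_S, &name);
 */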
1792
1793/**
1794 * demote_ok - Check to see if it's ok to unlock a glock
1795 * @gl: the glock
1796 *
1797 * Returns: 1 if it's ok
1798 */
1799
1800static int demote_ok(struct gfs2_glock *gl)
1801{
1802 struct gfs2_sbd *sdp = gl->gl_sbd;
1803 const struct gfs2_glock_operations *glops = gl->gl_ops;
1804 int demote = 1;
1805
1806 if (test_bit(GLF_STICKY, &gl->gl_flags))
1807 demote = 0;
1808 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1809 demote = time_after_eq(jiffies, gl->gl_stamp +
1810 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1811 else if (glops->go_demote_ok)
1812 demote = glops->go_demote_ok(gl);
1813
1814 return demote;
1815}
1816
1817/**
1818 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1819 * @gl: the glock
1820 *
1821 */
1822
1823void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1824{
1825 struct gfs2_sbd *sdp = gl->gl_sbd;
1826
1827 spin_lock(&sdp->sd_reclaim_lock);
1828 if (list_empty(&gl->gl_reclaim)) {
1829 gfs2_glock_hold(gl);
1830 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1831 atomic_inc(&sdp->sd_reclaim_count);
1832 }
1833 spin_unlock(&sdp->sd_reclaim_lock);
1834
1835 wake_up(&sdp->sd_reclaim_wq);
1836}
1837
1838/**
1839 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1840 * @sdp: the filesystem
1841 *
1842 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1843 * different glock and we notice that there are a lot of glocks in the
1844 * reclaim list.
1845 *
1846 */
1847
1848void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1849{
1850 struct gfs2_glock *gl;
1851
1852 spin_lock(&sdp->sd_reclaim_lock);
1853 if (list_empty(&sdp->sd_reclaim_list)) {
1854 spin_unlock(&sdp->sd_reclaim_lock);
1855 return;
1856 }
1857 gl = list_entry(sdp->sd_reclaim_list.next,
1858 struct gfs2_glock, gl_reclaim);
1859 list_del_init(&gl->gl_reclaim);
1860 spin_unlock(&sdp->sd_reclaim_lock);
1861
1862 atomic_dec(&sdp->sd_reclaim_count);
1863 atomic_inc(&sdp->sd_reclaimed);
1864
1865 if (gfs2_glmutex_trylock(gl)) {
1866 if (queue_empty(gl, &gl->gl_holders) &&
1867 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1868 handle_callback(gl, LM_ST_UNLOCKED);
1869 gfs2_glmutex_unlock(gl);
1870 }
1871
1872 gfs2_glock_put(gl);
1873}
1874
1875/**
1876 * examine_bucket - Call a function for each glock in a hash bucket
1877 * @examiner: the function
1878 * @sdp: the filesystem
1879 * @hash: the hash bucket index
1880 *
1881 * Returns: 1 if the bucket has entries
1882 */
1883
1884static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1885 unsigned int hash)
1886{
1887 struct gfs2_glock *gl, *prev = NULL;
1888 int has_entries = 0;
1889 struct hlist_head *head = &gl_hash_table[hash].hb_list;
1890
1891 read_lock(gl_lock_addr(hash));
1892 /* Can't use hlist_for_each_entry - don't want prefetch here */
1893 if (hlist_empty(head))
1894 goto out;
1895 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1896 while(1) {
1897 if (gl->gl_sbd == sdp) {
1898 gfs2_glock_hold(gl);
1899 read_unlock(gl_lock_addr(hash));
1900 if (prev)
1901 gfs2_glock_put(prev);
1902 prev = gl;
1903 examiner(gl);
1904 has_entries = 1;
1905 read_lock(gl_lock_addr(hash));
1906 }
1907 if (gl->gl_list.next == NULL)
1908 break;
1909 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1910 }
1911out:
1912 read_unlock(gl_lock_addr(hash));
1913 if (prev)
1914 gfs2_glock_put(prev);
1915 return has_entries;
1916}
1917
1918/**
1919 * scan_glock - look at a glock and see if we can reclaim it
1920 * @gl: the glock to look at
1921 *
1922 */
1923
1924static void scan_glock(struct gfs2_glock *gl)
1925{
1926 if (gl->gl_ops == &gfs2_inode_glops)
1927 return;
1928
1929 if (gfs2_glmutex_trylock(gl)) {
1930 if (queue_empty(gl, &gl->gl_holders) &&
1931 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1932 goto out_schedule;
1933 gfs2_glmutex_unlock(gl);
1934 }
1935 return;
1936
1937out_schedule:
1938 gfs2_glmutex_unlock(gl);
1939 gfs2_glock_schedule_for_reclaim(gl);
1940}
1941
1942/**
1943 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1944 * @sdp: the filesystem
1945 *
1946 */
1947
1948void gfs2_scand_internal(struct gfs2_sbd *sdp)
1949{
1950 unsigned int x;
1951
1952 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1953 examine_bucket(scan_glock, sdp, x);
1954}
1955
1956/**
1957 * clear_glock - look at a glock and see if we can free it from glock cache
1958 * @gl: the glock to look at
1959 *
1960 */
1961
1962static void clear_glock(struct gfs2_glock *gl)
1963{
1964 struct gfs2_sbd *sdp = gl->gl_sbd;
1965 int released;
1966
1967 spin_lock(&sdp->sd_reclaim_lock);
1968 if (!list_empty(&gl->gl_reclaim)) {
1969 list_del_init(&gl->gl_reclaim);
1970 atomic_dec(&sdp->sd_reclaim_count);
1971 spin_unlock(&sdp->sd_reclaim_lock);
1972 released = gfs2_glock_put(gl);
1973 gfs2_assert(sdp, !released);
1974 } else {
1975 spin_unlock(&sdp->sd_reclaim_lock);
1976 }
1977
1978 if (gfs2_glmutex_trylock(gl)) {
1979 if (queue_empty(gl, &gl->gl_holders) &&
1980 gl->gl_state != LM_ST_UNLOCKED)
1981 handle_callback(gl, LM_ST_UNLOCKED);
1982 gfs2_glmutex_unlock(gl);
1983 }
1984}
1985
1986/**
1987 * gfs2_gl_hash_clear - Empty out the glock hash table
1988 * @sdp: the filesystem
1989 * @wait: wait until it's all gone
1990 *
1991 * Called when unmounting the filesystem, or when the inter-node lock manager
1992 * requests DROPLOCKS because it is running out of capacity.
1993 */
1994
1995void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1996{
1997 unsigned long t;
1998 unsigned int x;
1999 int cont;
2000
2001 t = jiffies;
2002
2003 for (;;) {
2004 cont = 0;
2005 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2006 if (examine_bucket(clear_glock, sdp, x))
2007 cont = 1;
2008 }
2009
2010 if (!wait || !cont)
2011 break;
2012
2013 if (time_after_eq(jiffies,
2014 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2015 fs_warn(sdp, "Unmount seems to be stalled. "
2016 "Dumping lock state...\n");
2017 gfs2_dump_lockstate(sdp);
2018 t = jiffies;
2019 }
2020
2021 invalidate_inodes(sdp->sd_vfs);
2022 msleep(10);
2023 }
2024}
2025
2026/*
2027 * Diagnostic routines to help debug distributed deadlock
2028 */
2029
2030/**
2031 * dump_holder - print information about a glock holder
2032 * @str: a string naming the type of holder
2033 * @gh: the glock holder
2034 *
2035 * Returns: 0 on success
2036 */
2037
2038static int dump_holder(char *str, struct gfs2_holder *gh)
2039{
2040 unsigned int x;
2041 int error = -ENOBUFS;
2042
2043 printk(KERN_INFO " %s\n", str);
2044 printk(KERN_INFO " owner = %ld\n",
2045 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2046 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2047 printk(KERN_INFO " gh_flags =");
2048 for (x = 0; x < 32; x++)
2049 if (gh->gh_flags & (1 << x))
2050 printk(" %u", x);
2051 printk(" \n");
2052 printk(KERN_INFO " error = %d\n", gh->gh_error);
2053 printk(KERN_INFO " gh_iflags =");
2054 for (x = 0; x < 32; x++)
2055 if (test_bit(x, &gh->gh_iflags))
2056 printk(" %u", x);
2057 printk(" \n");
2058 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2059
2060 error = 0;
2061
2062 return error;
2063}
2064
2065/**
2066 * dump_inode - print information about an inode
2067 * @ip: the inode
2068 *
2069 * Returns: 0 on success
2070 */
2071
2072static int dump_inode(struct gfs2_inode *ip)
2073{
2074 unsigned int x;
2075 int error = -ENOBUFS;
2076
2077 printk(KERN_INFO " Inode:\n");
2078 printk(KERN_INFO " num = %llu %llu\n",
2079 (unsigned long long)ip->i_num.no_formal_ino,
2080 (unsigned long long)ip->i_num.no_addr);
2081 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2082 printk(KERN_INFO " i_flags =");
2083 for (x = 0; x < 32; x++)
2084 if (test_bit(x, &ip->i_flags))
2085 printk(" %u", x);
2086 printk(" \n");
2087
2088 error = 0;
2089
2090 return error;
2091}
2092
2093/**
2094 * dump_glock - print information about a glock
2095 * @gl: the glock
2097 *
2098 * Returns: 0 on success, -ENOBUFS when we run out of space
2099 */
2100
2101static int dump_glock(struct gfs2_glock *gl)
2102{
2103 struct gfs2_holder *gh;
2104 unsigned int x;
2105 int error = -ENOBUFS;
2106
2107 spin_lock(&gl->gl_spin);
2108
2109 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
2110 (unsigned long long)gl->gl_name.ln_number);
2111 printk(KERN_INFO " gl_flags =");
2112 for (x = 0; x < 32; x++) {
2113 if (test_bit(x, &gl->gl_flags))
2114 printk(" %u", x);
2115 }
2116 printk(" \n");
2117 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref));
2118 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2119 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
2120 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2121 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2122 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2123 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2124 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2125 printk(KERN_INFO " le = %s\n",
2126 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2127 printk(KERN_INFO " reclaim = %s\n",
2128 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2129 if (gl->gl_aspace)
2130 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
2131 gl->gl_aspace->i_mapping->nrpages);
2132 else
2133 printk(KERN_INFO " aspace = no\n");
2134 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2135 if (gl->gl_req_gh) {
2136 error = dump_holder("Request", gl->gl_req_gh);
2137 if (error)
2138 goto out;
2139 }
2140 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2141 error = dump_holder("Holder", gh);
2142 if (error)
2143 goto out;
2144 }
2145 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2146 error = dump_holder("Waiter1", gh);
2147 if (error)
2148 goto out;
2149 }
2150 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2151 error = dump_holder("Waiter2", gh);
2152 if (error)
2153 goto out;
2154 }
2155 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2156 error = dump_holder("Waiter3", gh);
2157 if (error)
2158 goto out;
2159 }
2160 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2161 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2162 list_empty(&gl->gl_holders)) {
2163 error = dump_inode(gl->gl_object);
2164 if (error)
2165 goto out;
2166 } else {
2167 error = -ENOBUFS;
2168 printk(KERN_INFO " Inode: busy\n");
2169 }
2170 }
2171
2172 error = 0;
2173
2174out:
2175 spin_unlock(&gl->gl_spin);
2176 return error;
2177}
2178
2179/**
2180 * gfs2_dump_lockstate - print out the current lockstate
2181 * @sdp: the filesystem
2182 *
2183 * Dumps the lockstate to the console.
2185 *
2186 */
2187
2188static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2189{
2190 struct gfs2_glock *gl;
2191 struct hlist_node *h;
2192 unsigned int x;
2193 int error = 0;
2194
2195 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2196
2197 read_lock(gl_lock_addr(x));
2198
2199 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
2200 if (gl->gl_sbd != sdp)
2201 continue;
2202
2203 error = dump_glock(gl);
2204 if (error)
2205 break;
2206 }
2207
2208 read_unlock(gl_lock_addr(x));
2209
2210 if (error)
2211 break;
2212 }
2213
2214
2215 return error;
2216}
2217
2218int __init gfs2_glock_init(void)
2219{
2220 unsigned i;
2221 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
2222 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
2223 }
2224#ifdef GL_HASH_LOCK_SZ
2225 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
2226 rwlock_init(&gl_hash_locks[i]);
2227 }
2228#endif
2229 return 0;
2230}
2231
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13#include "incore.h"
14
15/* Flags for lock requests; used in gfs2_holder gh_flags field.
16 From lm_interface.h:
17#define LM_FLAG_TRY 0x00000001
18#define LM_FLAG_TRY_1CB 0x00000002
19#define LM_FLAG_NOEXP 0x00000004
20#define LM_FLAG_ANY 0x00000008
21#define LM_FLAG_PRIORITY 0x00000010 */
22
23#define GL_LOCAL_EXCL 0x00000020
24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400
29#define GL_NOCANCEL 0x00001000
30#define GL_AOP 0x00004000
31#define GL_DUMP 0x00008000
32
33#define GLR_TRYFAILED 13
34#define GLR_CANCELED 14
35
36static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
37{
38 struct gfs2_holder *gh;
39 int locked = 0;
40
41 /* Look in glock's list of holders for one with current task as owner */
42 spin_lock(&gl->gl_spin);
43 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
44 if (gh->gh_owner == current) {
45 locked = 1;
46 break;
47 }
48 }
49 spin_unlock(&gl->gl_spin);
50
51 return locked;
52}
53
54static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
55{
56 return gl->gl_state == LM_ST_EXCLUSIVE;
57}
58
59static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
60{
61 return gl->gl_state == LM_ST_DEFERRED;
62}
63
64static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
65{
66 return gl->gl_state == LM_ST_SHARED;
67}
68
69static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
70{
71 int ret;
72 spin_lock(&gl->gl_spin);
73 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
74 spin_unlock(&gl->gl_spin);
75 return ret;
76}
77
78int gfs2_glock_get(struct gfs2_sbd *sdp,
79 u64 number, const struct gfs2_glock_operations *glops,
80 int create, struct gfs2_glock **glp);
81void gfs2_glock_hold(struct gfs2_glock *gl);
82int gfs2_glock_put(struct gfs2_glock *gl);
83void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
84 struct gfs2_holder *gh);
85void gfs2_holder_reinit(unsigned int state, unsigned flags,
86 struct gfs2_holder *gh);
87void gfs2_holder_uninit(struct gfs2_holder *gh);
88
89void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
90void gfs2_glock_drop_th(struct gfs2_glock *gl);
91
92int gfs2_glock_nq(struct gfs2_holder *gh);
93int gfs2_glock_poll(struct gfs2_holder *gh);
94int gfs2_glock_wait(struct gfs2_holder *gh);
95void gfs2_glock_dq(struct gfs2_holder *gh);
96
97int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
98
99void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
100int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
101 u64 number, const struct gfs2_glock_operations *glops,
102 unsigned int state, int flags, struct gfs2_holder *gh);
103
104int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
106void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
107
108void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
109 const struct gfs2_glock_operations *glops,
110 unsigned int state, int flags);
111void gfs2_glock_inode_squish(struct inode *inode);
112
113/**
114 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
115 * @gl: the glock
116 * @state: the state we're requesting
117 * @flags: the modifier flags
118 * @gh: the holder structure
119 *
120 * Returns: 0, GLR_*, or errno
121 */
122
123static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
124 unsigned int state, int flags,
125 struct gfs2_holder *gh)
126{
127 int error;
128
129 gfs2_holder_init(gl, state, flags, gh);
130
131 error = gfs2_glock_nq(gh);
132 if (error)
133 gfs2_holder_uninit(gh);
134
135 return error;
136}
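/* Illustrative usage (annotation, not part of this header):
   gfs2_glock_nq_init() pairs with gfs2_glock_dq_uninit(), so a
   protected region reduces to the following; "ip" is a hypothetical
   struct gfs2_inode pointer.

	struct gfs2_holder gh;
	int error;

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
	if (error)
		return error;
	... protected region ...
	gfs2_glock_dq_uninit(&gh);

   On failure the holder has already been uninitialized, so no
   cleanup is needed on the error path. */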
137
138/* Lock Value Block functions */
139
140int gfs2_lvb_hold(struct gfs2_glock *gl);
141void gfs2_lvb_unhold(struct gfs2_glock *gl);
142
143void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
144
145void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
146void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
147
148void gfs2_scand_internal(struct gfs2_sbd *sdp);
149void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
150
151int __init gfs2_glock_init(void);
152
153#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "recovery.h"
27#include "rgrp.h"
28#include "util.h"
29#include "trans.h"
30
31/**
32 * ail_empty_gl - remove all buffers for a given lock from the AIL
33 * @gl: the glock
34 *
35 * None of the buffers should be dirty, locked, or pinned.
36 */
37
38static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
39{
40 struct gfs2_sbd *sdp = gl->gl_sbd;
41 unsigned int blocks;
42 struct list_head *head = &gl->gl_ail_list;
43 struct gfs2_bufdata *bd;
44 struct buffer_head *bh;
45 u64 blkno;
46 int error;
47
48 blocks = atomic_read(&gl->gl_ail_count);
49 if (!blocks)
50 return;
51
52 error = gfs2_trans_begin(sdp, 0, blocks);
53 if (gfs2_assert_withdraw(sdp, !error))
54 return;
55
56 gfs2_log_lock(sdp);
57 while (!list_empty(head)) {
58 bd = list_entry(head->next, struct gfs2_bufdata,
59 bd_ail_gl_list);
60 bh = bd->bd_bh;
61 blkno = bh->b_blocknr;
62 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
63
64 bd->bd_ail = NULL;
65 list_del(&bd->bd_ail_st_list);
66 list_del(&bd->bd_ail_gl_list);
67 atomic_dec(&gl->gl_ail_count);
68 brelse(bh);
69 gfs2_log_unlock(sdp);
70
71 gfs2_trans_add_revoke(sdp, blkno);
72
73 gfs2_log_lock(sdp);
74 }
75 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
76 gfs2_log_unlock(sdp);
77
78 gfs2_trans_end(sdp);
79 gfs2_log_flush(sdp, NULL);
80}
81
82/**
83 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
84 * @gl: the glock
85 *
86 */
87
88static void gfs2_pte_inval(struct gfs2_glock *gl)
89{
90 struct gfs2_inode *ip;
91 struct inode *inode;
92
93 ip = gl->gl_object;
94 if (!ip || !S_ISREG(ip->i_di.di_mode))
95 return;
96 inode = &ip->i_inode;
97
98 if (!test_bit(GIF_PAGED, &ip->i_flags))
99 return;
100
101 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
102
103 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
104 set_bit(GLF_DIRTY, &gl->gl_flags);
105
106 clear_bit(GIF_SW_PAGED, &ip->i_flags);
107}
108
109/**
110 * gfs2_page_inval - Invalidate all pages associated with a glock
111 * @gl: the glock
112 *
113 */
114
115static void gfs2_page_inval(struct gfs2_glock *gl)
116{
117 struct gfs2_inode *ip;
118 struct inode *inode;
119
120 ip = gl->gl_object;
121 if (!ip || !S_ISREG(ip->i_di.di_mode))
122 return;
123 inode = &ip->i_inode;
124
125 truncate_inode_pages(inode->i_mapping, 0);
126 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
127 clear_bit(GIF_PAGED, &ip->i_flags);
128}
129
130/**
131 * gfs2_page_wait - Wait for writeback of data
132 * @gl: the glock
133 *
134 * Syncs data (not metadata) for a regular file.
135 * No-op for all other types.
136 */
137
138static void gfs2_page_wait(struct gfs2_glock *gl)
139{
140 struct gfs2_inode *ip = gl->gl_object;
141 struct inode *inode = &ip->i_inode;
142 struct address_space *mapping = inode->i_mapping;
143 int error;
144
145 if (!S_ISREG(ip->i_di.di_mode))
146 return;
147
148 error = filemap_fdatawait(mapping);
149
150 /* Put back any errors cleared by filemap_fdatawait()
151 so they can be caught by someone who can pass them
152 up to user space. */
153
154 if (error == -ENOSPC)
155 set_bit(AS_ENOSPC, &mapping->flags);
156 else if (error)
157 set_bit(AS_EIO, &mapping->flags);
158
159}
160
161static void gfs2_page_writeback(struct gfs2_glock *gl)
162{
163 struct gfs2_inode *ip = gl->gl_object;
164 struct inode *inode = &ip->i_inode;
165 struct address_space *mapping = inode->i_mapping;
166
167 if (!S_ISREG(ip->i_di.di_mode))
168 return;
169
170 filemap_fdatawrite(mapping);
171}
172
173/**
174 * meta_go_sync - sync out the metadata for this glock
175 * @gl: the glock
176 * @flags: DIO_*
177 *
178 * Called when demoting or unlocking an EX glock. We must flush
179 * to disk all dirty buffers/pages relating to this glock, and must not
180 * return to the caller to demote/unlock the glock until I/O is complete.
181 */
182
183static void meta_go_sync(struct gfs2_glock *gl, int flags)
184{
185 if (!(flags & DIO_METADATA))
186 return;
187
188 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
189 gfs2_log_flush(gl->gl_sbd, gl);
190 gfs2_meta_sync(gl);
191 if (flags & DIO_RELEASE)
192 gfs2_ail_empty_gl(gl);
193 }
194
195}
196
197/**
198 * meta_go_inval - invalidate the metadata for this glock
199 * @gl: the glock
200 * @flags:
201 *
202 */
203
204static void meta_go_inval(struct gfs2_glock *gl, int flags)
205{
206 if (!(flags & DIO_METADATA))
207 return;
208
209 gfs2_meta_inval(gl);
210 gl->gl_vn++;
211}
212
213/**
214 * inode_go_xmote_th - promote/demote a glock
215 * @gl: the glock
216 * @state: the requested state
217 * @flags:
218 *
219 */
220
221static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
222 int flags)
223{
224 if (gl->gl_state != LM_ST_UNLOCKED)
225 gfs2_pte_inval(gl);
226 gfs2_glock_xmote_th(gl, state, flags);
227}
228
229/**
230 * inode_go_xmote_bh - After promoting/demoting a glock
231 * @gl: the glock
232 *
233 */
234
235static void inode_go_xmote_bh(struct gfs2_glock *gl)
236{
237 struct gfs2_holder *gh = gl->gl_req_gh;
238 struct buffer_head *bh;
239 int error;
240
241 if (gl->gl_state != LM_ST_UNLOCKED &&
242 (!gh || !(gh->gh_flags & GL_SKIP))) {
243 error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
244 if (!error)
245 brelse(bh);
246 }
247}
248
249/**
250 * inode_go_drop_th - unlock a glock
251 * @gl: the glock
252 *
253 * Invoked from rq_demote().
254 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for
255 * too long) is being purged from our node's glock cache; we drop the lock.
256 */
257
258static void inode_go_drop_th(struct gfs2_glock *gl)
259{
260 gfs2_pte_inval(gl);
261 gfs2_glock_drop_th(gl);
262}
263
264/**
265 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
266 * @gl: the glock protecting the inode
267 * @flags:
268 *
269 */
270
271static void inode_go_sync(struct gfs2_glock *gl, int flags)
272{
273 int meta = (flags & DIO_METADATA);
274 int data = (flags & DIO_DATA);
275
276 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
277 if (meta && data) {
278 gfs2_page_writeback(gl);
279 gfs2_log_flush(gl->gl_sbd, gl);
280 gfs2_meta_sync(gl);
281 gfs2_page_wait(gl);
282 clear_bit(GLF_DIRTY, &gl->gl_flags);
283 } else if (meta) {
284 gfs2_log_flush(gl->gl_sbd, gl);
285 gfs2_meta_sync(gl);
286 } else if (data) {
287 gfs2_page_writeback(gl);
288 gfs2_page_wait(gl);
289 }
290 if (flags & DIO_RELEASE)
291 gfs2_ail_empty_gl(gl);
292 }
293}
294
295/**
296 * inode_go_inval - prepare an inode glock to be released
297 * @gl: the glock
298 * @flags:
299 *
300 */
301
302static void inode_go_inval(struct gfs2_glock *gl, int flags)
303{
304 int meta = (flags & DIO_METADATA);
305 int data = (flags & DIO_DATA);
306
307 if (meta) {
308 gfs2_meta_inval(gl);
309 gl->gl_vn++;
310 }
311 if (data)
312 gfs2_page_inval(gl);
313}
314
315/**
316 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
317 * @gl: the glock
318 *
319 * Returns: 1 if it's ok
320 */
321
322static int inode_go_demote_ok(struct gfs2_glock *gl)
323{
324 struct gfs2_sbd *sdp = gl->gl_sbd;
325 int demote = 0;
326
327 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
328 demote = 1;
329 else if (!sdp->sd_args.ar_localcaching &&
330 time_after_eq(jiffies, gl->gl_stamp +
331 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
332 demote = 1;
333
334 return demote;
335}
336
337/**
338 * inode_go_lock - operation done after an inode lock is locked by a process
339 * @gl: the glock
340 * @flags:
341 *
342 * Returns: errno
343 */
344
345static int inode_go_lock(struct gfs2_holder *gh)
346{
347 struct gfs2_glock *gl = gh->gh_gl;
348 struct gfs2_inode *ip = gl->gl_object;
349 int error = 0;
350
351 if (!ip)
352 return 0;
353
354 if (ip->i_vn != gl->gl_vn) {
355 error = gfs2_inode_refresh(ip);
356 if (error)
357 return error;
358 gfs2_inode_attr_in(ip);
359 }
360
361 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
362 (gl->gl_state == LM_ST_EXCLUSIVE) &&
363 (gh->gh_flags & GL_LOCAL_EXCL))
364 error = gfs2_truncatei_resume(ip);
365
366 return error;
367}
368
369/**
370 * inode_go_unlock - operation done before an inode lock is unlocked by a
371 * process
372 * @gl: the glock
373 * @flags:
374 *
375 */
376
377static void inode_go_unlock(struct gfs2_holder *gh)
378{
379 struct gfs2_glock *gl = gh->gh_gl;
380 struct gfs2_inode *ip = gl->gl_object;
381
382 if (ip == NULL)
383 return;
384 if (test_bit(GLF_DIRTY, &gl->gl_flags))
385 gfs2_inode_attr_in(ip);
386 gfs2_meta_cache_flush(ip);
387}
388
389/**
390 * inode_greedy - adjust the inode's greedy time based on page fault frequency
391 * @gl: the glock
392 *
393 */
394
395static void inode_greedy(struct gfs2_glock *gl)
396{
397 struct gfs2_sbd *sdp = gl->gl_sbd;
398 struct gfs2_inode *ip = gl->gl_object;
399 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
400 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
401 unsigned int new_time;
402
403 spin_lock(&ip->i_spin);
404
405 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
406 new_time = ip->i_greedy + quantum;
407 if (new_time > max)
408 new_time = max;
409 } else {
410 new_time = ip->i_greedy - quantum;
411 if (!new_time || new_time > max)
412 new_time = 1;
413 }
414
415 ip->i_greedy = new_time;
416
417 spin_unlock(&ip->i_spin);
418
419 iput(&ip->i_inode);
420}
421
422/**
423 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
424 * @gl: the glock
425 *
426 * Returns: 1 if it's ok
427 */
428
429static int rgrp_go_demote_ok(struct gfs2_glock *gl)
430{
431 return !gl->gl_aspace->i_mapping->nrpages;
432}
433
434/**
435 * rgrp_go_lock - operation done after an rgrp lock is locked by
436 * the first holder on this node.
437 * @gl: the glock
438 * @flags:
439 *
440 * Returns: errno
441 */
442
443static int rgrp_go_lock(struct gfs2_holder *gh)
444{
445 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
446}
447
448/**
449 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
450 * the last holder on this node.
451 * @gl: the glock
452 * @flags:
453 *
454 */
455
456static void rgrp_go_unlock(struct gfs2_holder *gh)
457{
458 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
459}
460
461/**
462 * trans_go_xmote_th - promote/demote the transaction glock
463 * @gl: the glock
464 * @state: the requested state
465 * @flags:
466 *
467 */
468
469static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
470 int flags)
471{
472 struct gfs2_sbd *sdp = gl->gl_sbd;
473
474 if (gl->gl_state != LM_ST_UNLOCKED &&
475 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
476 gfs2_meta_syncfs(sdp);
477 gfs2_log_shutdown(sdp);
478 }
479
480 gfs2_glock_xmote_th(gl, state, flags);
481}
482
483/**
484 * trans_go_xmote_bh - After promoting/demoting the transaction glock
485 * @gl: the glock
486 *
487 */
488
489static void trans_go_xmote_bh(struct gfs2_glock *gl)
490{
491 struct gfs2_sbd *sdp = gl->gl_sbd;
492 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
493 struct gfs2_glock *j_gl = ip->i_gl;
494 struct gfs2_log_header head;
495 int error;
496
497 if (gl->gl_state != LM_ST_UNLOCKED &&
498 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
499 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
500 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
501
502 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
503 if (error)
504 gfs2_consist(sdp);
505 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
506 gfs2_consist(sdp);
507
508 /* Initialize some head of the log stuff */
509 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
510 sdp->sd_log_sequence = head.lh_sequence + 1;
511 gfs2_log_pointers_init(sdp, head.lh_blkno);
512 }
513 }
514}
515
516/**
517 * trans_go_drop_th - unlock the transaction glock
518 * @gl: the glock
519 *
520 * We want to sync the device even with localcaching. Remember
521 * that localcaching journal replay only marks buffers dirty.
522 */
523
524static void trans_go_drop_th(struct gfs2_glock *gl)
525{
526 struct gfs2_sbd *sdp = gl->gl_sbd;
527
528 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
529 gfs2_meta_syncfs(sdp);
530 gfs2_log_shutdown(sdp);
531 }
532
533 gfs2_glock_drop_th(gl);
534}
535
536/**
537 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
538 * @gl: the glock
539 *
540 * Returns: 1 if it's ok
541 */
542
543static int quota_go_demote_ok(struct gfs2_glock *gl)
544{
545 return !atomic_read(&gl->gl_lvb_count);
546}
547
548const struct gfs2_glock_operations gfs2_meta_glops = {
549 .go_xmote_th = gfs2_glock_xmote_th,
550 .go_drop_th = gfs2_glock_drop_th,
551 .go_type = LM_TYPE_META,
552};
553
554const struct gfs2_glock_operations gfs2_inode_glops = {
555 .go_xmote_th = inode_go_xmote_th,
556 .go_xmote_bh = inode_go_xmote_bh,
557 .go_drop_th = inode_go_drop_th,
558 .go_sync = inode_go_sync,
559 .go_inval = inode_go_inval,
560 .go_demote_ok = inode_go_demote_ok,
561 .go_lock = inode_go_lock,
562 .go_unlock = inode_go_unlock,
563 .go_greedy = inode_greedy,
564 .go_type = LM_TYPE_INODE,
565};
566
567const struct gfs2_glock_operations gfs2_rgrp_glops = {
568 .go_xmote_th = gfs2_glock_xmote_th,
569 .go_drop_th = gfs2_glock_drop_th,
570 .go_sync = meta_go_sync,
571 .go_inval = meta_go_inval,
572 .go_demote_ok = rgrp_go_demote_ok,
573 .go_lock = rgrp_go_lock,
574 .go_unlock = rgrp_go_unlock,
575 .go_type = LM_TYPE_RGRP,
576};
577
578const struct gfs2_glock_operations gfs2_trans_glops = {
579 .go_xmote_th = trans_go_xmote_th,
580 .go_xmote_bh = trans_go_xmote_bh,
581 .go_drop_th = trans_go_drop_th,
582 .go_type = LM_TYPE_NONDISK,
583};
584
585const struct gfs2_glock_operations gfs2_iopen_glops = {
586 .go_xmote_th = gfs2_glock_xmote_th,
587 .go_drop_th = gfs2_glock_drop_th,
588 .go_type = LM_TYPE_IOPEN,
589};
590
591const struct gfs2_glock_operations gfs2_flock_glops = {
592 .go_xmote_th = gfs2_glock_xmote_th,
593 .go_drop_th = gfs2_glock_drop_th,
594 .go_type = LM_TYPE_FLOCK,
595};
596
597const struct gfs2_glock_operations gfs2_nondisk_glops = {
598 .go_xmote_th = gfs2_glock_xmote_th,
599 .go_drop_th = gfs2_glock_drop_th,
600 .go_type = LM_TYPE_NONDISK,
601};
602
603const struct gfs2_glock_operations gfs2_quota_glops = {
604 .go_xmote_th = gfs2_glock_xmote_th,
605 .go_drop_th = gfs2_glock_drop_th,
606 .go_demote_ok = quota_go_demote_ok,
607 .go_type = LM_TYPE_QUOTA,
608};
609
610const struct gfs2_glock_operations gfs2_journal_glops = {
611 .go_xmote_th = gfs2_glock_xmote_th,
612 .go_drop_th = gfs2_glock_drop_th,
613 .go_type = LM_TYPE_JOURNAL,
614};
615
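/*
 * Illustrative sketch (annotation, not part of this file): judging by
 * the three-field tables above, a minimal operations vector needs only
 * go_xmote_th, go_drop_th and go_type; the other hooks are checked for
 * NULL before glock.c invokes them.  A hypothetical new type would be:
 *
 *	static const struct gfs2_glock_operations example_glops = {
 *		.go_xmote_th = gfs2_glock_xmote_th,
 *		.go_drop_th = gfs2_glock_drop_th,
 *		.go_type = LM_TYPE_NONDISK,
 *	};
 */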
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13#include "incore.h"
14
15extern const struct gfs2_glock_operations gfs2_meta_glops;
16extern const struct gfs2_glock_operations gfs2_inode_glops;
17extern const struct gfs2_glock_operations gfs2_rgrp_glops;
18extern const struct gfs2_glock_operations gfs2_trans_glops;
19extern const struct gfs2_glock_operations gfs2_iopen_glops;
20extern const struct gfs2_glock_operations gfs2_flock_glops;
21extern const struct gfs2_glock_operations gfs2_nondisk_glops;
22extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops;
24
25#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#include <linux/fs.h>
14
15#define DIO_WAIT 0x00000010
16#define DIO_METADATA 0x00000020
17#define DIO_DATA 0x00000040
18#define DIO_RELEASE 0x00000080
19#define DIO_ALL 0x00000100
20
21struct gfs2_log_operations;
22struct gfs2_log_element;
23struct gfs2_holder;
24struct gfs2_glock;
25struct gfs2_quota_data;
26struct gfs2_trans;
27struct gfs2_ail;
28struct gfs2_jdesc;
29struct gfs2_sbd;
30
31typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
32
33/*
34 * Structure of operations that are associated with each
35 * type of element in the log.
36 */
37
38struct gfs2_log_operations {
39 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
40 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
41 void (*lo_before_commit) (struct gfs2_sbd *sdp);
42 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
43 void (*lo_before_scan) (struct gfs2_jdesc *jd,
44 struct gfs2_log_header *head, int pass);
45 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
46 struct gfs2_log_descriptor *ld, __be64 *ptr,
47 int pass);
48 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
49 const char *lo_name;
50};
51
52struct gfs2_log_element {
53 struct list_head le_list;
54 const struct gfs2_log_operations *le_ops;
55};
56
57struct gfs2_bitmap {
58 struct buffer_head *bi_bh;
59 char *bi_clone;
60 u32 bi_offset;
61 u32 bi_start;
62 u32 bi_len;
63};
64
65struct gfs2_rgrpd {
66 struct list_head rd_list; /* Link with superblock */
67 struct list_head rd_list_mru;
68 struct list_head rd_recent; /* Recently used rgrps */
69 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
70 struct gfs2_rindex rd_ri;
71 struct gfs2_rgrp rd_rg;
72 u64 rd_rg_vn;
73 struct gfs2_bitmap *rd_bits;
74 unsigned int rd_bh_count;
75 struct mutex rd_mutex;
76 u32 rd_free_clone;
77 struct gfs2_log_element rd_le;
78 u32 rd_last_alloc_data;
79 u32 rd_last_alloc_meta;
80 struct gfs2_sbd *rd_sbd;
81};
82
83enum gfs2_state_bits {
84 BH_Pinned = BH_PrivateStart,
85 BH_Escaped = BH_PrivateStart + 1,
86};
87
88BUFFER_FNS(Pinned, pinned)
89TAS_BUFFER_FNS(Pinned, pinned)
90BUFFER_FNS(Escaped, escaped)
91TAS_BUFFER_FNS(Escaped, escaped)
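/*
 * For reference, BUFFER_FNS() and TAS_BUFFER_FNS() come from
 * <linux/buffer_head.h> and generate the usual buffer-flag accessors.
 * Roughly (a sketch of the BH_Pinned expansion, not verbatim):
 *
 *	static inline void set_buffer_pinned(struct buffer_head *bh)
 *	{
 *		set_bit(BH_Pinned, &bh->b_state);
 *	}
 *	static inline int buffer_pinned(const struct buffer_head *bh)
 *	{
 *		return test_bit(BH_Pinned, &bh->b_state);
 *	}
 *	static inline int test_set_buffer_pinned(struct buffer_head *bh)
 *	{
 *		return test_and_set_bit(BH_Pinned, &bh->b_state);
 *	}
 */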
92
93struct gfs2_bufdata {
94 struct buffer_head *bd_bh;
95 struct gfs2_glock *bd_gl;
96
97 struct list_head bd_list_tr;
98 struct gfs2_log_element bd_le;
99
100 struct gfs2_ail *bd_ail;
101 struct list_head bd_ail_st_list;
102 struct list_head bd_ail_gl_list;
103};
104
105struct gfs2_glock_operations {
106 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
107 int flags);
108 void (*go_xmote_bh) (struct gfs2_glock * gl);
109 void (*go_drop_th) (struct gfs2_glock * gl);
110 void (*go_drop_bh) (struct gfs2_glock * gl);
111 void (*go_sync) (struct gfs2_glock * gl, int flags);
112 void (*go_inval) (struct gfs2_glock * gl, int flags);
113 int (*go_demote_ok) (struct gfs2_glock * gl);
114 int (*go_lock) (struct gfs2_holder * gh);
115 void (*go_unlock) (struct gfs2_holder * gh);
116 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
117 void (*go_greedy) (struct gfs2_glock * gl);
118 const int go_type;
119};
120
121enum {
122 /* Actions */
123 HIF_MUTEX = 0,
124 HIF_PROMOTE = 1,
125 HIF_DEMOTE = 2,
126 HIF_GREEDY = 3,
127
128 /* States */
129 HIF_ALLOCED = 4,
130 HIF_DEALLOC = 5,
131 HIF_HOLDER = 6,
132 HIF_FIRST = 7,
133 HIF_ABORTED = 9,
134};
135
136struct gfs2_holder {
137 struct list_head gh_list;
138
139 struct gfs2_glock *gh_gl;
140 struct task_struct *gh_owner;
141 unsigned int gh_state;
142 unsigned gh_flags;
143
144 int gh_error;
145 unsigned long gh_iflags;
146 struct completion gh_wait;
147 unsigned long gh_ip;
148};
149
150enum {
151 GLF_LOCK = 1,
152 GLF_STICKY = 2,
153 GLF_PREFETCH = 3,
154 GLF_DIRTY = 5,
155 GLF_SKIP_WAITERS2 = 6,
156 GLF_GREEDY = 7,
157};
158
159struct gfs2_glock {
160 struct hlist_node gl_list;
161 unsigned long gl_flags; /* GLF_... */
162 struct lm_lockname gl_name;
163 atomic_t gl_ref;
164
165 spinlock_t gl_spin;
166
167 unsigned int gl_state;
168 unsigned int gl_hash;
169 struct task_struct *gl_owner;
170 unsigned long gl_ip;
171 struct list_head gl_holders;
172 struct list_head gl_waiters1; /* HIF_MUTEX */
173 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
174 struct list_head gl_waiters3; /* HIF_PROMOTE */
175
176 const struct gfs2_glock_operations *gl_ops;
177
178 struct gfs2_holder *gl_req_gh;
179 gfs2_glop_bh_t gl_req_bh;
180
181 void *gl_lock;
182 char *gl_lvb;
183 atomic_t gl_lvb_count;
184
185 u64 gl_vn;
186 unsigned long gl_stamp;
187 void *gl_object;
188
189 struct list_head gl_reclaim;
190
191 struct gfs2_sbd *gl_sbd;
192
193 struct inode *gl_aspace;
194 struct gfs2_log_element gl_le;
195 struct list_head gl_ail_list;
196 atomic_t gl_ail_count;
197};
198
199struct gfs2_alloc {
200 /* Quota stuff */
201
202 struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
203 struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
204 unsigned int al_qd_num;
205
206 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
207 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
208
209 /* Filled in by gfs2_inplace_reserve() */
210
211 unsigned int al_line;
212 char *al_file;
213 struct gfs2_holder al_ri_gh;
214 struct gfs2_holder al_rgd_gh;
215 struct gfs2_rgrpd *al_rgd;
216
217};
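/*
 * The usual life cycle of a gfs2_alloc, as exercised by alloc_dinode()
 * and link_dinode() in inode.c later in this patch, is (sketch):
 *
 *	al = gfs2_alloc_get(ip);
 *	al->al_requested = blocks;		allocation size wanted
 *	error = gfs2_inplace_reserve(ip);
 *	... allocate (e.g. gfs2_alloc_di()) inside a transaction ...
 *	gfs2_inplace_release(ip);
 *	gfs2_alloc_put(ip);
 */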
218
219enum {
220 GIF_QD_LOCKED = 1,
221 GIF_PAGED = 2,
222 GIF_SW_PAGED = 3,
223};
224
225struct gfs2_inode {
226 struct inode i_inode;
227 struct gfs2_inum i_num;
228
229 unsigned long i_flags; /* GIF_... */
230
231 u64 i_vn;
232 struct gfs2_dinode i_di; /* To be replaced by ref to block */
233
234 struct gfs2_glock *i_gl; /* Move into i_gh? */
235 struct gfs2_holder i_iopen_gh;
236 struct gfs2_holder i_gh; /* for prepare/commit_write only */
237 struct gfs2_alloc i_alloc;
238 u64 i_last_rg_alloc;
239
240 spinlock_t i_spin;
241 struct rw_semaphore i_rw_mutex;
242 unsigned int i_greedy;
243 unsigned long i_last_pfault;
244
245 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
246};
247
248/*
249 * Since i_inode is the first element of struct gfs2_inode,
250 * this is effectively a cast.
251 */
252static inline struct gfs2_inode *GFS2_I(struct inode *inode)
253{
254 return container_of(inode, struct gfs2_inode, i_inode);
255}
256
257/* To be removed? */
258static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
259{
260 return inode->i_sb->s_fs_info;
261}
262
263enum {
264 GFF_DID_DIRECT_ALLOC = 0,
265 GFF_EXLOCK = 1,
266};
267
268struct gfs2_file {
269 unsigned long f_flags; /* GFF_... */
270 struct mutex f_fl_mutex;
271 struct gfs2_holder f_fl_gh;
272};
273
274struct gfs2_revoke {
275 struct gfs2_log_element rv_le;
276 u64 rv_blkno;
277};
278
279struct gfs2_revoke_replay {
280 struct list_head rr_list;
281 u64 rr_blkno;
282 unsigned int rr_where;
283};
284
285enum {
286 QDF_USER = 0,
287 QDF_CHANGE = 1,
288 QDF_LOCKED = 2,
289};
290
291struct gfs2_quota_lvb {
292 __be32 qb_magic;
293 u32 __pad;
294 __be64 qb_limit; /* Hard limit of # blocks to alloc */
295 __be64 qb_warn; /* Warn user when alloc is above this # */
296 __be64 qb_value; /* Current # blocks allocated */
297};
298
299struct gfs2_quota_data {
300 struct list_head qd_list;
301 unsigned int qd_count;
302
303 u32 qd_id;
304 unsigned long qd_flags; /* QDF_... */
305
306 s64 qd_change;
307 s64 qd_change_sync;
308
309 unsigned int qd_slot;
310 unsigned int qd_slot_count;
311
312 struct buffer_head *qd_bh;
313 struct gfs2_quota_change *qd_bh_qc;
314 unsigned int qd_bh_count;
315
316 struct gfs2_glock *qd_gl;
317 struct gfs2_quota_lvb qd_qb;
318
319 u64 qd_sync_gen;
320 unsigned long qd_last_warn;
321 unsigned long qd_last_touched;
322};
323
324struct gfs2_log_buf {
325 struct list_head lb_list;
326 struct buffer_head *lb_bh;
327 struct buffer_head *lb_real;
328};
329
330struct gfs2_trans {
331 unsigned long tr_ip;
332
333 unsigned int tr_blocks;
334 unsigned int tr_revokes;
335 unsigned int tr_reserved;
336
337 struct gfs2_holder tr_t_gh;
338
339 int tr_touched;
340
341 unsigned int tr_num_buf;
342 unsigned int tr_num_buf_new;
343 unsigned int tr_num_buf_rm;
344 struct list_head tr_list_buf;
345
346 unsigned int tr_num_revoke;
347 unsigned int tr_num_revoke_rm;
348};
349
350struct gfs2_ail {
351 struct list_head ai_list;
352
353 unsigned int ai_first;
354 struct list_head ai_ail1_list;
355 struct list_head ai_ail2_list;
356
357 u64 ai_sync_gen;
358};
359
360struct gfs2_jdesc {
361 struct list_head jd_list;
362
363 struct inode *jd_inode;
364 unsigned int jd_jid;
365 int jd_dirty;
366
367 unsigned int jd_blocks;
368};
369
370#define GFS2_GLOCKD_DEFAULT 1
371#define GFS2_GLOCKD_MAX 16
372
373#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
374#define GFS2_QUOTA_OFF 0
375#define GFS2_QUOTA_ACCOUNT 1
376#define GFS2_QUOTA_ON 2
377
378#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
379#define GFS2_DATA_WRITEBACK 1
380#define GFS2_DATA_ORDERED 2
381
382struct gfs2_args {
383 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
384 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
385 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
386 int ar_spectator; /* Don't get a journal because we're always RO */
387 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
388 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
389 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
390 int ar_debug; /* Oops on errors instead of trying to be graceful */
391 int ar_upgrade; /* Upgrade ondisk/multihost format */
392 unsigned int ar_num_glockd; /* Number of glockd threads */
393 int ar_posix_acl; /* Enable posix acls */
394 int ar_quota; /* off/account/on */
395 int ar_suiddir; /* suiddir support */
396 int ar_data; /* ordered/writeback */
397};
398
399struct gfs2_tune {
400 spinlock_t gt_spin;
401
402 unsigned int gt_ilimit;
403 unsigned int gt_ilimit_tries;
404 unsigned int gt_ilimit_min;
405 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
406 unsigned int gt_incore_log_blocks;
407 unsigned int gt_log_flush_secs;
408 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
409
410 unsigned int gt_scand_secs;
411 unsigned int gt_recoverd_secs;
412 unsigned int gt_logd_secs;
413 unsigned int gt_quotad_secs;
414
415 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
416 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
417 unsigned int gt_quota_scale_num; /* Numerator */
418 unsigned int gt_quota_scale_den; /* Denominator */
419 unsigned int gt_quota_cache_secs;
420 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
421 unsigned int gt_atime_quantum; /* Min secs between atime updates */
422 unsigned int gt_new_files_jdata;
423 unsigned int gt_new_files_directio;
424 unsigned int gt_max_atomic_write; /* Split big writes into this size */
425 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
426 unsigned int gt_lockdump_size;
427 unsigned int gt_stall_secs; /* Detects trouble! */
428 unsigned int gt_complain_secs;
429 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
430 unsigned int gt_entries_per_readdir;
431 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
432 unsigned int gt_greedy_default;
433 unsigned int gt_greedy_quantum;
434 unsigned int gt_greedy_max;
435 unsigned int gt_statfs_quantum;
436 unsigned int gt_statfs_slow;
437};
438
439enum {
440 SDF_JOURNAL_CHECKED = 0,
441 SDF_JOURNAL_LIVE = 1,
442 SDF_SHUTDOWN = 2,
443 SDF_NOATIME = 3,
444};
445
446#define GFS2_FSNAME_LEN 256
447
448struct gfs2_sbd {
449 struct super_block *sd_vfs;
450 struct super_block *sd_vfs_meta;
451 struct kobject sd_kobj;
452 unsigned long sd_flags; /* SDF_... */
453 struct gfs2_sb sd_sb;
454
455 /* Constants computed on mount */
456
457 u32 sd_fsb2bb;
458 u32 sd_fsb2bb_shift;
459 u32 sd_diptrs; /* Number of pointers in a dinode */
460	u32 sd_inptrs;	/* Number of pointers in an indirect block */
461 u32 sd_jbsize; /* Size of a journaled data block */
462 u32 sd_hash_bsize; /* sizeof(exhash block) */
463 u32 sd_hash_bsize_shift;
464 u32 sd_hash_ptrs; /* Number of pointers in a hash block */
465 u32 sd_qc_per_block;
466 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
467 u32 sd_max_height; /* Max height of a file's metadata tree */
468 u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
469 u32 sd_max_jheight; /* Max height of journaled file's meta tree */
470 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
471
472 struct gfs2_args sd_args; /* Mount arguments */
473 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
474
475 /* Lock Stuff */
476
477 struct lm_lockstruct sd_lockstruct;
478 struct list_head sd_reclaim_list;
479 spinlock_t sd_reclaim_lock;
480 wait_queue_head_t sd_reclaim_wq;
481 atomic_t sd_reclaim_count;
482 struct gfs2_holder sd_live_gh;
483 struct gfs2_glock *sd_rename_gl;
484 struct gfs2_glock *sd_trans_gl;
485
486 /* Inode Stuff */
487
488 struct inode *sd_master_dir;
489 struct inode *sd_jindex;
490 struct inode *sd_inum_inode;
491 struct inode *sd_statfs_inode;
492 struct inode *sd_ir_inode;
493 struct inode *sd_sc_inode;
494 struct inode *sd_qc_inode;
495 struct inode *sd_rindex;
496 struct inode *sd_quota_inode;
497
498 /* Inum stuff */
499
500 struct mutex sd_inum_mutex;
501
502 /* StatFS stuff */
503
504 spinlock_t sd_statfs_spin;
505 struct mutex sd_statfs_mutex;
506 struct gfs2_statfs_change sd_statfs_master;
507 struct gfs2_statfs_change sd_statfs_local;
508 unsigned long sd_statfs_sync_time;
509
510 /* Resource group stuff */
511
512 u64 sd_rindex_vn;
513 spinlock_t sd_rindex_spin;
514 struct mutex sd_rindex_mutex;
515 struct list_head sd_rindex_list;
516 struct list_head sd_rindex_mru_list;
517 struct list_head sd_rindex_recent_list;
518 struct gfs2_rgrpd *sd_rindex_forward;
519 unsigned int sd_rgrps;
520
521 /* Journal index stuff */
522
523 struct list_head sd_jindex_list;
524 spinlock_t sd_jindex_spin;
525 struct mutex sd_jindex_mutex;
526 unsigned int sd_journals;
527 unsigned long sd_jindex_refresh_time;
528
529 struct gfs2_jdesc *sd_jdesc;
530 struct gfs2_holder sd_journal_gh;
531 struct gfs2_holder sd_jinode_gh;
532
533 struct gfs2_holder sd_ir_gh;
534 struct gfs2_holder sd_sc_gh;
535 struct gfs2_holder sd_qc_gh;
536
537 /* Daemon stuff */
538
539 struct task_struct *sd_scand_process;
540 struct task_struct *sd_recoverd_process;
541 struct task_struct *sd_logd_process;
542 struct task_struct *sd_quotad_process;
543 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
544 unsigned int sd_glockd_num;
545
546 /* Quota stuff */
547
548 struct list_head sd_quota_list;
549 atomic_t sd_quota_count;
550 spinlock_t sd_quota_spin;
551 struct mutex sd_quota_mutex;
552
553 unsigned int sd_quota_slots;
554 unsigned int sd_quota_chunks;
555 unsigned char **sd_quota_bitmap;
556
557 u64 sd_quota_sync_gen;
558 unsigned long sd_quota_sync_time;
559
560 /* Log stuff */
561
562 spinlock_t sd_log_lock;
563
564 unsigned int sd_log_blks_reserved;
565 unsigned int sd_log_commited_buf;
566 unsigned int sd_log_commited_revoke;
567
568 unsigned int sd_log_num_gl;
569 unsigned int sd_log_num_buf;
570 unsigned int sd_log_num_revoke;
571 unsigned int sd_log_num_rg;
572 unsigned int sd_log_num_databuf;
573 unsigned int sd_log_num_jdata;
574 unsigned int sd_log_num_hdrs;
575
576 struct list_head sd_log_le_gl;
577 struct list_head sd_log_le_buf;
578 struct list_head sd_log_le_revoke;
579 struct list_head sd_log_le_rg;
580 struct list_head sd_log_le_databuf;
581
582 unsigned int sd_log_blks_free;
583 struct mutex sd_log_reserve_mutex;
584
585 u64 sd_log_sequence;
586 unsigned int sd_log_head;
587 unsigned int sd_log_tail;
588 int sd_log_idle;
589
590 unsigned long sd_log_flush_time;
591 struct rw_semaphore sd_log_flush_lock;
592 struct list_head sd_log_flush_list;
593
594 unsigned int sd_log_flush_head;
595 u64 sd_log_flush_wrapped;
596
597 struct list_head sd_ail1_list;
598 struct list_head sd_ail2_list;
599 u64 sd_ail_sync_gen;
600
601 /* Replay stuff */
602
603 struct list_head sd_revoke_list;
604 unsigned int sd_replay_tail;
605
606 unsigned int sd_found_blocks;
607 unsigned int sd_found_revokes;
608 unsigned int sd_replayed_blocks;
609
610 /* For quiescing the filesystem */
611
612 struct gfs2_holder sd_freeze_gh;
613 struct mutex sd_freeze_lock;
614 unsigned int sd_freeze_count;
615
616 /* Counters */
617
618 atomic_t sd_glock_count;
619 atomic_t sd_glock_held_count;
620 atomic_t sd_inode_count;
621 atomic_t sd_reclaimed;
622
623 char sd_fsname[GFS2_FSNAME_LEN];
624 char sd_table_name[GFS2_FSNAME_LEN];
625 char sd_proto_name[GFS2_FSNAME_LEN];
626
627 /* Debugging crud */
628
629 unsigned long sd_last_warning;
630 struct vfsmount *sd_gfs2mnt;
631};
632
633#endif /* __INCORE_DOT_H__ */
634
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19#include <linux/lm_interface.h>
20#include <linux/security.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "acl.h"
25#include "bmap.h"
26#include "dir.h"
27#include "eattr.h"
28#include "glock.h"
29#include "glops.h"
30#include "inode.h"
31#include "log.h"
32#include "meta_io.h"
33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h"
36#include "quota.h"
37#include "rgrp.h"
38#include "trans.h"
39#include "util.h"
40
41/**
42 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
43 * @ip: The GFS2 inode (with embedded disk inode data)
45 *
46 */
47
48void gfs2_inode_attr_in(struct gfs2_inode *ip)
49{
50 struct inode *inode = &ip->i_inode;
51 struct gfs2_dinode *di = &ip->i_di;
52
53 inode->i_ino = ip->i_num.no_addr;
54
55 switch (di->di_mode & S_IFMT) {
56 case S_IFBLK:
57 case S_IFCHR:
58 inode->i_rdev = MKDEV(di->di_major, di->di_minor);
59 break;
60 default:
61 inode->i_rdev = 0;
62 break;
63	}
64
65 inode->i_mode = di->di_mode;
66 inode->i_nlink = di->di_nlink;
67 inode->i_uid = di->di_uid;
68 inode->i_gid = di->di_gid;
69 i_size_write(inode, di->di_size);
70 inode->i_atime.tv_sec = di->di_atime;
71 inode->i_mtime.tv_sec = di->di_mtime;
72 inode->i_ctime.tv_sec = di->di_ctime;
73 inode->i_atime.tv_nsec = 0;
74 inode->i_mtime.tv_nsec = 0;
75 inode->i_ctime.tv_nsec = 0;
76 inode->i_blocks = di->di_blocks <<
77 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
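	/*
	 * Note: i_blocks counts 512-byte basic blocks while di_blocks
	 * counts filesystem blocks; e.g. with a 4096-byte block size
	 * (sb_bsize_shift == 12, GFS2_BASIC_BLOCK_SHIFT == 9) each fs
	 * block contributes 1 << 3 == 8 to i_blocks.
	 */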
78
79 if (di->di_flags & GFS2_DIF_IMMUTABLE)
80 inode->i_flags |= S_IMMUTABLE;
81 else
82 inode->i_flags &= ~S_IMMUTABLE;
83
84 if (di->di_flags & GFS2_DIF_APPENDONLY)
85 inode->i_flags |= S_APPEND;
86 else
87 inode->i_flags &= ~S_APPEND;
88}
89
90/**
91 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
92 * @ip: The GFS2 inode
93 *
94 * Only copy out the attributes that we want the VFS layer
95 * to be able to modify.
96 */
97
98void gfs2_inode_attr_out(struct gfs2_inode *ip)
99{
100 struct inode *inode = &ip->i_inode;
101 struct gfs2_dinode *di = &ip->i_di;
102 gfs2_assert_withdraw(GFS2_SB(inode),
103 (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
104 di->di_mode = inode->i_mode;
105 di->di_uid = inode->i_uid;
106 di->di_gid = inode->i_gid;
107 di->di_atime = inode->i_atime.tv_sec;
108 di->di_mtime = inode->i_mtime.tv_sec;
109 di->di_ctime = inode->i_ctime.tv_sec;
110}
111
112static int iget_test(struct inode *inode, void *opaque)
113{
114 struct gfs2_inode *ip = GFS2_I(inode);
115 struct gfs2_inum *inum = opaque;
116
117 if (ip && ip->i_num.no_addr == inum->no_addr)
118 return 1;
119
120 return 0;
121}
122
123static int iget_set(struct inode *inode, void *opaque)
124{
125 struct gfs2_inode *ip = GFS2_I(inode);
126 struct gfs2_inum *inum = opaque;
127
128 ip->i_num = *inum;
129 return 0;
130}
131
132struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
133{
134 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
135 iget_test, inum);
136}
137
138static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
139{
140 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
141 iget_test, iget_set, inum);
142}
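/*
 * Note: the inode hash above is keyed on no_formal_ino, while
 * iget_test() matches on no_addr, so both halves of the gfs2_inum
 * take part in the lookup.
 */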
143
144/**
145 * gfs2_inode_lookup - Lookup an inode
146 * @sb: The super block
147 * @inum: The inode number
148 * @type: The type of the inode
149 *
150 * Returns: A VFS inode, or an error
151 */
152
153struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
154{
155 struct inode *inode = gfs2_iget(sb, inum);
156 struct gfs2_inode *ip = GFS2_I(inode);
157 struct gfs2_glock *io_gl;
158 int error;
159
160 if (inode->i_state & I_NEW) {
161 struct gfs2_sbd *sdp = GFS2_SB(inode);
162 umode_t mode = DT2IF(type);
163 inode->i_private = ip;
164 inode->i_mode = mode;
165
166 if (S_ISREG(mode)) {
167 inode->i_op = &gfs2_file_iops;
168 inode->i_fop = &gfs2_file_fops;
169 inode->i_mapping->a_ops = &gfs2_file_aops;
170 } else if (S_ISDIR(mode)) {
171 inode->i_op = &gfs2_dir_iops;
172 inode->i_fop = &gfs2_dir_fops;
173 } else if (S_ISLNK(mode)) {
174 inode->i_op = &gfs2_symlink_iops;
175 } else {
176 inode->i_op = &gfs2_dev_iops;
177 }
178
179 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
180 if (unlikely(error))
181 goto fail;
182 ip->i_gl->gl_object = ip;
183
184 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
185 if (unlikely(error))
186 goto fail_put;
187
188 ip->i_vn = ip->i_gl->gl_vn - 1;
189 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
190 if (unlikely(error))
191 goto fail_iopen;
192
193 gfs2_glock_put(io_gl);
194 unlock_new_inode(inode);
195 }
196
197 return inode;
198fail_iopen:
199 gfs2_glock_put(io_gl);
200fail_put:
201 ip->i_gl->gl_object = NULL;
202 gfs2_glock_put(ip->i_gl);
203fail:
204 iput(inode);
205 return ERR_PTR(error);
206}
207
208/**
209 * gfs2_inode_refresh - Refresh the incore copy of the dinode
210 * @ip: The GFS2 inode
211 *
212 * Returns: errno
213 */
214
215int gfs2_inode_refresh(struct gfs2_inode *ip)
216{
217 struct buffer_head *dibh;
218 int error;
219
220 error = gfs2_meta_inode_buffer(ip, &dibh);
221 if (error)
222 return error;
223
224 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
225 brelse(dibh);
226 return -EIO;
227 }
228
229 gfs2_dinode_in(&ip->i_di, dibh->b_data);
230
231 brelse(dibh);
232
233 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
234 if (gfs2_consist_inode(ip))
235 gfs2_dinode_print(&ip->i_di);
236 return -EIO;
237 }
238 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
239 return -ESTALE;
240
241 ip->i_vn = ip->i_gl->gl_vn;
242
243 return 0;
244}
245
246int gfs2_dinode_dealloc(struct gfs2_inode *ip)
247{
248 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
249 struct gfs2_alloc *al;
250 struct gfs2_rgrpd *rgd;
251 int error;
252
253 if (ip->i_di.di_blocks != 1) {
254 if (gfs2_consist_inode(ip))
255 gfs2_dinode_print(&ip->i_di);
256 return -EIO;
257 }
258
259 al = gfs2_alloc_get(ip);
260
261 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
262 if (error)
263 goto out;
264
265 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
266 if (error)
267 goto out_qs;
268
269 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
270 if (!rgd) {
271 gfs2_consist_inode(ip);
272 error = -EIO;
273 goto out_rindex_relse;
274 }
275
276 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
277 &al->al_rgd_gh);
278 if (error)
279 goto out_rindex_relse;
280
281 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
282 if (error)
283 goto out_rg_gunlock;
284
285 gfs2_trans_add_gl(ip->i_gl);
286
287 gfs2_free_di(rgd, ip);
288
289 gfs2_trans_end(sdp);
290 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
291
292out_rg_gunlock:
293 gfs2_glock_dq_uninit(&al->al_rgd_gh);
294out_rindex_relse:
295 gfs2_glock_dq_uninit(&al->al_ri_gh);
296out_qs:
297 gfs2_quota_unhold(ip);
298out:
299 gfs2_alloc_put(ip);
300 return error;
301}
302
303/**
304 * gfs2_change_nlink - Change nlink count on inode
305 * @ip: The GFS2 inode
306 * @diff: The change in the nlink count required
307 *
308 * Returns: errno
309 */
310
311int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
312{
313 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
314 struct buffer_head *dibh;
315 u32 nlink;
316 int error;
317
318 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
319 nlink = ip->i_di.di_nlink + diff;
320
321 /* If we are reducing the nlink count, but the new value ends up being
322 bigger than the old one, we must have underflowed. */
323 if (diff < 0 && nlink > ip->i_di.di_nlink) {
324 if (gfs2_consist_inode(ip))
325 gfs2_dinode_print(&ip->i_di);
326 return -EIO;
327 }
328
329 error = gfs2_meta_inode_buffer(ip, &dibh);
330 if (error)
331 return error;
332
333 ip->i_di.di_nlink = nlink;
334 ip->i_di.di_ctime = get_seconds();
335 ip->i_inode.i_nlink = nlink;
336
337 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
338 gfs2_dinode_out(&ip->i_di, dibh->b_data);
339 brelse(dibh);
340 mark_inode_dirty(&ip->i_inode);
341
342 if (ip->i_di.di_nlink == 0) {
343 struct gfs2_rgrpd *rgd;
344 struct gfs2_holder ri_gh, rg_gh;
345
346 error = gfs2_rindex_hold(sdp, &ri_gh);
347 if (error)
348 goto out;
349 error = -EIO;
350 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
351 if (!rgd)
352 goto out_norgrp;
353 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
354 if (error)
355 goto out_norgrp;
356
357 clear_nlink(&ip->i_inode);
358 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
359 gfs2_glock_dq_uninit(&rg_gh);
360out_norgrp:
361 gfs2_glock_dq_uninit(&ri_gh);
362 }
363out:
364 return error;
365}
366
367struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
368{
369 struct qstr qstr;
370 gfs2_str2qstr(&qstr, name);
371 return gfs2_lookupi(dip, &qstr, 1, NULL);
372}
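/*
 * Usage sketch (illustrative; "jindex" is an example name): because
 * gfs2_lookupi() maps -ENOENT to a NULL return, callers check both
 * IS_ERR() and NULL:
 *
 *	struct inode *inode = gfs2_lookup_simple(sdp->sd_master_dir,
 *						 "jindex");
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *	if (!inode)
 *		return -ENOENT;
 */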
373
374
375/**
376 * gfs2_lookupi - Look up a filename in a directory and return its inode
377 * @dir: The directory to search in
378 * @name: The name of the inode to look for
379 * @is_root: If 1, ignore the caller's permissions
380 * @nd: The nameidata from the VFS lookup (may be NULL)
381 *
382 * There will always be a vnode (Linux VFS inode) for the @dir inode unless
383 * @is_root is true.
384 *
385 * Returns: The inode, NULL if the name does not exist, or an ERR_PTR
386 */
387
388struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
389 int is_root, struct nameidata *nd)
390{
391 struct super_block *sb = dir->i_sb;
392 struct gfs2_inode *dip = GFS2_I(dir);
393 struct gfs2_holder d_gh;
394 struct gfs2_inum inum;
395 unsigned int type;
396 int error = 0;
397 struct inode *inode = NULL;
398
399 if (!name->len || name->len > GFS2_FNAMESIZE)
400 return ERR_PTR(-ENAMETOOLONG);
401
402 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
403 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
404 dir == sb->s_root->d_inode)) {
405 igrab(dir);
406 return dir;
407 }
408
409 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
410 if (error)
411 return ERR_PTR(error);
412
413 if (!is_root) {
414 error = permission(dir, MAY_EXEC, NULL);
415 if (error)
416 goto out;
417 }
418
419 error = gfs2_dir_search(dir, name, &inum, &type);
420 if (error)
421 goto out;
422
423 inode = gfs2_inode_lookup(sb, &inum, type);
424
425out:
426 gfs2_glock_dq_uninit(&d_gh);
427 if (error == -ENOENT)
428 return NULL;
429 return inode;
430}
431
432static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
433{
434 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
435 struct buffer_head *bh;
436 struct gfs2_inum_range ir;
437 int error;
438
439 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
440 if (error)
441 return error;
442 mutex_lock(&sdp->sd_inum_mutex);
443
444 error = gfs2_meta_inode_buffer(ip, &bh);
445 if (error) {
446 mutex_unlock(&sdp->sd_inum_mutex);
447 gfs2_trans_end(sdp);
448 return error;
449 }
450
451 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
452
453 if (ir.ir_length) {
454 *formal_ino = ir.ir_start++;
455 ir.ir_length--;
456 gfs2_trans_add_bh(ip->i_gl, bh, 1);
457 gfs2_inum_range_out(&ir,
458 bh->b_data + sizeof(struct gfs2_dinode));
459 brelse(bh);
460 mutex_unlock(&sdp->sd_inum_mutex);
461 gfs2_trans_end(sdp);
462 return 0;
463 }
464
465 brelse(bh);
466
467 mutex_unlock(&sdp->sd_inum_mutex);
468 gfs2_trans_end(sdp);
469
470 return 1;
471}
472
473static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
474{
475 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
476 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
477 struct gfs2_holder gh;
478 struct buffer_head *bh;
479 struct gfs2_inum_range ir;
480 int error;
481
482 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
483 if (error)
484 return error;
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
487 if (error)
488 goto out;
489 mutex_lock(&sdp->sd_inum_mutex);
490
491 error = gfs2_meta_inode_buffer(ip, &bh);
492 if (error)
493 goto out_end_trans;
494
495 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
496
497 if (!ir.ir_length) {
498 struct buffer_head *m_bh;
499 u64 x, y;
500
501 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
502 if (error)
503 goto out_brelse;
504
505 x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
506 x = y = be64_to_cpu(x);
507 ir.ir_start = x;
508 ir.ir_length = GFS2_INUM_QUANTUM;
509 x += GFS2_INUM_QUANTUM;
510 if (x < y)
511 gfs2_consist_inode(m_ip);
512 x = cpu_to_be64(x);
513 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
514 *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
515
516 brelse(m_bh);
517 }
518
519 *formal_ino = ir.ir_start++;
520 ir.ir_length--;
521
522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
523 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
524
525out_brelse:
526 brelse(bh);
527out_end_trans:
528 mutex_unlock(&sdp->sd_inum_mutex);
529 gfs2_trans_end(sdp);
530out:
531 gfs2_glock_dq_uninit(&gh);
532 return error;
533}
534
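/*
 * Formal inode numbers are handed out in two levels: pick_formal_ino_1()
 * consumes this node's private range and returns 1 once the range is
 * empty, in which case pick_formal_ino_2() refills it with
 * GFS2_INUM_QUANTUM numbers taken from the cluster-wide counter under
 * an exclusive glock on the inum inode.
 */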
535static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
536{
537 int error;
538
539 error = pick_formal_ino_1(sdp, inum);
540 if (error <= 0)
541 return error;
542
543 error = pick_formal_ino_2(sdp, inum);
544
545 return error;
546}
547
548/**
549 * create_ok - OK to create a new on-disk inode here?
550 * @dip: Directory in which dinode is to be created
551 * @name: Name of new dinode
552 * @mode: The proposed mode of the new dinode
553 *
554 * Returns: errno
555 */
556
557static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
558 unsigned int mode)
559{
560 int error;
561
562 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
563 if (error)
564 return error;
565
566 /* Don't create entries in an unlinked directory */
567 if (!dip->i_di.di_nlink)
568 return -EPERM;
569
570 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
571 switch (error) {
572 case -ENOENT:
573 error = 0;
574 break;
575 case 0:
576 return -EEXIST;
577 default:
578 return error;
579 }
580
581 if (dip->i_di.di_entries == (u32)-1)
582 return -EFBIG;
583 if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
584 return -EMLINK;
585
586 return 0;
587}
588
589static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
590 unsigned int *uid, unsigned int *gid)
591{
592 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
593 (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
594 if (S_ISDIR(*mode))
595 *mode |= S_ISUID;
596 else if (dip->i_di.di_uid != current->fsuid)
597 *mode &= ~07111;
598 *uid = dip->i_di.di_uid;
599 } else
600 *uid = current->fsuid;
601
602 if (dip->i_di.di_mode & S_ISGID) {
603 if (S_ISDIR(*mode))
604 *mode |= S_ISGID;
605 *gid = dip->i_di.di_gid;
606 } else
607 *gid = current->fsgid;
608}
609
610static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
611 u64 *generation)
612{
613 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
614 int error;
615
616 gfs2_alloc_get(dip);
617
618 dip->i_alloc.al_requested = RES_DINODE;
619 error = gfs2_inplace_reserve(dip);
620 if (error)
621 goto out;
622
623 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
624 if (error)
625 goto out_ipreserv;
626
627 inum->no_addr = gfs2_alloc_di(dip, generation);
628
629 gfs2_trans_end(sdp);
630
631out_ipreserv:
632 gfs2_inplace_release(dip);
633out:
634 gfs2_alloc_put(dip);
635 return error;
636}
637
638/**
639 * init_dinode - Fill in a new dinode structure
640 * @dip: the directory this inode is being created in
641 * @gl: The glock covering the new inode
642 * @inum: the inode number
643 * @mode: the file permissions
644 * @uid: The uid of the new dinode's owner
645 * @gid: The gid of the new dinode's group
646 *
647 */
648
649static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
650 const struct gfs2_inum *inum, unsigned int mode,
651 unsigned int uid, unsigned int gid,
652 const u64 *generation)
653{
654 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
655 struct gfs2_dinode *di;
656 struct buffer_head *dibh;
657
658 dibh = gfs2_meta_new(gl, inum->no_addr);
659 gfs2_trans_add_bh(gl, dibh, 1);
660 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
661 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
662 di = (struct gfs2_dinode *)dibh->b_data;
663
664 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
665 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
666 di->di_mode = cpu_to_be32(mode);
667 di->di_uid = cpu_to_be32(uid);
668 di->di_gid = cpu_to_be32(gid);
669 di->di_nlink = cpu_to_be32(0);
670 di->di_size = cpu_to_be64(0);
671 di->di_blocks = cpu_to_be64(1);
672 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
673 di->di_major = di->di_minor = cpu_to_be32(0);
674 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
675 di->di_generation = cpu_to_be64(*generation);
676 di->di_flags = cpu_to_be32(0);
677
678 if (S_ISREG(mode)) {
679 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
680 gfs2_tune_get(sdp, gt_new_files_jdata))
681 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
682 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
683 gfs2_tune_get(sdp, gt_new_files_directio))
684 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
685 } else if (S_ISDIR(mode)) {
686 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
687 GFS2_DIF_INHERIT_DIRECTIO);
688 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
689 GFS2_DIF_INHERIT_JDATA);
690 }
691
692 di->__pad1 = 0;
693 di->di_payload_format = cpu_to_be32(0);
694 di->di_height = cpu_to_be32(0);
695 di->__pad2 = 0;
696 di->__pad3 = 0;
697 di->di_depth = cpu_to_be16(0);
698 di->di_entries = cpu_to_be32(0);
699 memset(&di->__pad4, 0, sizeof(di->__pad4));
700 di->di_eattr = cpu_to_be64(0);
701 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
702
703 brelse(dibh);
704}
705
706static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
707 unsigned int mode, const struct gfs2_inum *inum,
708 const u64 *generation)
709{
710 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
711 unsigned int uid, gid;
712 int error;
713
714 munge_mode_uid_gid(dip, &mode, &uid, &gid);
715 gfs2_alloc_get(dip);
716
717 error = gfs2_quota_lock(dip, uid, gid);
718 if (error)
719 goto out;
720
721 error = gfs2_quota_check(dip, uid, gid);
722 if (error)
723 goto out_quota;
724
725 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
726 if (error)
727 goto out_quota;
728
729 init_dinode(dip, gl, inum, mode, uid, gid, generation);
730 gfs2_quota_change(dip, +1, uid, gid);
731 gfs2_trans_end(sdp);
732
733out_quota:
734 gfs2_quota_unlock(dip);
735out:
736 gfs2_alloc_put(dip);
737 return error;
738}
739
740static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
741 struct gfs2_inode *ip)
742{
743 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
744 struct gfs2_alloc *al;
745 int alloc_required;
746 struct buffer_head *dibh;
747 int error;
748
749 al = gfs2_alloc_get(dip);
750
751 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
752 if (error)
753 goto fail;
754
755 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
756 if (alloc_required < 0)
757 goto fail;
758 if (alloc_required) {
759 error = gfs2_quota_check(dip, dip->i_di.di_uid,
760 dip->i_di.di_gid);
761 if (error)
762 goto fail_quota_locks;
763
764 al->al_requested = sdp->sd_max_dirres;
765
766 error = gfs2_inplace_reserve(dip);
767 if (error)
768 goto fail_quota_locks;
769
770 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
771 al->al_rgd->rd_ri.ri_length +
772 2 * RES_DINODE +
773 RES_STATFS + RES_QUOTA, 0);
774 if (error)
775 goto fail_ipreserv;
776 } else {
777 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
778 if (error)
779 goto fail_quota_locks;
780 }
781
782 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
783 if (error)
784 goto fail_end_trans;
785
786 error = gfs2_meta_inode_buffer(ip, &dibh);
787 if (error)
788 goto fail_end_trans;
789 ip->i_di.di_nlink = 1;
790 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
791 gfs2_dinode_out(&ip->i_di, dibh->b_data);
792 brelse(dibh);
793 return 0;
794
795fail_end_trans:
796 gfs2_trans_end(sdp);
797
798fail_ipreserv:
799 if (dip->i_alloc.al_rgd)
800 gfs2_inplace_release(dip);
801
802fail_quota_locks:
803 gfs2_quota_unlock(dip);
804
805fail:
806 gfs2_alloc_put(dip);
807 return error;
808}
809
810static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
811{
812 int err;
813 size_t len;
814 void *value;
815 char *name;
816 struct gfs2_ea_request er;
817
818 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
819 &name, &value, &len);
820
821 if (err) {
822 if (err == -EOPNOTSUPP)
823 return 0;
824 return err;
825 }
826
827 memset(&er, 0, sizeof(struct gfs2_ea_request));
828
829 er.er_type = GFS2_EATYPE_SECURITY;
830 er.er_name = name;
831 er.er_data = value;
832 er.er_name_len = strlen(name);
833 er.er_data_len = len;
834
835 err = gfs2_ea_set_i(ip, &er);
836
837 kfree(value);
838 kfree(name);
839
840 return err;
841}
842
843/**
844 * gfs2_createi - Create a new inode
845 * @ghs: An array of two holders
846 * @name: The name of the new file
847 * @mode: the permissions on the new inode
848 *
849 * @ghs[0] is an initialized holder for the directory
850 * @ghs[1] is the holder for the inode lock
851 *
852 * If the return value is not NULL, the glocks on both the directory and the new
853 * file are held. A transaction has been started and an inplace reservation
854 * is held, as well.
855 *
856 * Returns: An inode
857 */
858
859struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
860 unsigned int mode)
861{
862 struct inode *inode;
863 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
864 struct inode *dir = &dip->i_inode;
865 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
866 struct gfs2_inum inum;
867 int error;
868 u64 generation;
869
870 if (!name->len || name->len > GFS2_FNAMESIZE)
871 return ERR_PTR(-ENAMETOOLONG);
872
873 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
874 error = gfs2_glock_nq(ghs);
875 if (error)
876 goto fail;
877
878 error = create_ok(dip, name, mode);
879 if (error)
880 goto fail_gunlock;
881
882 error = pick_formal_ino(sdp, &inum.no_formal_ino);
883 if (error)
884 goto fail_gunlock;
885
886 error = alloc_dinode(dip, &inum, &generation);
887 if (error)
888 goto fail_gunlock;
889
890 if (inum.no_addr < dip->i_num.no_addr) {
891 gfs2_glock_dq(ghs);
892
893 error = gfs2_glock_nq_num(sdp, inum.no_addr,
894 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
895 GL_SKIP, ghs + 1);
896 if (error) {
897 return ERR_PTR(error);
898 }
899
900 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
901 error = gfs2_glock_nq(ghs);
902 if (error) {
903 gfs2_glock_dq_uninit(ghs + 1);
904 return ERR_PTR(error);
905 }
906
907 error = create_ok(dip, name, mode);
908 if (error)
909 goto fail_gunlock2;
910 } else {
911 error = gfs2_glock_nq_num(sdp, inum.no_addr,
912 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
913 GL_SKIP, ghs + 1);
914 if (error)
915 goto fail_gunlock;
916 }
917
918 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
919 if (error)
920 goto fail_gunlock2;
921
	inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
	if (!inode) {
		error = -ENOMEM;
		goto fail_gunlock2;
	}
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		goto fail_gunlock2;
	}
925
926 error = gfs2_inode_refresh(GFS2_I(inode));
927 if (error)
928 goto fail_iput;
929
930 error = gfs2_acl_create(dip, GFS2_I(inode));
931 if (error)
932 goto fail_iput;
933
934 error = gfs2_security_init(dip, GFS2_I(inode));
935 if (error)
936 goto fail_iput;
937
938 error = link_dinode(dip, name, GFS2_I(inode));
939 if (error)
940 goto fail_iput;
941
944 return inode;
945
946fail_iput:
947 iput(inode);
948fail_gunlock2:
949 gfs2_glock_dq_uninit(ghs + 1);
950fail_gunlock:
951 gfs2_glock_dq(ghs);
952fail:
953 return ERR_PTR(error);
954}
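/*
 * Usage sketch (not part of this patch): a typical caller passes two
 * holders, then unwinds the transaction, reservation, and quota locks
 * that gfs2_createi() leaves held. The exact unwind calls below are
 * assumptions based on the reservation and transaction started above.
 *
 *	struct gfs2_holder ghs[2];
 *	struct inode *inode;
 *
 *	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 *	inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
 *	if (IS_ERR(inode)) {
 *		gfs2_holder_uninit(ghs);
 *		return PTR_ERR(inode);
 *	}
 *	gfs2_trans_end(sdp);
 *	gfs2_inplace_release(dip);
 *	gfs2_quota_unlock(dip);
 *	gfs2_alloc_put(dip);
 *	gfs2_glock_dq_uninit_m(2, ghs);
 */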
955
956/**
957 * gfs2_rmdiri - Remove a directory
958 * @dip: The parent directory of the directory to be removed
959 * @name: The name of the directory to be removed
960 * @ip: The GFS2 inode of the directory to be removed
961 *
962 * Assumes Glocks on dip and ip are held
963 *
964 * Returns: errno
965 */
966
967int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
968 struct gfs2_inode *ip)
969{
970 struct qstr dotname;
971 int error;
972
973 if (ip->i_di.di_entries != 2) {
974 if (gfs2_consist_inode(ip))
975 gfs2_dinode_print(&ip->i_di);
976 return -EIO;
977 }
978
979 error = gfs2_dir_del(dip, name);
980 if (error)
981 return error;
982
983 error = gfs2_change_nlink(dip, -1);
984 if (error)
985 return error;
986
987 gfs2_str2qstr(&dotname, ".");
988 error = gfs2_dir_del(ip, &dotname);
989 if (error)
990 return error;
991
992 gfs2_str2qstr(&dotname, "..");
993 error = gfs2_dir_del(ip, &dotname);
994 if (error)
995 return error;
996
997 error = gfs2_change_nlink(ip, -2);
998 if (error)
999 return error;
1000
1001	return 0;
1002}
1003
1004/*
1005 * gfs2_unlink_ok - check to see that an inode is still in a directory
1006 * @dip: the directory
1007 * @name: the name of the file
1008 * @ip: the inode
1009 *
1010 * Assumes that the lock on (at least) @dip is held.
1011 *
1012 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1013 */
1014
1015int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1016 struct gfs2_inode *ip)
1017{
1018 struct gfs2_inum inum;
1019 unsigned int type;
1020 int error;
1021
1022 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1023 return -EPERM;
1024
1025 if ((dip->i_di.di_mode & S_ISVTX) &&
1026 dip->i_di.di_uid != current->fsuid &&
1027 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
1028 return -EPERM;
1029
1030 if (IS_APPEND(&dip->i_inode))
1031 return -EPERM;
1032
1033 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
1034 if (error)
1035 return error;
1036
1037 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1038 if (error)
1039 return error;
1040
1041 if (!gfs2_inum_equal(&inum, &ip->i_num))
1042 return -ENOENT;
1043
1044 if (IF2DT(ip->i_di.di_mode) != type) {
1045 gfs2_consist_inode(dip);
1046 return -EIO;
1047 }
1048
1049 return 0;
1050}
1051
1052/*
1053 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1054 * @this: move this
1055 * @to: to here
1056 *
1057 * Follow @to back to the root and make sure we don't encounter @this
1058 * Assumes we already hold the rename lock.
1059 *
1060 * Returns: errno
1061 */
1062
1063int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1064{
1065 struct inode *dir = &to->i_inode;
1066 struct super_block *sb = dir->i_sb;
1067 struct inode *tmp;
1068 struct qstr dotdot;
1069 int error = 0;
1070
1071 gfs2_str2qstr(&dotdot, "..");
1072
1073 igrab(dir);
1074
1075 for (;;) {
1076 if (dir == &this->i_inode) {
1077 error = -EINVAL;
1078 break;
1079 }
1080 if (dir == sb->s_root->d_inode) {
1081 error = 0;
1082 break;
1083 }
1084
1085 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1086 if (IS_ERR(tmp)) {
1087 error = PTR_ERR(tmp);
1088 break;
1089 }
1090
1091 iput(dir);
1092 dir = tmp;
1093 }
1094
1095 iput(dir);
1096
1097 return error;
1098}
1099
1100/**
1101 * gfs2_readlinki - return the contents of a symlink
1102 * @ip: the symlink's inode
1103 * @buf: a pointer to the buffer to be filled
1104 * @len: a pointer to the length of @buf
1105 *
1106 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1107 * to be freed by the caller.
1108 *
1109 * Returns: errno
1110 */
1111
1112int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1113{
1114 struct gfs2_holder i_gh;
1115 struct buffer_head *dibh;
1116 unsigned int x;
1117 int error;
1118
1119 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1120 error = gfs2_glock_nq_atime(&i_gh);
1121 if (error) {
1122 gfs2_holder_uninit(&i_gh);
1123 return error;
1124 }
1125
1126 if (!ip->i_di.di_size) {
1127 gfs2_consist_inode(ip);
1128 error = -EIO;
1129 goto out;
1130 }
1131
1132 error = gfs2_meta_inode_buffer(ip, &dibh);
1133 if (error)
1134 goto out;
1135
1136 x = ip->i_di.di_size + 1;
1137 if (x > *len) {
1138 *buf = kmalloc(x, GFP_KERNEL);
1139 if (!*buf) {
1140 error = -ENOMEM;
1141 goto out_brelse;
1142 }
1143 }
1144
1145 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1146 *len = x;
1147
1148out_brelse:
1149 brelse(dibh);
1150out:
1151 gfs2_glock_dq_uninit(&i_gh);
1152 return error;
1153}
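/*
 * Caller sketch (illustrative, not part of this patch): since
 * gfs2_readlinki() may replace the supplied buffer with a kmalloc()ed
 * one, the caller must compare pointers and free the bigger buffer.
 */
#if 0	/* example only */
static int gfs2_readlinki_example(struct gfs2_inode *ip)
{
	char array[64];
	char *buf = array;
	unsigned int len = sizeof(array);
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (error)
		return error;

	/* ... use the len bytes at buf ... */

	if (buf != array)
		kfree(buf);
	return 0;
}
#endif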
1154
1155/**
1156 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1157 * conditionally update the inode's atime
1158 * @gh: the holder to acquire
1159 *
1160 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1161 * Update if the difference between the current time and the inode's current
1162 * atime is greater than an interval specified at mount.
1163 *
1164 * Returns: errno
1165 */
1166
1167int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1168{
1169 struct gfs2_glock *gl = gh->gh_gl;
1170 struct gfs2_sbd *sdp = gl->gl_sbd;
1171 struct gfs2_inode *ip = gl->gl_object;
1172 s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1173 unsigned int state;
1174 int flags;
1175 int error;
1176
1177 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1178 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1179 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1180 return -EINVAL;
1181
1182 state = gh->gh_state;
1183 flags = gh->gh_flags;
1184
1185 error = gfs2_glock_nq(gh);
1186 if (error)
1187 return error;
1188
1189 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1190 (sdp->sd_vfs->s_flags & MS_RDONLY))
1191 return 0;
1192
1193 curtime = get_seconds();
1194 if (curtime - ip->i_di.di_atime >= quantum) {
1195 gfs2_glock_dq(gh);
1196 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1197 gh);
1198 error = gfs2_glock_nq(gh);
1199 if (error)
1200 return error;
1201
1202 /* Verify that atime hasn't been updated while we were
1203 trying to get exclusive lock. */
1204
1205 curtime = get_seconds();
1206 if (curtime - ip->i_di.di_atime >= quantum) {
1207 struct buffer_head *dibh;
1208 struct gfs2_dinode *di;
1209
1210 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1211 if (error == -EROFS)
1212 return 0;
1213 if (error)
1214 goto fail;
1215
1216 error = gfs2_meta_inode_buffer(ip, &dibh);
1217 if (error)
1218 goto fail_end_trans;
1219
1220 ip->i_di.di_atime = curtime;
1221
1222 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1223 di = (struct gfs2_dinode *)dibh->b_data;
1224 di->di_atime = cpu_to_be64(ip->i_di.di_atime);
1225 brelse(dibh);
1226
1227 gfs2_trans_end(sdp);
1228 }
1229
1230 /* If someone else has asked for the glock,
1231 unlock and let them have it. Then reacquire
1232 in the original state. */
1233 if (gfs2_glock_is_blocking(gl)) {
1234 gfs2_glock_dq(gh);
1235 gfs2_holder_reinit(state, flags, gh);
1236 return gfs2_glock_nq(gh);
1237 }
1238 }
1239
1240 return 0;
1241
1242fail_end_trans:
1243 gfs2_trans_end(sdp);
1244fail:
1245 gfs2_glock_dq(gh);
1246 return error;
1247}
1248
1249/**
1250 * glock_compare_atime - Compare two struct gfs2_holder structures for sort
1251 * @arg_a: the first structure
1252 * @arg_b: the second structure
1253 *
1254 * Returns: 1 if A > B
1255 * -1 if A < B
1256 * 0 if A == B
1257 */
1258
1259static int glock_compare_atime(const void *arg_a, const void *arg_b)
1260{
1261 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1262 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1263 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1264 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1265
1266 if (a->ln_number > b->ln_number)
1267 return 1;
1268 if (a->ln_number < b->ln_number)
1269 return -1;
1270 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1271 return 1;
1272 if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
1273 return 1;
1274
1275 return 0;
1276}
1277
1278/**
1279 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1280 * atime update
1281 * @num_gh: the number of structures
1282 * @ghs: an array of struct gfs2_holder structures
1283 *
1284 * Returns: 0 on success (all glocks acquired),
1285 * errno on failure (no glocks acquired)
1286 */
1287
1288int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1289{
1290 struct gfs2_holder **p;
1291 unsigned int x;
1292 int error = 0;
1293
1294 if (!num_gh)
1295 return 0;
1296
1297 if (num_gh == 1) {
1298 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1299 if (ghs->gh_flags & GL_ATIME)
1300 error = gfs2_glock_nq_atime(ghs);
1301 else
1302 error = gfs2_glock_nq(ghs);
1303 return error;
1304 }
1305
1306 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1307 if (!p)
1308 return -ENOMEM;
1309
1310 for (x = 0; x < num_gh; x++)
1311 p[x] = &ghs[x];
1312
1313	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1314
1315 for (x = 0; x < num_gh; x++) {
1316 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1317
1318 if (p[x]->gh_flags & GL_ATIME)
1319 error = gfs2_glock_nq_atime(p[x]);
1320 else
1321 error = gfs2_glock_nq(p[x]);
1322
1323 if (error) {
1324 while (x--)
1325 gfs2_glock_dq(p[x]);
1326 break;
1327 }
1328 }
1329
1330 kfree(p);
1331 return error;
1332}
1333
1334
1335static int
1336__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1337{
1338 struct buffer_head *dibh;
1339 int error;
1340
1341 error = gfs2_meta_inode_buffer(ip, &dibh);
1342 if (!error) {
1343 error = inode_setattr(&ip->i_inode, attr);
1344 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1345 gfs2_inode_attr_out(ip);
1346
1347 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1348 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1349 brelse(dibh);
1350 }
1351 return error;
1352}
1353
1354/**
1355 * gfs2_setattr_simple - Set attributes on an inode
1356 * @ip: The GFS2 inode
1357 * @attr: The attributes to set
1358 *
1359 * Called with a reference on the vnode.
1360 *
1361 * Returns: errno
1362 */
1363
1364int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1365{
1366 int error;
1367
1368 if (current->journal_info)
1369 return __gfs2_setattr_simple(ip, attr);
1370
1371 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1372 if (error)
1373 return error;
1374
1375 error = __gfs2_setattr_simple(ip, attr);
1376 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1377 return error;
1378}
1379
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25
26/**
27 * gfs2_lm_mount - mount a locking protocol
28 * @sdp: the filesystem
30 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
31 *
32 * Returns: errno
33 */
34
35int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
36{
37 char *proto = sdp->sd_proto_name;
38 char *table = sdp->sd_table_name;
39 int flags = 0;
40 int error;
41
42 if (sdp->sd_args.ar_spectator)
43 flags |= LM_MFLAG_SPECTATOR;
44
45 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
46
47 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
48 gfs2_glock_cb, sdp,
49 GFS2_MIN_LVB_SIZE, flags,
50 &sdp->sd_lockstruct, &sdp->sd_kobj);
51 if (error) {
52 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
53 proto, table, sdp->sd_args.ar_hostdata);
54 goto out;
55 }
56
57 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
58 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
60 GFS2_MIN_LVB_SIZE)) {
61 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
62 goto out;
63 }
64
65 if (sdp->sd_args.ar_spectator)
66 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
67 else
68 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
69 sdp->sd_lockstruct.ls_jid);
70
71 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
72
73 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
74 !sdp->sd_args.ar_ignore_local_fs) {
75 sdp->sd_args.ar_localflocks = 1;
76 sdp->sd_args.ar_localcaching = 1;
77 }
78
79out:
80 return error;
81}
82
83void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
84{
85 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
86 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
87 sdp->sd_lockstruct.ls_lockspace);
88}
89
90void gfs2_lm_unmount(struct gfs2_sbd *sdp)
91{
92 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
93 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
94}
95
96int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
97{
98 va_list args;
99
100 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
101 return 0;
102
103 va_start(args, fmt);
104 vprintk(fmt, args);
105 va_end(args);
106
107 fs_err(sdp, "about to withdraw from the cluster\n");
108 BUG_ON(sdp->sd_args.ar_debug);
109
110
111 fs_err(sdp, "waiting for outstanding I/O\n");
112
113 /* FIXME: suspend the dm device so outstanding bios complete
114 and all further I/O requests fail */
115
116 fs_err(sdp, "telling LM to withdraw\n");
117 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
118 fs_err(sdp, "withdrawn\n");
119 dump_stack();
120
121 return -1;
122}
123
124int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
125 void **lockp)
126{
127 int error = -EIO;
128 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
129 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
130 sdp->sd_lockstruct.ls_lockspace, name, lockp);
131 return error;
132}
133
134void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
135{
136 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
137 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
138}
139
140unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
141 unsigned int cur_state, unsigned int req_state,
142 unsigned int flags)
143{
144 int ret = 0;
145 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
146 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
147 req_state, flags);
148 return ret;
149}
150
151unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
152 unsigned int cur_state)
153{
154 int ret = 0;
155 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
156 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
157 return ret;
158}
159
160void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
161{
162 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
163 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
164}
165
166int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
167{
168 int error = -EIO;
169 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
170 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
171 return error;
172}
173
174void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
175{
176 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
177 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
178}
179
180int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
181 struct file *file, struct file_lock *fl)
182{
183 int error = -EIO;
184 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
185 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
186 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
187 return error;
188}
189
190int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error = -EIO;
194 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
195 error = sdp->sd_lockstruct.ls_ops->lm_plock(
196 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
197 return error;
198}
199
200int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
201 struct file *file, struct file_lock *fl)
202{
203 int error = -EIO;
204 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
205 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
206 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
207 return error;
208}
209
210void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
211 unsigned int message)
212{
213 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
214 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
215 sdp->sd_lockstruct.ls_lockspace, jid, message);
216}
217
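Every gfs2_lm_* wrapper above follows the same shape: test SDF_SHUTDOWN once and call through sd_lockstruct.ls_ops only while the bit is clear, so a withdrawn filesystem stops issuing requests to the lock module without taking any lock of its own. As a sketch of that pattern (lm_frob is an invented hook, not part of lm_interface.h), one more wrapper would look like this:

/* Sketch only: lm_frob is a hypothetical ls_ops hook, shown to
 * illustrate the SDF_SHUTDOWN guard shared by the wrappers above. */
int gfs2_lm_frob(struct gfs2_sbd *sdp, void *lock)
{
	int error = -EIO;	/* default result once withdrawn */

	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
		error = sdp->sd_lockstruct.ls_ops->lm_frob(lock);
	return error;
}

The unlocked test_bit() read means a request can still race with gfs2_lm_withdraw() setting the bit; the guard only ensures no new calls are issued once the withdraw has been observed.
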
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13struct gfs2_sbd;
14
15#define GFS2_MIN_LVB_SIZE 32
16
17int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
18void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
19void gfs2_lm_unmount(struct gfs2_sbd *sdp);
20int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
21 __attribute__ ((format(printf, 2, 3)));
22int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
23 void **lockp);
24void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
25unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
26 unsigned int cur_state, unsigned int req_state,
27 unsigned int flags);
28unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
29 unsigned int cur_state);
30void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
31int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
32void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
33int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
34 struct file *file, struct file_lock *fl);
35int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
36 struct file *file, int cmd, struct file_lock *fl);
37int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
40 unsigned int message);
41
42#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19#include <linux/lm_interface.h>
20
21struct lmh_wrapper {
22 struct list_head lw_list;
23 const struct lm_lockops *lw_ops;
24};
25
26/* List of registered low-level locking protocols. A file system selects one
27 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
28
29static LIST_HEAD(lmh_list);
30static DEFINE_MUTEX(lmh_lock);
31
32/**
33 * gfs2_register_lockproto - Register a low-level locking protocol
34 * @proto: the protocol definition
35 *
36 * Returns: 0 on success, -EXXX on failure
37 */
38
39int gfs2_register_lockproto(const struct lm_lockops *proto)
40{
41 struct lmh_wrapper *lw;
42
43 mutex_lock(&lmh_lock);
44
45 list_for_each_entry(lw, &lmh_list, lw_list) {
46 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
47 mutex_unlock(&lmh_lock);
48 printk(KERN_INFO "GFS2: protocol %s already exists\n",
49 proto->lm_proto_name);
50 return -EEXIST;
51 }
52 }
53
54 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
55 if (!lw) {
56 mutex_unlock(&lmh_lock);
57 return -ENOMEM;
58 }
59
60 lw->lw_ops = proto;
61 list_add(&lw->lw_list, &lmh_list);
62
63 mutex_unlock(&lmh_lock);
64
65 return 0;
66}
67
68/**
69 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
70 * @proto: the protocol definition
71 *
72 */
73
74void gfs2_unregister_lockproto(const struct lm_lockops *proto)
75{
76 struct lmh_wrapper *lw;
77
78 mutex_lock(&lmh_lock);
79
80 list_for_each_entry(lw, &lmh_list, lw_list) {
81 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
82 list_del(&lw->lw_list);
83 mutex_unlock(&lmh_lock);
84 kfree(lw);
85 return;
86 }
87 }
88
89 mutex_unlock(&lmh_lock);
90
91 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
92 proto->lm_proto_name);
93}
94
95/**
96 * gfs2_mount_lockproto - Mount a lock protocol
97 * @proto_name: the name of the protocol
98 * @table_name: the name of the lock space
99 * @host_data: data specific to this host
100 * @cb: the callback to the code using the lock module
101 * @cb_data: private data passed back through @cb (GFS2 passes its superblock)
102 * @min_lvb_size: the minimum LVB size that the caller can deal with
103 * @flags: LM_MFLAG_*
104 * @lockstruct: a structure returned describing the mount
105 *
106 * Returns: 0 on success, -EXXX on failure
107 */
108
109int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
110 lm_callback_t cb, void *cb_data,
111 unsigned int min_lvb_size, int flags,
112 struct lm_lockstruct *lockstruct,
113 struct kobject *fskobj)
114{
115 struct lmh_wrapper *lw = NULL;
116 int try = 0;
117 int error, found;
118
119retry:
120 mutex_lock(&lmh_lock);
121
122 found = 0;
123 list_for_each_entry(lw, &lmh_list, lw_list) {
124 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
125 found = 1;
126 break;
127 }
128 }
129
130 if (!found) {
131 if (!try && capable(CAP_SYS_MODULE)) {
132 try = 1;
133 mutex_unlock(&lmh_lock);
134 request_module(proto_name);
135 goto retry;
136 }
137 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
138 error = -ENOENT;
139 goto out;
140 }
141
142 if (!try_module_get(lw->lw_ops->lm_owner)) {
143 try = 0;
144 mutex_unlock(&lmh_lock);
145 msleep(1000);
146 goto retry;
147 }
148
149 error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
150 min_lvb_size, flags, lockstruct, fskobj);
151 if (error)
152 module_put(lw->lw_ops->lm_owner);
153out:
154 mutex_unlock(&lmh_lock);
155 return error;
156}
157
158void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
159{
160 mutex_lock(&lmh_lock);
161 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
162 if (lockstruct->ls_ops->lm_owner)
163 module_put(lockstruct->ls_ops->lm_owner);
164 mutex_unlock(&lmh_lock);
165}
166
167/**
168 * gfs2_withdraw_lockproto - abnormally unmount a lock module
169 * @lockstruct: the lockstruct passed into mount
170 *
171 */
172
173void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
174{
175 mutex_lock(&lmh_lock);
176 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
177 if (lockstruct->ls_ops->lm_owner)
178 module_put(lockstruct->ls_ops->lm_owner);
179 mutex_unlock(&lmh_lock);
180}
181
182EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
183EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
184
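The consumer side of this registry is a lock module that fills in a struct lm_lockops and registers it from its module init, as the lock_dlm module later in this patch does. A minimal skeleton, assuming an invented protocol name "lock_example" and eliding the real lock operations:

/* Sketch of a module registering with the harness above; the ops
 * table is deliberately incomplete and "lock_example" is made up. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/lm_interface.h>

static const struct lm_lockops example_ops = {
	.lm_proto_name = "lock_example",
	.lm_owner = THIS_MODULE,
	/* .lm_mount, .lm_unmount, .lm_lock, ... */
};

static int __init init_lock_example(void)
{
	return gfs2_register_lockproto(&example_ops);
}

static void __exit exit_lock_example(void)
{
	gfs2_unregister_lockproto(&example_ops);
}

module_init(init_lock_example);
module_exit(exit_lock_example);
MODULE_LICENSE("GPL");

Since gfs2_mount_lockproto() falls back to request_module() with the protocol name, the module should be named after its lm_proto_name for autoloading to work.
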
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static s16 make_mode(s16 lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82s16 gdlm_make_lmstate(s16 dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
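make_mode() and gdlm_make_lmstate() must remain inverses of each other over the four GFS states (DLM_LOCK_IV additionally folds into LM_ST_UNLOCKED). A standalone round-trip check; the enum values below are placeholders rather than the real lm_interface.h/dlm.h constants, and the property does not depend on them:

/* Userspace re-statement of the two mode maps above, plus a
 * round-trip assertion over the four GFS lock states. */
#include <assert.h>

enum { LM_ST_UNLOCKED, LM_ST_EXCLUSIVE, LM_ST_DEFERRED, LM_ST_SHARED };
enum { DLM_LOCK_IV = -1, DLM_LOCK_NL, DLM_LOCK_CW, DLM_LOCK_PR,
       DLM_LOCK_EX };

static short make_mode(short lmstate)
{
	switch (lmstate) {
	case LM_ST_UNLOCKED:	return DLM_LOCK_NL;
	case LM_ST_EXCLUSIVE:	return DLM_LOCK_EX;
	case LM_ST_DEFERRED:	return DLM_LOCK_CW;
	case LM_ST_SHARED:	return DLM_LOCK_PR;
	}
	return -1;
}

static short make_lmstate(short mode)
{
	switch (mode) {
	case DLM_LOCK_IV:
	case DLM_LOCK_NL:	return LM_ST_UNLOCKED;
	case DLM_LOCK_EX:	return LM_ST_EXCLUSIVE;
	case DLM_LOCK_CW:	return LM_ST_DEFERRED;
	case DLM_LOCK_PR:	return LM_ST_SHARED;
	}
	return -1;
}

int main(void)
{
	short s;

	for (s = LM_ST_UNLOCKED; s <= LM_ST_SHARED; s++)
		assert(make_lmstate(make_mode(s)) == s);
	return 0;
}
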
99/* Verify agreement with GFS on the current lock state. NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 s16 cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 s16 cur, s16 req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
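The "%8x%16llx" format always produces 8 + 16 = 24 space-padded hex characters, exactly GDLM_STRNAME_BYTES; sprintf() also appends a NUL, so a standalone check needs one spare byte. A userspace sketch:

/* Userspace check of the 24-byte resource-name encoding above. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[25];	/* 24 characters plus the NUL sprintf appends */
	unsigned int ln_type = 0x2;
	unsigned long long ln_number = 0x123456ULL;

	sprintf(name, "%8x%16llx", ln_type, ln_number);
	assert(strlen(name) == 24);
	printf("\"%s\"\n", name);	/* type and number, space-padded */
	return 0;
}
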
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
211 void **lockp)
212{
213 struct gdlm_lock *lp = NULL;
214 int error;
215
216 error = gdlm_create_lp(lockspace, name, &lp);
217
218 *lockp = lp;
219 return error;
220}
221
222void gdlm_put_lock(void *lock)
223{
224 gdlm_delete_lp(lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests; they are
235 * submitted once recovery is done. Requests used for recovery (NOEXP)
236 * and unlocks can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(void *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(void *lock)
335{
336 struct gdlm_lock *lp = lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire an NL lock because gfs requires the value block to remain
464 intact on the resource while the lvb is "held", even if gfs holds no locks
465 on the resource. */
466
467int gdlm_hold_lvb(void *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(void *lock, char *lvb)
486{
487 struct gdlm_lock *lp = lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_submit_delayed(struct gdlm_ls *ls)
494{
495 struct gdlm_lock *lp, *safe;
496
497 spin_lock(&ls->async_lock);
498 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
499 list_del_init(&lp->delay_list);
500 list_add_tail(&lp->delay_list, &ls->submit);
501 }
502 spin_unlock(&ls->async_lock);
503 wake_up(&ls->thread_wait);
504}
505
506int gdlm_release_all_locks(struct gdlm_ls *ls)
507{
508 struct gdlm_lock *lp, *safe;
509 int count = 0;
510
511 spin_lock(&ls->async_lock);
512 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
513 list_del_init(&lp->all_list);
514
515 if (lp->lvb && lp->lvb != junk_lvb)
516 kfree(lp->lvb);
517 kfree(lp);
518 count++;
519 }
520 spin_unlock(&ls->async_lock);
521
522 return count;
523}
524
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include <linux/lm_interface.h>
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 u32 id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 struct gfs2_sbd *sdp;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 u32 all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 s16 cur;
113 s16 req;
114 s16 prev_req;
115 u32 lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161s16 gdlm_make_lmstate(s16);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(void *, struct lm_lockname *, void **);
169void gdlm_put_lock(void *);
170unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(void *, unsigned int);
172void gdlm_cancel(void *);
173int gdlm_hold_lvb(void *, char **);
174void gdlm_unhold_lvb(void *, char *);
175
176/* plock.c */
177
178int gdlm_plock_init(void);
179void gdlm_plock_exit(void);
180int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
181 struct file_lock *);
182int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
183 struct file_lock *);
184int gdlm_punlock(void *, struct lm_lockname *, struct file *,
185 struct file_lock *);
186#endif
187
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs2_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs2_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs2_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs2_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..cdd1694e889b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14const struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->sdp = sdp;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strchr(buf, ':');
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unknown option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
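The hostdata string parsed above is a colon-separated list of key=value options with keys jid, first, id and nodir; "jid=0:id=262154:first=1" is a representative, made-up example. The same strsep() walk can be exercised in userspace:

/* Userspace sketch of the option walk used by make_args(); the
 * sample hostdata string is illustrative only. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[256] = "jid=0:id=262154:first=1:nodir=0";
	char *options = data, *x, *y;

	while ((x = strsep(&options, ":"))) {
		if (!*x)
			continue;
		y = strchr(x, '=');
		if (y)
			*y++ = '\0';
		printf("option %s = %s\n", x, y ? y : "(none)");
	}
	return 0;
}
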
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, void *cb_data,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, cb_data, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out_free;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167out_kobj:
168 gdlm_kobject_release(ls);
169out_thread:
170 gdlm_release_threads(ls);
171out_free:
172 kfree(ls);
173out:
174 return error;
175}
176
177static void gdlm_unmount(void *lockspace)
178{
179 struct gdlm_ls *ls = lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(void *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(void *lockspace)
211{
212 struct gdlm_ls *ls = lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(void *lockspace)
222{
223 struct gdlm_ls *ls = lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236const struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_recovery_done = gdlm_recovery_done,
253 .lm_owner = THIS_MODULE,
254};
255
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License version 2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(void *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80 op->info.owner = (__u64)(long) fl->fl_owner;
81
82 send_op(op);
83 wait_event(recv_wq, (op->done != 0));
84
85 spin_lock(&ops_lock);
86 if (!list_empty(&op->list)) {
87 printk(KERN_INFO "plock op on list\n");
88 list_del(&op->list);
89 }
90 spin_unlock(&ops_lock);
91
92 rv = op->info.rv;
93
94 if (!rv) {
95 if (posix_lock_file_wait(file, fl) < 0)
96 log_error("gdlm_plock: vfs lock error %x,%llx",
97 name->ln_type,
98 (unsigned long long)name->ln_number);
99 }
100
101 kfree(op);
102 return rv;
103}
104
105int gdlm_punlock(void *lockspace, struct lm_lockname *name,
106 struct file *file, struct file_lock *fl)
107{
108 struct gdlm_ls *ls = lockspace;
109 struct plock_op *op;
110 int rv;
111
112 op = kzalloc(sizeof(*op), GFP_KERNEL);
113 if (!op)
114 return -ENOMEM;
115
116 if (posix_lock_file_wait(file, fl) < 0)
117 log_error("gdlm_punlock: vfs unlock error %x,%llx",
118 name->ln_type, (unsigned long long)name->ln_number);
119
120 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
121 op->info.pid = fl->fl_pid;
122 op->info.fsid = ls->id;
123 op->info.number = name->ln_number;
124 op->info.start = fl->fl_start;
125 op->info.end = fl->fl_end;
126 op->info.owner = (__u64)(long) fl->fl_owner;
127
128 send_op(op);
129 wait_event(recv_wq, (op->done != 0));
130
131 spin_lock(&ops_lock);
132 if (!list_empty(&op->list)) {
133 printk(KERN_INFO "punlock op on list\n");
134 list_del(&op->list);
135 }
136 spin_unlock(&ops_lock);
137
138 rv = op->info.rv;
139
140 kfree(op);
141 return rv;
142}
143
144int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
145 struct file *file, struct file_lock *fl)
146{
147 struct gdlm_ls *ls = lockspace;
148 struct plock_op *op;
149 int rv;
150
151 op = kzalloc(sizeof(*op), GFP_KERNEL);
152 if (!op)
153 return -ENOMEM;
154
155 op->info.optype = GDLM_PLOCK_OP_GET;
156 op->info.pid = fl->fl_pid;
157 op->info.ex = (fl->fl_type == F_WRLCK);
158 op->info.fsid = ls->id;
159 op->info.number = name->ln_number;
160 op->info.start = fl->fl_start;
161 op->info.end = fl->fl_end;
162
163 send_op(op);
164 wait_event(recv_wq, (op->done != 0));
165
166 spin_lock(&ops_lock);
167 if (!list_empty(&op->list)) {
168 printk(KERN_INFO "plock_get op on list\n");
169 list_del(&op->list);
170 }
171 spin_unlock(&ops_lock);
172
173 rv = op->info.rv;
174
175 if (rv == 0)
176 fl->fl_type = F_UNLCK;
177 else if (rv > 0) {
178 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
179 fl->fl_pid = op->info.pid;
180 fl->fl_start = op->info.start;
181 fl->fl_end = op->info.end;
182 }
183
184 kfree(op);
185 return rv;
186}
187
188/* a read copies out one plock request from the send list */
189static ssize_t dev_read(struct file *file, char __user *u, size_t count,
190 loff_t *ppos)
191{
192 struct gdlm_plock_info info;
193 struct plock_op *op = NULL;
194
195 if (count < sizeof(info))
196 return -EINVAL;
197
198 spin_lock(&ops_lock);
199 if (!list_empty(&send_list)) {
200 op = list_entry(send_list.next, struct plock_op, list);
201 list_move(&op->list, &recv_list);
202 memcpy(&info, &op->info, sizeof(info));
203 }
204 spin_unlock(&ops_lock);
205
206 if (!op)
207 return -EAGAIN;
208
209 if (copy_to_user(u, &info, sizeof(info)))
210 return -EFAULT;
211 return sizeof(info);
212}
213
214/* a write copies in one plock result that should match a plock_op
215 on the recv list */
216static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
217 loff_t *ppos)
218{
219 struct gdlm_plock_info info;
220 struct plock_op *op;
221 int found = 0;
222
223 if (count != sizeof(info))
224 return -EINVAL;
225
226 if (copy_from_user(&info, u, sizeof(info)))
227 return -EFAULT;
228
229 if (check_version(&info))
230 return -EINVAL;
231
232 spin_lock(&ops_lock);
233 list_for_each_entry(op, &recv_list, list) {
234 if (op->info.fsid == info.fsid && op->info.number == info.number &&
235 op->info.owner == info.owner) {
236 list_del_init(&op->list);
237 found = 1;
238 op->done = 1;
239 memcpy(&op->info, &info, sizeof(info));
240 break;
241 }
242 }
243 spin_unlock(&ops_lock);
244
245 if (found)
246 wake_up(&recv_wq);
247 else
248 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
249 (unsigned long long)info.number);
250 return count;
251}
252
253static unsigned int dev_poll(struct file *file, poll_table *wait)
254{
255 poll_wait(file, &send_wq, wait);
256
257 spin_lock(&ops_lock);
258 if (!list_empty(&send_list)) {
259 spin_unlock(&ops_lock);
260 return POLLIN | POLLRDNORM;
261 }
262 spin_unlock(&ops_lock);
263 return 0;
264}
265
266static struct file_operations dev_fops = {
267 .read = dev_read,
268 .write = dev_write,
269 .poll = dev_poll,
270 .owner = THIS_MODULE
271};
272
273static struct miscdevice plock_dev_misc = {
274 .minor = MISC_DYNAMIC_MINOR,
275 .name = GDLM_PLOCK_MISC_NAME,
276 .fops = &dev_fops
277};
278
279int gdlm_plock_init(void)
280{
281 int rv;
282
283 spin_lock_init(&ops_lock);
284 INIT_LIST_HEAD(&send_list);
285 INIT_LIST_HEAD(&recv_list);
286 init_waitqueue_head(&send_wq);
287 init_waitqueue_head(&recv_wq);
288
289 rv = misc_register(&plock_dev_misc);
290 if (rv)
291 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
292 rv);
293 return rv;
294}
295
296void gdlm_plock_exit(void)
297{
298 if (misc_deregister(&plock_dev_misc) < 0)
299 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
300}
301
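The misc device above implements a small request/response protocol: userspace reads one struct gdlm_plock_info per read(), decides the posix-lock request, and writes the struct back, which wakes the kernel-side waiter. A sketch of the daemon loop; the device path is assumed from the usual /dev/<misc name> convention, and the header is assumed to be visible to userspace:

/* Sketch of a plock daemon; "grant everything" is a demo policy,
 * and the device path is an assumption. */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <linux/lock_dlm_plock.h>

int main(void)
{
	struct gdlm_plock_info info;
	struct pollfd pfd;
	int fd = open("/dev/lock_dlm_plock", O_RDWR);

	if (fd < 0)
		return 1;
	pfd.fd = fd;
	pfd.events = POLLIN;
	while (poll(&pfd, 1, -1) > 0) {
		if (read(fd, &info, sizeof(info)) != sizeof(info))
			continue;	/* dev_read returns -EAGAIN if empty */
		info.rv = 0;		/* demo: grant without conflict checks */
		if (write(fd, &info, sizeof(info)) != sizeof(info))
			break;
	}
	close(fd);
	return 0;
}

Echoing back the struct that was read preserves the version fields set by send_op(), so the write passes check_version().
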
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else {
46 ret = -EINVAL;
47 }
48 return ret;
49}
50
51static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
52{
53 ssize_t ret;
54 int val = 0;
55
56 if (test_bit(DFL_WITHDRAW, &ls->flags))
57 val = 1;
58 ret = sprintf(buf, "%d\n", val);
59 return ret;
60}
61
62static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
63{
64 ssize_t ret = len;
65 int val;
66
67 val = simple_strtol(buf, NULL, 0);
68
69 if (val == 1)
70 set_bit(DFL_WITHDRAW, &ls->flags);
71 else
72 ret = -EINVAL;
73 wake_up(&ls->wait_control);
74 return ret;
75}
76
77static ssize_t id_show(struct gdlm_ls *ls, char *buf)
78{
79 return sprintf(buf, "%u\n", ls->id);
80}
81
82static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
83{
84 return sprintf(buf, "%d\n", ls->jid);
85}
86
87static ssize_t first_show(struct gdlm_ls *ls, char *buf)
88{
89 return sprintf(buf, "%d\n", ls->first);
90}
91
92static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
93{
94 return sprintf(buf, "%d\n", ls->first_done);
95}
96
97static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
98{
99 return sprintf(buf, "%d\n", ls->recover_jid);
100}
101
102static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
103{
104 ls->recover_jid = simple_strtol(buf, NULL, 0);
105 ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
106 return len;
107}
108
109static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
110{
111 return sprintf(buf, "%d\n", ls->recover_jid_done);
112}
113
114static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
115{
116 return sprintf(buf, "%d\n", ls->recover_jid_status);
117}
118
119struct gdlm_attr {
120 struct attribute attr;
121 ssize_t (*show)(struct gdlm_ls *, char *);
122 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
123};
124
125#define GDLM_ATTR(_name,_mode,_show,_store) \
126static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
127
128GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
129GDLM_ATTR(block, 0644, block_show, block_store);
130GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
131GDLM_ATTR(id, 0444, id_show, NULL);
132GDLM_ATTR(jid, 0444, jid_show, NULL);
133GDLM_ATTR(first, 0444, first_show, NULL);
134GDLM_ATTR(first_done, 0444, first_done_show, NULL);
135GDLM_ATTR(recover, 0644, recover_show, recover_store);
136GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
137GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
138
139static struct attribute *gdlm_attrs[] = {
140 &gdlm_attr_proto_name.attr,
141 &gdlm_attr_block.attr,
142 &gdlm_attr_withdraw.attr,
143 &gdlm_attr_id.attr,
144 &gdlm_attr_jid.attr,
145 &gdlm_attr_first.attr,
146 &gdlm_attr_first_done.attr,
147 &gdlm_attr_recover.attr,
148 &gdlm_attr_recover_done.attr,
149 &gdlm_attr_recover_status.attr,
150 NULL,
151};
152
153static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
154 char *buf)
155{
156 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
157 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
158 return a->show ? a->show(ls, buf) : 0;
159}
160
161static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
162 const char *buf, size_t len)
163{
164 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
165 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
166 return a->store ? a->store(ls, buf, len) : len;
167}
168
169static struct sysfs_ops gdlm_attr_ops = {
170 .show = gdlm_attr_show,
171 .store = gdlm_attr_store,
172};
173
174static struct kobj_type gdlm_ktype = {
175 .default_attrs = gdlm_attrs,
176 .sysfs_ops = &gdlm_attr_ops,
177};
178
179static struct kset gdlm_kset = {
180 .subsys = &kernel_subsys,
181 .kobj = {.name = "lock_dlm",},
182 .ktype = &gdlm_ktype,
183};
184
185int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
186{
187 int error;
188
189 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
190 if (error) {
191 log_error("can't set kobj name %d", error);
192 return error;
193 }
194
195 ls->kobj.kset = &gdlm_kset;
196 ls->kobj.ktype = &gdlm_ktype;
197 ls->kobj.parent = fskobj;
198
199 error = kobject_register(&ls->kobj);
200 if (error)
201 log_error("can't register kobj %d", error);
202
203 return error;
204}
205
206void gdlm_kobject_release(struct gdlm_ls *ls)
207{
208 kobject_unregister(&ls->kobj);
209}
210
211int gdlm_sysfs_init(void)
212{
213 int error;
214
215 error = kset_register(&gdlm_kset);
216 if (error)
217 printk(KERN_ERR "lock_dlm: cannot register kset %d\n", error);
218
219 return error;
220}
221
222void gdlm_sysfs_exit(void)
223{
224 kset_unregister(&gdlm_kset);
225}
226
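These attributes let cluster tooling drive recovery from userspace: write 1 to block before recovery, write a journal id to recover, read recover_done and recover_status, then write 0 to block, which also resubmits delayed locks via gdlm_submit_delayed(). A sketch; the path is an assumption built from the kobject names above, with "myfs" standing in for the per-filesystem directory:

/* Sketch: write one lock_module attribute. The sysfs path is an
 * assumption; "myfs" is a made-up filesystem directory name. */
#include <stdio.h>

static int attr_write(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}

int main(void)
{
	return attr_write("/sys/fs/gfs2/myfs/lock_module/block", 1);
}
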
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", bast_mode);
42 }
43
44 ls->fscb(ls->sdp, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 s16 prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occurred.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete before using it. When gfs recovery is done, this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
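
[Editorial note, not part of the patch] check_drop() above is a rate-limited threshold test: at most once per drop_locks_period seconds it asks whether the lock count has reached drop_locks_count, and it resets the window even when the count is still below the threshold. A minimal user-space sketch of the same pattern, with time(2) standing in for jiffies/HZ and all names illustrative:

	#include <time.h>

	struct throttle {
		time_t last;		/* start of the current window */
		unsigned period;	/* seconds between firings */
		unsigned threshold;	/* fire only at or above this count */
	};

	static int should_fire(struct throttle *t, unsigned count)
	{
		time_t now = time(NULL);

		if (!t->threshold)	/* feature disabled, as in check_drop() */
			return 0;
		if (now - t->last >= (time_t)t->period) {
			t->last = now;	/* window resets even below threshold */
			if (count >= t->threshold)
				return 1;
		}
		return 0;
	}
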
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335	error = IS_ERR(p) ? PTR_ERR(p) : 0;
336 if (error) {
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343	error = IS_ERR(p) ? PTR_ERR(p) : 0;
344 if (error) {
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
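
[Editorial note] gdlm_init_threads() above follows the standard kthread lifecycle; as adjusted in the error paths, the caller should hand back the -errno from PTR_ERR() rather than the bare 1 that IS_ERR() yields. A minimal kernel-context sketch of the idiom (my_thread_fn and "my_worker" are placeholders):

	#include <linux/kthread.h>
	#include <linux/err.h>

	static int start_worker(int (*my_thread_fn)(void *), void *my_data,
				struct task_struct **out)
	{
		struct task_struct *p = kthread_run(my_thread_fn, my_data,
						    "my_worker");

		if (IS_ERR(p))
			return PTR_ERR(p);	/* a real -errno, not just 1 */
		*out = p;	/* later: kthread_stop(*out) makes
				 * kthread_should_stop() return true */
		return 0;
	}
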
359
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17#include <linux/lm_interface.h>
18
19struct nolock_lockspace {
20 unsigned int nl_lvb_size;
21};
22
23static const struct lm_lockops nolock_ops;
24
25static int nolock_mount(char *table_name, char *host_data,
26 lm_callback_t cb, void *cb_data,
27 unsigned int min_lvb_size, int flags,
28 struct lm_lockstruct *lockstruct,
29 struct kobject *fskobj)
30{
31 char *c;
32 unsigned int jid;
33 struct nolock_lockspace *nl;
34
35 c = strstr(host_data, "jid=");
36 if (!c)
37 jid = 0;
38 else {
39 c += 4;
40 sscanf(c, "%u", &jid);
41 }
42
43 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
44 if (!nl)
45 return -ENOMEM;
46
47 nl->nl_lvb_size = min_lvb_size;
48
49 lockstruct->ls_jid = jid;
50 lockstruct->ls_first = 1;
51 lockstruct->ls_lvb_size = min_lvb_size;
52 lockstruct->ls_lockspace = nl;
53 lockstruct->ls_ops = &nolock_ops;
54 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
55
56 return 0;
57}
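
[Editorial note] nolock_mount() pulls an optional journal id out of the host data with strstr()/sscanf(), defaulting to 0 when no "jid=" token is present. The same parsing as a standalone user-space helper (a sketch; parse_jid is a hypothetical name, not part of the patch):

	#include <stdio.h>
	#include <string.h>

	static unsigned int parse_jid(const char *host_data)
	{
		unsigned int jid = 0;
		const char *c = strstr(host_data, "jid=");

		if (c)
			sscanf(c + 4, "%u", &jid);	/* skip "jid=" */
		return jid;
	}

	/* parse_jid("hostdata=foo:jid=2") == 2; parse_jid("") == 0 */
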
58
59static void nolock_others_may_mount(void *lockspace)
60{
61}
62
63static void nolock_unmount(void *lockspace)
64{
65 struct nolock_lockspace *nl = lockspace;
66 kfree(nl);
67}
68
69static void nolock_withdraw(void *lockspace)
70{
71}
72
73/**
74	 * nolock_get_lock - get an lm_lock_t given a description of the lock
75 * @lockspace: the lockspace the lock lives in
76 * @name: the name of the lock
77 * @lockp: return the lm_lock_t here
78 *
79 * Returns: 0 on success, -EXXX on failure
80 */
81
82static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
83 void **lockp)
84{
85 *lockp = lockspace;
86 return 0;
87}
88
89/**
90 * nolock_put_lock - get rid of a lock structure
91 * @lock: the lock to throw away
92 *
93 */
94
95static void nolock_put_lock(void *lock)
96{
97}
98
99/**
100 * nolock_lock - acquire a lock
101 * @lock: the lock to manipulate
102 * @cur_state: the current state
103 * @req_state: the requested state
104 * @flags: modifier flags
105 *
106 * Returns: A bitmap of LM_OUT_*
107 */
108
109static unsigned int nolock_lock(void *lock, unsigned int cur_state,
110 unsigned int req_state, unsigned int flags)
111{
112 return req_state | LM_OUT_CACHEABLE;
113}
114
115/**
116 * nolock_unlock - unlock a lock
117 * @lock: the lock to manipulate
118 * @cur_state: the current state
119 *
120 * Returns: 0
121 */
122
123static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
124{
125 return 0;
126}
127
128static void nolock_cancel(void *lock)
129{
130}
131
132/**
133 * nolock_hold_lvb - hold on to a lock value block
134 * @lock: the lock the LVB is associated with
135 * @lvbp: return the lm_lvb_t here
136 *
137 * Returns: 0 on success, -EXXX on failure
138 */
139
140static int nolock_hold_lvb(void *lock, char **lvbp)
141{
142 struct nolock_lockspace *nl = lock;
143 int error = 0;
144
145 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
146 if (!*lvbp)
147 error = -ENOMEM;
148
149 return error;
150}
151
152/**
153 * nolock_unhold_lvb - release a LVB
154 * @lock: the lock the LVB is associated with
155 * @lvb: the lock value block
156 *
157 */
158
159static void nolock_unhold_lvb(void *lock, char *lvb)
160{
161 kfree(lvb);
162}
163
164static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
165 struct file *file, struct file_lock *fl)
166{
167 struct file_lock tmp;
168 int ret;
169
170 ret = posix_test_lock(file, fl, &tmp);
171 fl->fl_type = F_UNLCK;
172 if (ret)
173 memcpy(fl, &tmp, sizeof(struct file_lock));
174
175 return 0;
176}
177
178static int nolock_plock(void *lockspace, struct lm_lockname *name,
179 struct file *file, int cmd, struct file_lock *fl)
180{
181 int error;
182 error = posix_lock_file_wait(file, fl);
183 return error;
184}
185
186static int nolock_punlock(void *lockspace, struct lm_lockname *name,
187 struct file *file, struct file_lock *fl)
188{
189 int error;
190 error = posix_lock_file_wait(file, fl);
191 return error;
192}
193
194static void nolock_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message)
196{
197}
198
199static const struct lm_lockops nolock_ops = {
200 .lm_proto_name = "lock_nolock",
201 .lm_mount = nolock_mount,
202 .lm_others_may_mount = nolock_others_may_mount,
203 .lm_unmount = nolock_unmount,
204 .lm_withdraw = nolock_withdraw,
205 .lm_get_lock = nolock_get_lock,
206 .lm_put_lock = nolock_put_lock,
207 .lm_lock = nolock_lock,
208 .lm_unlock = nolock_unlock,
209 .lm_cancel = nolock_cancel,
210 .lm_hold_lvb = nolock_hold_lvb,
211 .lm_unhold_lvb = nolock_unhold_lvb,
212 .lm_plock_get = nolock_plock_get,
213 .lm_plock = nolock_plock,
214 .lm_punlock = nolock_punlock,
215 .lm_recovery_done = nolock_recovery_done,
216 .lm_owner = THIS_MODULE,
217};
218
219static int __init init_nolock(void)
220{
221 int error;
222
223 error = gfs2_register_lockproto(&nolock_ops);
224 if (error) {
225 printk(KERN_WARNING
226 "lock_nolock: can't register protocol: %d\n", error);
227 return error;
228 }
229
230 printk(KERN_INFO
231 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
232 return 0;
233}
234
235static void __exit exit_nolock(void)
236{
237 gfs2_unregister_lockproto(&nolock_ops);
238}
239
240module_init(init_nolock);
241module_exit(exit_nolock);
242
243MODULE_DESCRIPTION("GFS Nolock Locking Module");
244MODULE_AUTHOR("Red Hat, Inc.");
245MODULE_LICENSE("GPL");
246
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..0cace3da9dbb
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,688 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32	 * gfs2_struct2blk - compute the number of log descriptor blocks needed
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
51
52 if (nstruct > first) {
53 second = (sdp->sd_sb.sb_bsize -
54 sizeof(struct gfs2_meta_header)) / ssize;
55 blks += DIV_ROUND_UP(nstruct - first, second);
56 }
57
58 return blks;
59}
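
[Editorial note] A worked example, with the struct sizes inferred from the "for 4k blocks, limit = 503" comment in lops.c later in this patch rather than stated here: with 4 KiB blocks, the descriptor block holds first = (4096 - 72)/8 = 503 u64 entries, and each continuation block, headed by a 24-byte gfs2_meta_header, holds second = (4096 - 24)/8 = 509. So for nstruct = 1200 revoke entries of ssize = sizeof(u64), blks = 1 + DIV_ROUND_UP(1200 - 503, 509) = 1 + 2 = 3 log blocks.
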
60
61/**
62 * gfs2_ail1_start_one - Start I/O on a part of the AIL
63 * @sdp: the filesystem
64	 * @ai: the AIL entry
65 *
66 */
67
68static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
69{
70 struct gfs2_bufdata *bd, *s;
71 struct buffer_head *bh;
72 int retry;
73
74 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
75
76 do {
77 retry = 0;
78
79 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
80 bd_ail_st_list) {
81 bh = bd->bd_bh;
82
83 gfs2_assert(sdp, bd->bd_ail == ai);
84
85 if (!buffer_busy(bh)) {
86 if (!buffer_uptodate(bh)) {
87 gfs2_log_unlock(sdp);
88 gfs2_io_error_bh(sdp, bh);
89 gfs2_log_lock(sdp);
90 }
91 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
92 continue;
93 }
94
95 if (!buffer_dirty(bh))
96 continue;
97
98 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
99
100 gfs2_log_unlock(sdp);
101 wait_on_buffer(bh);
102 ll_rw_block(WRITE, 1, &bh);
103 gfs2_log_lock(sdp);
104
105 retry = 1;
106 break;
107 }
108 } while (retry);
109}
110
111/**
112 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
113 * @sdp: the filesystem
114 * @ai: the AIL entry
115 *
116 */
117
118static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
119{
120 struct gfs2_bufdata *bd, *s;
121 struct buffer_head *bh;
122
123 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
124 bd_ail_st_list) {
125 bh = bd->bd_bh;
126
127 gfs2_assert(sdp, bd->bd_ail == ai);
128
129 if (buffer_busy(bh)) {
130 if (flags & DIO_ALL)
131 continue;
132 else
133 break;
134 }
135
136 if (!buffer_uptodate(bh))
137 gfs2_io_error_bh(sdp, bh);
138
139 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
140 }
141
142 return list_empty(&ai->ai_ail1_list);
143}
144
145void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
146{
147 struct list_head *head = &sdp->sd_ail1_list;
148 u64 sync_gen;
149 struct list_head *first;
150 struct gfs2_ail *first_ai, *ai, *tmp;
151 int done = 0;
152
153 gfs2_log_lock(sdp);
154 if (list_empty(head)) {
155 gfs2_log_unlock(sdp);
156 return;
157 }
158 sync_gen = sdp->sd_ail_sync_gen++;
159
160 first = head->prev;
161 first_ai = list_entry(first, struct gfs2_ail, ai_list);
162 first_ai->ai_sync_gen = sync_gen;
163 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
164
165 if (flags & DIO_ALL)
166 first = NULL;
167
168 while(!done) {
169 if (first && (head->prev != first ||
170 gfs2_ail1_empty_one(sdp, first_ai, 0)))
171 break;
172
173 done = 1;
174 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
175 if (ai->ai_sync_gen >= sync_gen)
176 continue;
177 ai->ai_sync_gen = sync_gen;
178 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
179 done = 0;
180 break;
181 }
182 }
183
184 gfs2_log_unlock(sdp);
185}
186
187int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
188{
189 struct gfs2_ail *ai, *s;
190 int ret;
191
192 gfs2_log_lock(sdp);
193
194 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
195 if (gfs2_ail1_empty_one(sdp, ai, flags))
196 list_move(&ai->ai_list, &sdp->sd_ail2_list);
197 else if (!(flags & DIO_ALL))
198 break;
199 }
200
201 ret = list_empty(&sdp->sd_ail1_list);
202
203 gfs2_log_unlock(sdp);
204
205 return ret;
206}
207
208
209/**
210	 * gfs2_ail2_empty_one - Release the buffers of a fully synced AIL entry
211 * @sdp: the filesystem
212 * @ai: the AIL entry
213 *
214 */
215
216static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
217{
218 struct list_head *head = &ai->ai_ail2_list;
219 struct gfs2_bufdata *bd;
220
221 while (!list_empty(head)) {
222 bd = list_entry(head->prev, struct gfs2_bufdata,
223 bd_ail_st_list);
224 gfs2_assert(sdp, bd->bd_ail == ai);
225 bd->bd_ail = NULL;
226 list_del(&bd->bd_ail_st_list);
227 list_del(&bd->bd_ail_gl_list);
228 atomic_dec(&bd->bd_gl->gl_ail_count);
229 brelse(bd->bd_bh);
230 }
231}
232
233static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
234{
235 struct gfs2_ail *ai, *safe;
236 unsigned int old_tail = sdp->sd_log_tail;
237 int wrap = (new_tail < old_tail);
238 int a, b, rm;
239
240 gfs2_log_lock(sdp);
241
242 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
243 a = (old_tail <= ai->ai_first);
244 b = (ai->ai_first < new_tail);
245 rm = (wrap) ? (a || b) : (a && b);
246 if (!rm)
247 continue;
248
249 gfs2_ail2_empty_one(sdp, ai);
250 list_del(&ai->ai_list);
251 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
252 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
253 kfree(ai);
254 }
255
256 gfs2_log_unlock(sdp);
257}
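
[Editorial note] ail2_empty() frees an AIL entry when its first block falls in the half-open span the tail just moved across, [old_tail, new_tail); when that span wraps past block 0 the containment test flips from AND to OR. The test in isolation (a sketch, names illustrative):

	/* Nonzero when blk lies in [old_tail, new_tail) on a circular
	 * journal; the interval wraps when new_tail < old_tail. */
	static int in_tail_range(unsigned int blk, unsigned int old_tail,
				 unsigned int new_tail)
	{
		int a = (old_tail <= blk);
		int b = (blk < new_tail);

		return (new_tail < old_tail) ? (a || b) : (a && b);
	}

	/* e.g. old_tail = 8000, new_tail = 100 on an 8192-block journal:
	 * blocks 8000..8191 and 0..99 are in range; block 4000 is not. */
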
258
259/**
260 * gfs2_log_reserve - Make a log reservation
261 * @sdp: The GFS2 superblock
262 * @blks: The number of blocks to reserve
263 *
264 * Returns: errno
265 */
266
267int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
268{
269 unsigned int try = 0;
270
271 if (gfs2_assert_warn(sdp, blks) ||
272 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
273 return -EINVAL;
274
275 mutex_lock(&sdp->sd_log_reserve_mutex);
276 gfs2_log_lock(sdp);
277 while(sdp->sd_log_blks_free <= blks) {
278 gfs2_log_unlock(sdp);
279 gfs2_ail1_empty(sdp, 0);
280 gfs2_log_flush(sdp, NULL);
281
282 if (try++)
283 gfs2_ail1_start(sdp, 0);
284 gfs2_log_lock(sdp);
285 }
286 sdp->sd_log_blks_free -= blks;
287 gfs2_log_unlock(sdp);
288 mutex_unlock(&sdp->sd_log_reserve_mutex);
289
290 down_read(&sdp->sd_log_flush_lock);
291
292 return 0;
293}
294
295/**
296 * gfs2_log_release - Release a given number of log blocks
297 * @sdp: The GFS2 superblock
298 * @blks: The number of blocks
299 *
300 */
301
302void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
303{
304
305 gfs2_log_lock(sdp);
306 sdp->sd_log_blks_free += blks;
307 gfs2_assert_withdraw(sdp,
308 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
309 gfs2_log_unlock(sdp);
310 up_read(&sdp->sd_log_flush_lock);
311}
312
313static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
314{
315 struct inode *inode = sdp->sd_jdesc->jd_inode;
316 int error;
317 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
318
319 bh_map.b_size = 1 << inode->i_blkbits;
320 error = gfs2_block_map(inode, lbn, 0, &bh_map);
321 if (error || !bh_map.b_blocknr)
322		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u\n", error, (unsigned long long)bh_map.b_blocknr, lbn);
323 gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
324
325 return bh_map.b_blocknr;
326}
327
328/**
329 * log_distance - Compute distance between two journal blocks
330 * @sdp: The GFS2 superblock
331 * @newer: The most recent journal block of the pair
332 * @older: The older journal block of the pair
333 *
334 * Compute the distance (in the journal direction) between two
335 * blocks in the journal
336 *
337 * Returns: the distance in blocks
338 */
339
340static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
341 unsigned int older)
342{
343 int dist;
344
345 dist = newer - older;
346 if (dist < 0)
347 dist += sdp->sd_jdesc->jd_blocks;
348
349 return dist;
350}
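
[Editorial note] Worked example: with jd_blocks = 8192, log_distance(sdp, 10, 8100) first computes 10 - 8100 = -8090; the wrap correction adds 8192, giving a distance of 102 blocks in the journal direction.
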
351
352static unsigned int current_tail(struct gfs2_sbd *sdp)
353{
354 struct gfs2_ail *ai;
355 unsigned int tail;
356
357 gfs2_log_lock(sdp);
358
359 if (list_empty(&sdp->sd_ail1_list)) {
360 tail = sdp->sd_log_head;
361 } else {
362 ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
363 tail = ai->ai_first;
364 }
365
366 gfs2_log_unlock(sdp);
367
368 return tail;
369}
370
371static inline void log_incr_head(struct gfs2_sbd *sdp)
372{
373 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
374 gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
375
376 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
377 sdp->sd_log_flush_head = 0;
378 sdp->sd_log_flush_wrapped = 1;
379 }
380}
381
382/**
383 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
384 * @sdp: The GFS2 superblock
385 *
386 * Returns: the buffer_head
387 */
388
389struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
390{
391 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
392 struct gfs2_log_buf *lb;
393 struct buffer_head *bh;
394
395 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
396 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
397
398 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
399 lock_buffer(bh);
400 memset(bh->b_data, 0, bh->b_size);
401 set_buffer_uptodate(bh);
402 clear_buffer_dirty(bh);
403 unlock_buffer(bh);
404
405 log_incr_head(sdp);
406
407 return bh;
408}
409
410/**
411 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
412 * @sdp: the filesystem
413	 * @real: the buffer_head whose data is to be written to the log
414	 *
415	 * Returns: the fake buffer_head aliasing @real's data
416 */
417
418struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
419 struct buffer_head *real)
420{
421 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
422 struct gfs2_log_buf *lb;
423 struct buffer_head *bh;
424
425 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
426 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
427 lb->lb_real = real;
428
429 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
430 atomic_set(&bh->b_count, 1);
431 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
432 set_bh_page(bh, real->b_page, bh_offset(real));
433 bh->b_blocknr = blkno;
434 bh->b_size = sdp->sd_sb.sb_bsize;
435 bh->b_bdev = sdp->sd_vfs->s_bdev;
436
437 log_incr_head(sdp);
438
439 return bh;
440}
441
442static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
443{
444 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
445
446 ail2_empty(sdp, new_tail);
447
448 gfs2_log_lock(sdp);
449 sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
450 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
451 gfs2_log_unlock(sdp);
452
453 sdp->sd_log_tail = new_tail;
454}
455
456/**
457	 * log_write_header - Write a journal header at the current flush head
458	 * @sdp: The GFS2 superblock
459	 *
460	 * Writes the header synchronously and pulls the log tail when needed.
461 */
462
463static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
464{
465 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
466 struct buffer_head *bh;
467 struct gfs2_log_header *lh;
468 unsigned int tail;
469 u32 hash;
470
471 bh = sb_getblk(sdp->sd_vfs, blkno);
472 lock_buffer(bh);
473 memset(bh->b_data, 0, bh->b_size);
474 set_buffer_uptodate(bh);
475 clear_buffer_dirty(bh);
476 unlock_buffer(bh);
477
478 gfs2_ail1_empty(sdp, 0);
479 tail = current_tail(sdp);
480
481 lh = (struct gfs2_log_header *)bh->b_data;
482 memset(lh, 0, sizeof(struct gfs2_log_header));
483 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
484 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
485 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
486 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
487 lh->lh_flags = cpu_to_be32(flags);
488 lh->lh_tail = cpu_to_be32(tail);
489 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
490 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
491 lh->lh_hash = cpu_to_be32(hash);
492
493 set_buffer_dirty(bh);
494 if (sync_dirty_buffer(bh))
495 gfs2_io_error_bh(sdp, bh);
496 brelse(bh);
497
498 if (sdp->sd_log_tail != tail)
499 log_pull_tail(sdp, tail, pull);
500 else
501 gfs2_assert_withdraw(sdp, !pull);
502
503 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
504 log_incr_head(sdp);
505}
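
[Editorial note] The header checksum above is computed while lh_hash is still zero from the memset, and only then stored into the block, so a verifier re-zeroes the field before re-hashing. A kernel-context sketch of that "checksum with the checksum field zeroed" pattern; that gfs2_disk_hash() is crc32_le with ~0 pre/post-conditioning is an assumption here, not stated in this patch:

	#include <linux/crc32.h>

	static u32 lh_hash_of(struct gfs2_log_header *lh)
	{
		__be32 saved = lh->lh_hash;
		u32 hash;

		lh->lh_hash = 0;	/* field is zero while being hashed */
		hash = crc32_le(0xffffffff, (unsigned char *)lh,
				sizeof(*lh)) ^ 0xffffffff;
		lh->lh_hash = saved;
		return hash;
	}
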
506
507static void log_flush_commit(struct gfs2_sbd *sdp)
508{
509 struct list_head *head = &sdp->sd_log_flush_list;
510 struct gfs2_log_buf *lb;
511 struct buffer_head *bh;
512
513 while (!list_empty(head)) {
514 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
515 list_del(&lb->lb_list);
516 bh = lb->lb_bh;
517
518 wait_on_buffer(bh);
519 if (!buffer_uptodate(bh))
520 gfs2_io_error_bh(sdp, bh);
521 if (lb->lb_real) {
522 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
523 schedule();
524 free_buffer_head(bh);
525 } else
526 brelse(bh);
527 kfree(lb);
528 }
529
530 log_write_header(sdp, 0, 0);
531}
532
533/**
534 * gfs2_log_flush - flush incore transaction(s)
535 * @sdp: the filesystem
536 * @gl: The glock structure to flush. If NULL, flush the whole incore log
537 *
538 */
539
540void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
541{
542 struct gfs2_ail *ai;
543
544 down_write(&sdp->sd_log_flush_lock);
545
546 if (gl) {
547 gfs2_log_lock(sdp);
548 if (list_empty(&gl->gl_le.le_list)) {
549 gfs2_log_unlock(sdp);
550 up_write(&sdp->sd_log_flush_lock);
551 return;
552 }
553 gfs2_log_unlock(sdp);
554 }
555
556 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
557 INIT_LIST_HEAD(&ai->ai_ail1_list);
558 INIT_LIST_HEAD(&ai->ai_ail2_list);
559
560 gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
561 gfs2_assert_withdraw(sdp,
562 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
563
564 sdp->sd_log_flush_head = sdp->sd_log_head;
565 sdp->sd_log_flush_wrapped = 0;
566 ai->ai_first = sdp->sd_log_flush_head;
567
568 lops_before_commit(sdp);
569 if (!list_empty(&sdp->sd_log_flush_list))
570 log_flush_commit(sdp);
571 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
572 log_write_header(sdp, 0, PULL);
573 lops_after_commit(sdp, ai);
574
575 gfs2_log_lock(sdp);
576 sdp->sd_log_head = sdp->sd_log_flush_head;
577 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
578 sdp->sd_log_blks_reserved = 0;
579 sdp->sd_log_commited_buf = 0;
580 sdp->sd_log_num_hdrs = 0;
581 sdp->sd_log_commited_revoke = 0;
582
583 if (!list_empty(&ai->ai_ail1_list)) {
584 list_add(&ai->ai_list, &sdp->sd_ail1_list);
585 ai = NULL;
586 }
587 gfs2_log_unlock(sdp);
588
589 sdp->sd_vfs->s_dirt = 0;
590 up_write(&sdp->sd_log_flush_lock);
591
592 kfree(ai);
593}
594
595static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
596{
597 unsigned int reserved = 0;
598 unsigned int old;
599
600 gfs2_log_lock(sdp);
601
602 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
603 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
604 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
605 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
606
607 if (sdp->sd_log_commited_buf)
608 reserved += sdp->sd_log_commited_buf;
609 if (sdp->sd_log_commited_revoke)
610 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
611 sizeof(u64));
612 if (reserved)
613 reserved++;
614
615 old = sdp->sd_log_blks_free;
616 sdp->sd_log_blks_free += tr->tr_reserved -
617 (reserved - sdp->sd_log_blks_reserved);
618
619 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
620 gfs2_assert_withdraw(sdp,
621 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
622 sdp->sd_log_num_hdrs);
623
624 sdp->sd_log_blks_reserved = reserved;
625
626 gfs2_log_unlock(sdp);
627}
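
[Editorial note] log_refund() hands back what a transaction reserved but did not use. Worked example: suppose tr_reserved = 10, the transaction committed 4 new buffers and 2 revokes, and nothing was previously reserved. Then reserved = 4 + gfs2_struct2blk(sdp, 2, sizeof(u64)) + 1 = 4 + 1 + 1 = 6, and sd_log_blks_free grows by 10 - (6 - 0) = 4 blocks.
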
628
629/**
630 * gfs2_log_commit - Commit a transaction to the log
631 * @sdp: the filesystem
632 * @tr: the transaction
633 *
634 * Returns: errno
635 */
636
637void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
638{
639 log_refund(sdp, tr);
640 lops_incore_commit(sdp, tr);
641
642 sdp->sd_vfs->s_dirt = 1;
643 up_read(&sdp->sd_log_flush_lock);
644
645 gfs2_log_lock(sdp);
646 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
647 gfs2_log_unlock(sdp);
648 gfs2_log_flush(sdp, NULL);
649 } else {
650 gfs2_log_unlock(sdp);
651 }
652}
653
654/**
655 * gfs2_log_shutdown - write a shutdown header into a journal
656 * @sdp: the filesystem
657 *
658 */
659
660void gfs2_log_shutdown(struct gfs2_sbd *sdp)
661{
662 down_write(&sdp->sd_log_flush_lock);
663
664 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
665 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
666 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
667 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
668 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
669 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
670 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
671 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
672 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
673
674 sdp->sd_log_flush_head = sdp->sd_log_head;
675 sdp->sd_log_flush_wrapped = 0;
676
677 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
678
679 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
680 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
681 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
682
683 sdp->sd_log_head = sdp->sd_log_flush_head;
684 sdp->sd_log_tail = sdp->sd_log_head;
685
686 up_write(&sdp->sd_log_flush_lock);
687}
688
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13#include <linux/list.h>
14#include <linux/spinlock.h>
15#include "incore.h"
16
17/**
18 * gfs2_log_lock - acquire the right to mess with the log manager
19 * @sdp: the filesystem
20 *
21 */
22
23static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
24{
25 spin_lock(&sdp->sd_log_lock);
26}
27
28/**
29 * gfs2_log_unlock - release the right to mess with the log manager
30 * @sdp: the filesystem
31 *
32 */
33
34static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
35{
36 spin_unlock(&sdp->sd_log_lock);
37}
38
39static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
40 unsigned int value)
41{
42 if (++value == sdp->sd_jdesc->jd_blocks) {
43 value = 0;
44 }
45 sdp->sd_log_head = sdp->sd_log_tail = value;
46}
47
48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
49 unsigned int ssize);
50
51void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
52int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
53
54int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
56
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real);
60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62
63void gfs2_log_shutdown(struct gfs2_sbd *sdp);
64
65#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..ab6d1115f95d
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += sizeof(__be64) - 1;
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 u64 blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl);
252 return;
253 }
254 if (pass != 1)
255 return;
256
257 gfs2_meta_sync(ip->i_gl);
258
259 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
260 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
261}
262
263static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
264{
265 struct gfs2_trans *tr;
266
267 tr = current->journal_info;
268 tr->tr_touched = 1;
269 tr->tr_num_revoke++;
270
271 gfs2_log_lock(sdp);
272 sdp->sd_log_num_revoke++;
273 list_add(&le->le_list, &sdp->sd_log_le_revoke);
274 gfs2_log_unlock(sdp);
275}
276
277static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
278{
279 struct gfs2_log_descriptor *ld;
280 struct gfs2_meta_header *mh;
281 struct buffer_head *bh;
282 unsigned int offset;
283 struct list_head *head = &sdp->sd_log_le_revoke;
284 struct gfs2_revoke *rv;
285
286 if (!sdp->sd_log_num_revoke)
287 return;
288
289 bh = gfs2_log_get_buf(sdp);
290 ld = (struct gfs2_log_descriptor *)bh->b_data;
291 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
292 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
293 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
294 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
295 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
296 sizeof(u64)));
297 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
298 ld->ld_data2 = cpu_to_be32(0);
299 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
300 offset = sizeof(struct gfs2_log_descriptor);
301
302 while (!list_empty(head)) {
303 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
304 list_del_init(&rv->rv_le.le_list);
305 sdp->sd_log_num_revoke--;
306
307 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
308 set_buffer_dirty(bh);
309 ll_rw_block(WRITE, 1, &bh);
310
311 bh = gfs2_log_get_buf(sdp);
312 mh = (struct gfs2_meta_header *)bh->b_data;
313 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
314 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
315 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
316 offset = sizeof(struct gfs2_meta_header);
317 }
318
319 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
320 kfree(rv);
321
322 offset += sizeof(u64);
323 }
324 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
325
326 set_buffer_dirty(bh);
327 ll_rw_block(WRITE, 1, &bh);
328}
329
330static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
331 struct gfs2_log_header *head, int pass)
332{
333 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
334
335 if (pass != 0)
336 return;
337
338 sdp->sd_found_revokes = 0;
339 sdp->sd_replay_tail = head->lh_tail;
340}
341
342static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
343 struct gfs2_log_descriptor *ld, __be64 *ptr,
344 int pass)
345{
346 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
347 unsigned int blks = be32_to_cpu(ld->ld_length);
348 unsigned int revokes = be32_to_cpu(ld->ld_data1);
349 struct buffer_head *bh;
350 unsigned int offset;
351 u64 blkno;
352 int first = 1;
353 int error;
354
355 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
356 return 0;
357
358 offset = sizeof(struct gfs2_log_descriptor);
359
360 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
361 error = gfs2_replay_read_block(jd, start, &bh);
362 if (error)
363 return error;
364
365 if (!first)
366 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
367
368 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
369 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
370
371 error = gfs2_revoke_add(sdp, blkno, start);
372 if (error < 0)
373 return error;
374 else if (error)
375 sdp->sd_found_revokes++;
376
377 if (!--revokes)
378 break;
379 offset += sizeof(u64);
380 }
381
382 brelse(bh);
383 offset = sizeof(struct gfs2_meta_header);
384 first = 0;
385 }
386
387 return 0;
388}
389
390static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
391{
392 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
393
394 if (error) {
395 gfs2_revoke_clean(sdp);
396 return;
397 }
398 if (pass != 1)
399 return;
400
401 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
402 jd->jd_jid, sdp->sd_found_revokes);
403
404 gfs2_revoke_clean(sdp);
405}
406
407static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
408{
409 struct gfs2_rgrpd *rgd;
410 struct gfs2_trans *tr = current->journal_info;
411
412 tr->tr_touched = 1;
413
414 if (!list_empty(&le->le_list))
415 return;
416
417 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
418 gfs2_rgrp_bh_hold(rgd);
419
420 gfs2_log_lock(sdp);
421 sdp->sd_log_num_rg++;
422 list_add(&le->le_list, &sdp->sd_log_le_rg);
423 gfs2_log_unlock(sdp);
424}
425
426static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
427{
428 struct list_head *head = &sdp->sd_log_le_rg;
429 struct gfs2_rgrpd *rgd;
430
431 while (!list_empty(head)) {
432 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
433 list_del_init(&rgd->rd_le.le_list);
434 sdp->sd_log_num_rg--;
435
436 gfs2_rgrp_repolish_clones(rgd);
437 gfs2_rgrp_bh_put(rgd);
438 }
439 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
440}
441
442/**
443 * databuf_lo_add - Add a databuf to the transaction.
444 *
445 * This is used in two distinct cases:
446 * i) In ordered write mode
447	 * We put the data buffer on a list so that we can ensure that it's
448 * synced to disk at the right time
449 * ii) In journaled data mode
450 * We need to journal the data block in the same way as metadata in
451 * the functions above. The difference is that here we have a tag
452 * which is two __be64's being the block number (as per meta data)
453 * and a flag which says whether the data block needs escaping or
454 * not. This means we need a new log entry for each 251 or so data
455 * blocks, which isn't an enormous overhead but twice as much as
456 * for normal metadata blocks.
457 */
458static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
459{
460 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
461 struct gfs2_trans *tr = current->journal_info;
462 struct address_space *mapping = bd->bd_bh->b_page->mapping;
463 struct gfs2_inode *ip = GFS2_I(mapping->host);
464
465 tr->tr_touched = 1;
466 if (list_empty(&bd->bd_list_tr) &&
467 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
468 tr->tr_num_buf++;
469 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
470 gfs2_pin(sdp, bd->bd_bh);
471 tr->tr_num_buf_new++;
472 }
473 gfs2_trans_add_gl(bd->bd_gl);
474 gfs2_log_lock(sdp);
475 if (list_empty(&le->le_list)) {
476 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
477 sdp->sd_log_num_jdata++;
478 sdp->sd_log_num_databuf++;
479 list_add(&le->le_list, &sdp->sd_log_le_databuf);
480 }
481 gfs2_log_unlock(sdp);
482}
483
484static int gfs2_check_magic(struct buffer_head *bh)
485{
486 struct page *page = bh->b_page;
487 void *kaddr;
488 __be32 *ptr;
489 int rv = 0;
490
491 kaddr = kmap_atomic(page, KM_USER0);
492 ptr = kaddr + bh_offset(bh);
493 if (*ptr == cpu_to_be32(GFS2_MAGIC))
494 rv = 1;
495 kunmap_atomic(kaddr, KM_USER0);
496
497 return rv;
498}
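
[Editorial note] Journaled data blocks that happen to begin with GFS2_MAGIC must be "escaped", or replay could mistake them for metadata: the log copy has its first four bytes zeroed and the block's log tag records a flag, which databuf_lo_scan_elements() below uses to restore the magic. The round trip in isolation (a user-space sketch; GFS2_MAGIC is 0x01161970, stored big-endian):

	#include <string.h>

	#define MAGIC_BYTES "\x01\x16\x19\x70"	/* GFS2_MAGIC, big-endian */

	/* On the way into the log: returns the escape flag for the tag. */
	static int escape_block(unsigned char *blk)
	{
		int escaped = !memcmp(blk, MAGIC_BYTES, 4);

		if (escaped)
			memset(blk, 0, 4);	/* log copy must not look
						 * like metadata */
		return escaped;
	}

	/* On replay: undo the escape using the flag from the tag. */
	static void unescape_block(unsigned char *blk, int escaped)
	{
		if (escaped)
			memcpy(blk, MAGIC_BYTES, 4);
	}
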
499
500/**
501 * databuf_lo_before_commit - Scan the data buffers, writing as we go
502 *
503 * Here we scan through the lists of buffers and make the assumption
504	 * that any buffer that's been pinned is being journaled, and that
505 * any unpinned buffer is an ordered write data buffer and therefore
506 * will be written back rather than journaled.
507 */
508static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
509{
510 LIST_HEAD(started);
511 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
512 struct buffer_head *bh = NULL;
513 unsigned int offset = sizeof(struct gfs2_log_descriptor);
514 struct gfs2_log_descriptor *ld;
515 unsigned int limit;
516 unsigned int total_dbuf = sdp->sd_log_num_databuf;
517 unsigned int total_jdata = sdp->sd_log_num_jdata;
518 unsigned int num, n;
519 __be64 *ptr = NULL;
520
521 offset += 2*sizeof(__be64) - 1;
522 offset &= ~(2*sizeof(__be64) - 1);
523 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
524
525 /*
526 * Start writing ordered buffers, write journaled buffers
527 * into the log along with a header
528 */
529 gfs2_log_lock(sdp);
530 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
531 bd_le.le_list);
532 while(total_dbuf) {
533 num = total_jdata;
534 if (num > limit)
535 num = limit;
536 n = 0;
537 list_for_each_entry_safe_continue(bd1, bdt,
538 &sdp->sd_log_le_databuf,
539 bd_le.le_list) {
540 /* An ordered write buffer */
541 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
542 list_move(&bd1->bd_le.le_list, &started);
543 if (bd1 == bd2) {
544 bd2 = NULL;
545 bd2 = list_prepare_entry(bd2,
546 &sdp->sd_log_le_databuf,
547 bd_le.le_list);
548 }
549 total_dbuf--;
550 if (bd1->bd_bh) {
551 get_bh(bd1->bd_bh);
552 if (buffer_dirty(bd1->bd_bh)) {
553 gfs2_log_unlock(sdp);
554 wait_on_buffer(bd1->bd_bh);
555 ll_rw_block(WRITE, 1,
556 &bd1->bd_bh);
557 gfs2_log_lock(sdp);
558 }
559 brelse(bd1->bd_bh);
560 continue;
561 }
562 continue;
563 } else if (bd1->bd_bh) { /* A journaled buffer */
564 int magic;
565 gfs2_log_unlock(sdp);
566 if (!bh) {
567 bh = gfs2_log_get_buf(sdp);
568 sdp->sd_log_num_hdrs++;
569 ld = (struct gfs2_log_descriptor *)
570 bh->b_data;
571 ptr = (__be64 *)(bh->b_data + offset);
572 ld->ld_header.mh_magic =
573 cpu_to_be32(GFS2_MAGIC);
574 ld->ld_header.mh_type =
575 cpu_to_be32(GFS2_METATYPE_LD);
576 ld->ld_header.mh_format =
577 cpu_to_be32(GFS2_FORMAT_LD);
578 ld->ld_type =
579 cpu_to_be32(GFS2_LOG_DESC_JDATA);
580 ld->ld_length = cpu_to_be32(num + 1);
581 ld->ld_data1 = cpu_to_be32(num);
582 ld->ld_data2 = cpu_to_be32(0);
583 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
584 }
585 magic = gfs2_check_magic(bd1->bd_bh);
586 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
587 *ptr++ = cpu_to_be64((__u64)magic);
588 clear_buffer_escaped(bd1->bd_bh);
589 if (unlikely(magic != 0))
590 set_buffer_escaped(bd1->bd_bh);
591 gfs2_log_lock(sdp);
592 if (n++ > num)
593 break;
594 } else if (!bd1->bd_bh) {
595 total_dbuf--;
596 sdp->sd_log_num_databuf--;
597 list_del_init(&bd1->bd_le.le_list);
598 if (bd1 == bd2) {
599 bd2 = NULL;
600 bd2 = list_prepare_entry(bd2,
601 &sdp->sd_log_le_databuf,
602 bd_le.le_list);
603 }
604 kmem_cache_free(gfs2_bufdata_cachep, bd1);
605 }
606 }
607 gfs2_log_unlock(sdp);
608 if (bh) {
609 set_buffer_dirty(bh);
610 ll_rw_block(WRITE, 1, &bh);
611 bh = NULL;
612 }
613 n = 0;
614 gfs2_log_lock(sdp);
615 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
616 bd_le.le_list) {
617 if (!bd2->bd_bh)
618 continue;
619 /* copy buffer if it needs escaping */
620 gfs2_log_unlock(sdp);
621 if (unlikely(buffer_escaped(bd2->bd_bh))) {
622 void *kaddr;
623 struct page *page = bd2->bd_bh->b_page;
624 bh = gfs2_log_get_buf(sdp);
625 kaddr = kmap_atomic(page, KM_USER0);
626 memcpy(bh->b_data,
627 kaddr + bh_offset(bd2->bd_bh),
628 sdp->sd_sb.sb_bsize);
629 kunmap_atomic(kaddr, KM_USER0);
630 *(__be32 *)bh->b_data = 0;
631 } else {
632 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
633 }
634 set_buffer_dirty(bh);
635 ll_rw_block(WRITE, 1, &bh);
636 gfs2_log_lock(sdp);
637 if (++n >= num)
638 break;
639 }
640 bh = NULL;
641 total_dbuf -= num;
642 total_jdata -= num;
643 }
644 gfs2_log_unlock(sdp);
645
646 /* Wait on all ordered buffers */
647 while (!list_empty(&started)) {
648 gfs2_log_lock(sdp);
649 bd1 = list_entry(started.next, struct gfs2_bufdata,
650 bd_le.le_list);
651 list_del_init(&bd1->bd_le.le_list);
652 sdp->sd_log_num_databuf--;
653 bh = bd1->bd_bh;
654 if (bh) {
655 bh->b_private = NULL;
656 get_bh(bh);
657 gfs2_log_unlock(sdp);
658 wait_on_buffer(bh);
659 brelse(bh);
660 } else
661 gfs2_log_unlock(sdp);
662
663 kmem_cache_free(gfs2_bufdata_cachep, bd1);
664 }
665
666 /* We've removed all the ordered write bufs here, so only jdata left */
667 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
668}
669
670static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
671 struct gfs2_log_descriptor *ld,
672 __be64 *ptr, int pass)
673{
674 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
675 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
676 struct gfs2_glock *gl = ip->i_gl;
677 unsigned int blks = be32_to_cpu(ld->ld_data1);
678 struct buffer_head *bh_log, *bh_ip;
679 u64 blkno;
680 u64 esc;
681 int error = 0;
682
683 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
684 return 0;
685
686 gfs2_replay_incr_blk(sdp, &start);
687 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
688 blkno = be64_to_cpu(*ptr++);
689 esc = be64_to_cpu(*ptr++);
690
691 sdp->sd_found_blocks++;
692
693 if (gfs2_revoke_check(sdp, blkno, start))
694 continue;
695
696 error = gfs2_replay_read_block(jd, start, &bh_log);
697 if (error)
698 return error;
699
700 bh_ip = gfs2_meta_new(gl, blkno);
701 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
702
703 /* Unescape */
704 if (esc) {
705 __be32 *eptr = (__be32 *)bh_ip->b_data;
706 *eptr = cpu_to_be32(GFS2_MAGIC);
707 }
708 mark_buffer_dirty(bh_ip);
709
710 brelse(bh_log);
711 brelse(bh_ip);
712 if (error)
713 break;
714
715 sdp->sd_replayed_blocks++;
716 }
717
718 return error;
719}
720
721/* FIXME: sort out accounting for log blocks etc. */
722
723static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
724{
725 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
726 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
727
728 if (error) {
729 gfs2_meta_sync(ip->i_gl);
730 return;
731 }
732 if (pass != 1)
733 return;
734
735 /* data sync? */
736 gfs2_meta_sync(ip->i_gl);
737
738 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
739 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
740}
741
742static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
743{
744 struct list_head *head = &sdp->sd_log_le_databuf;
745 struct gfs2_bufdata *bd;
746
747 while (!list_empty(head)) {
748 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
749 list_del_init(&bd->bd_le.le_list);
750 sdp->sd_log_num_databuf--;
751 sdp->sd_log_num_jdata--;
752 gfs2_unpin(sdp, bd->bd_bh, ai);
753 }
754 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
755 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
756}
757
758
759const struct gfs2_log_operations gfs2_glock_lops = {
760 .lo_add = glock_lo_add,
761 .lo_after_commit = glock_lo_after_commit,
762 .lo_name = "glock",
763};
764
765const struct gfs2_log_operations gfs2_buf_lops = {
766 .lo_add = buf_lo_add,
767 .lo_incore_commit = buf_lo_incore_commit,
768 .lo_before_commit = buf_lo_before_commit,
769 .lo_after_commit = buf_lo_after_commit,
770 .lo_before_scan = buf_lo_before_scan,
771 .lo_scan_elements = buf_lo_scan_elements,
772 .lo_after_scan = buf_lo_after_scan,
773 .lo_name = "buf",
774};
775
776const struct gfs2_log_operations gfs2_revoke_lops = {
777 .lo_add = revoke_lo_add,
778 .lo_before_commit = revoke_lo_before_commit,
779 .lo_before_scan = revoke_lo_before_scan,
780 .lo_scan_elements = revoke_lo_scan_elements,
781 .lo_after_scan = revoke_lo_after_scan,
782 .lo_name = "revoke",
783};
784
785const struct gfs2_log_operations gfs2_rg_lops = {
786 .lo_add = rg_lo_add,
787 .lo_after_commit = rg_lo_after_commit,
788 .lo_name = "rg",
789};
790
791const struct gfs2_log_operations gfs2_databuf_lops = {
792 .lo_add = databuf_lo_add,
793 .lo_incore_commit = buf_lo_incore_commit,
794 .lo_before_commit = databuf_lo_before_commit,
795 .lo_after_commit = databuf_lo_after_commit,
796 .lo_scan_elements = databuf_lo_scan_elements,
797 .lo_after_scan = databuf_lo_after_scan,
798 .lo_name = "databuf",
799};
800
801const struct gfs2_log_operations *gfs2_log_ops[] = {
802 &gfs2_glock_lops,
803 &gfs2_buf_lops,
804 &gfs2_revoke_lops,
805 &gfs2_rg_lops,
806 &gfs2_databuf_lops,
807 NULL,
808};
809
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13#include <linux/list.h>
14#include "incore.h"
15
16extern const struct gfs2_log_operations gfs2_glock_lops;
17extern const struct gfs2_log_operations gfs2_buf_lops;
18extern const struct gfs2_log_operations gfs2_revoke_lops;
19extern const struct gfs2_log_operations gfs2_rg_lops;
20extern const struct gfs2_log_operations gfs2_databuf_lops;
21
22extern const struct gfs2_log_operations *gfs2_log_ops[];
23
24static inline void lops_init_le(struct gfs2_log_element *le,
25 const struct gfs2_log_operations *lops)
26{
27 INIT_LIST_HEAD(&le->le_list);
28 le->le_ops = lops;
29}
30
31static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
32{
33 if (le->le_ops->lo_add)
34 le->le_ops->lo_add(sdp, le);
35}
36
37static inline void lops_incore_commit(struct gfs2_sbd *sdp,
38 struct gfs2_trans *tr)
39{
40 int x;
41 for (x = 0; gfs2_log_ops[x]; x++)
42 if (gfs2_log_ops[x]->lo_incore_commit)
43 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
44}
45
46static inline void lops_before_commit(struct gfs2_sbd *sdp)
47{
48 int x;
49 for (x = 0; gfs2_log_ops[x]; x++)
50 if (gfs2_log_ops[x]->lo_before_commit)
51 gfs2_log_ops[x]->lo_before_commit(sdp);
52}
53
54static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
55{
56 int x;
57 for (x = 0; gfs2_log_ops[x]; x++)
58 if (gfs2_log_ops[x]->lo_after_commit)
59 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
60}
61
62static inline void lops_before_scan(struct gfs2_jdesc *jd,
63 struct gfs2_log_header *head,
64 unsigned int pass)
65{
66 int x;
67 for (x = 0; gfs2_log_ops[x]; x++)
68 if (gfs2_log_ops[x]->lo_before_scan)
69 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
70}
71
72static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
73 struct gfs2_log_descriptor *ld,
74 __be64 *ptr,
75 unsigned int pass)
76{
77 int x, error;
78 for (x = 0; gfs2_log_ops[x]; x++)
79 if (gfs2_log_ops[x]->lo_scan_elements) {
80 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
81 ld, ptr, pass);
82 if (error)
83 return error;
84 }
85
86 return 0;
87}
88
89static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
90 unsigned int pass)
91{
92 int x;
93 for (x = 0; gfs2_log_ops[x]; x++)
94		if (gfs2_log_ops[x]->lo_after_scan)
95 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
96}
97
98#endif /* __LOPS_DOT_H__ */
99
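[Editorial note] All of the lops_* helpers above share one shape: walk the NULL-terminated gfs2_log_ops[] table and invoke only the hooks an element type actually implements; that is why the lo_before_scan/lo_after_scan mismatch fixed in lops_after_scan() mattered. The pattern in miniature (a user-space sketch, names illustrative):

	#include <stdio.h>

	struct ops {
		const char *name;
		void (*hook)(void);	/* optional: may be NULL */
	};

	static void buf_hook(void) { puts("buf"); }

	static const struct ops glock_ops = { "glock", NULL };
	static const struct ops buf_ops = { "buf", buf_hook };

	static const struct ops *table[] = {
		&glock_ops,	/* no hook: skipped */
		&buf_ops,
		NULL,		/* terminator, like gfs2_log_ops[] */
	};

	static void run_hooks(void)
	{
		int x;

		for (x = 0; table[x]; x++)
			if (table[x]->hook)	/* test the hook you call */
				table[x]->hook();
	}
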
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/atomic.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "ops_fstype.h"
24#include "sys.h"
25#include "util.h"
26#include "glock.h"
27
28static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
29{
30 struct gfs2_inode *ip = foo;
31 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
32 SLAB_CTOR_CONSTRUCTOR) {
33 inode_init_once(&ip->i_inode);
34 spin_lock_init(&ip->i_spin);
35 init_rwsem(&ip->i_rw_mutex);
36 memset(ip->i_cache, 0, sizeof(ip->i_cache));
37 }
38}
39
40static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
41{
42 struct gfs2_glock *gl = foo;
43 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
44 SLAB_CTOR_CONSTRUCTOR) {
45 INIT_HLIST_NODE(&gl->gl_list);
46 spin_lock_init(&gl->gl_spin);
47 INIT_LIST_HEAD(&gl->gl_holders);
48 INIT_LIST_HEAD(&gl->gl_waiters1);
49 INIT_LIST_HEAD(&gl->gl_waiters2);
50 INIT_LIST_HEAD(&gl->gl_waiters3);
51 gl->gl_lvb = NULL;
52 atomic_set(&gl->gl_lvb_count, 0);
53 INIT_LIST_HEAD(&gl->gl_reclaim);
54 INIT_LIST_HEAD(&gl->gl_ail_list);
55 atomic_set(&gl->gl_ail_count, 0);
56 }
57}
58
59/**
60 * init_gfs2_fs - Register GFS2 as a filesystem
61 *
62 * Returns: 0 on success, error code on failure
63 */
64
65static int __init init_gfs2_fs(void)
66{
67 int error;
68
69 error = gfs2_sys_init();
70 if (error)
71 return error;
72
73 error = gfs2_glock_init();
74 if (error)
75 goto fail;
76
77 error = -ENOMEM;
78 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
79 sizeof(struct gfs2_glock),
80 0, 0,
81 gfs2_init_glock_once, NULL);
82 if (!gfs2_glock_cachep)
83 goto fail;
84
85 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
86 sizeof(struct gfs2_inode),
87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_PANIC|SLAB_MEM_SPREAD),
89 gfs2_init_inode_once, NULL);
90 if (!gfs2_inode_cachep)
91 goto fail;
92
93 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
94 sizeof(struct gfs2_bufdata),
95 0, 0, NULL, NULL);
96 if (!gfs2_bufdata_cachep)
97 goto fail;
98
99 error = register_filesystem(&gfs2_fs_type);
100 if (error)
101 goto fail;
102
103 error = register_filesystem(&gfs2meta_fs_type);
104 if (error)
105 goto fail_unregister;
106
107 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
108
109 return 0;
110
111fail_unregister:
112 unregister_filesystem(&gfs2_fs_type);
113fail:
114 if (gfs2_bufdata_cachep)
115 kmem_cache_destroy(gfs2_bufdata_cachep);
116
117 if (gfs2_inode_cachep)
118 kmem_cache_destroy(gfs2_inode_cachep);
119
120 if (gfs2_glock_cachep)
121 kmem_cache_destroy(gfs2_glock_cachep);
122
123 gfs2_sys_uninit();
124 return error;
125}
126
127/**
128 * exit_gfs2_fs - Unregister the file system
129 *
130 */
131
132static void __exit exit_gfs2_fs(void)
133{
134 unregister_filesystem(&gfs2_fs_type);
135 unregister_filesystem(&gfs2meta_fs_type);
136
137 kmem_cache_destroy(gfs2_bufdata_cachep);
138 kmem_cache_destroy(gfs2_inode_cachep);
139 kmem_cache_destroy(gfs2_glock_cachep);
140
141 gfs2_sys_uninit();
142}
143
144MODULE_DESCRIPTION("Global File System");
145MODULE_AUTHOR("Red Hat, Inc.");
146MODULE_LICENSE("GPL");
147
148module_init(init_gfs2_fs);
149module_exit(exit_gfs2_fs);
150
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/bio.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "log.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "rgrp.h"
33#include "trans.h"
34#include "util.h"
35#include "ops_address.h"
36
37static int aspace_get_block(struct inode *inode, sector_t lblock,
38 struct buffer_head *bh_result, int create)
39{
40 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
41 return -EOPNOTSUPP;
42}
43
44static int gfs2_aspace_writepage(struct page *page,
45 struct writeback_control *wbc)
46{
47 return block_write_full_page(page, aspace_get_block, wbc);
48}
49
50static const struct address_space_operations aspace_aops = {
51 .writepage = gfs2_aspace_writepage,
52 .releasepage = gfs2_releasepage,
53};
54
55/**
56 * gfs2_aspace_get - Create and initialize a struct inode
57 * @sdp: the filesystem the aspace is in
58 *
59 * Right now a struct inode is just a struct inode. Maybe Linux
60 * will supply a more lightweight address space construct (that works)
61 * in the future.
62 *
63 * Make sure pages/buffers in this aspace aren't in high memory.
64 *
65 * Returns: the aspace
66 */
67
68struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
69{
70 struct inode *aspace;
71
72 aspace = new_inode(sdp->sd_vfs);
73 if (aspace) {
74 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
75 aspace->i_mapping->a_ops = &aspace_aops;
76 aspace->i_size = ~0ULL;
77 aspace->i_private = NULL;
78 insert_inode_hash(aspace);
79 }
80 return aspace;
81}
82
83void gfs2_aspace_put(struct inode *aspace)
84{
85 remove_inode_hash(aspace);
86 iput(aspace);
87}
88
89/**
90 * gfs2_meta_inval - Invalidate all buffers associated with a glock
91 * @gl: the glock
92 *
93 */
94
95void gfs2_meta_inval(struct gfs2_glock *gl)
96{
97 struct gfs2_sbd *sdp = gl->gl_sbd;
98 struct inode *aspace = gl->gl_aspace;
99 struct address_space *mapping = gl->gl_aspace->i_mapping;
100
101 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
102
103 atomic_inc(&aspace->i_writecount);
104 truncate_inode_pages(mapping, 0);
105 atomic_dec(&aspace->i_writecount);
106
107 gfs2_assert_withdraw(sdp, !mapping->nrpages);
108}
109
110/**
111 * gfs2_meta_sync - Sync all buffers associated with a glock
112 * @gl: The glock
113 *
114 */
115
116void gfs2_meta_sync(struct gfs2_glock *gl)
117{
118 struct address_space *mapping = gl->gl_aspace->i_mapping;
119 int error;
120
121 filemap_fdatawrite(mapping);
122 error = filemap_fdatawait(mapping);
123
124 if (error)
125 gfs2_io_error(gl->gl_sbd);
126}
127
128/**
129 * getbuf - Get a buffer with a given address space
130 * @sdp: the filesystem
131 * @aspace: the address space
132 * @blkno: the block number (filesystem scope)
133 * @create: 1 if the buffer should be created
134 *
135 * Returns: the buffer
136 */
137
138static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
139 u64 blkno, int create)
140{
141 struct page *page;
142 struct buffer_head *bh;
143 unsigned int shift;
144 unsigned long index;
145 unsigned int bufnum;
146
147 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
148 index = blkno >> shift; /* convert block to page */
149 bufnum = blkno - (index << shift); /* block buf index within page */
150
151 if (create) {
152 for (;;) {
153 page = grab_cache_page(aspace->i_mapping, index);
154 if (page)
155 break;
156 yield();
157 }
158 } else {
159 page = find_lock_page(aspace->i_mapping, index);
160 if (!page)
161 return NULL;
162 }
163
164 if (!page_has_buffers(page))
165 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
166
167 /* Locate header for our buffer within our page */
168 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
169 /* Do nothing */;
170 get_bh(bh);
171
172 if (!buffer_mapped(bh))
173 map_bh(bh, sdp->sd_vfs, blkno);
174
175 unlock_page(page);
176 mark_page_accessed(page);
177 page_cache_release(page);
178
179 return bh;
180}
181
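getbuf() locates the page backing a filesystem block with pure shift arithmetic, which works because the block size always divides the page size. A standalone sketch of the index math, assuming 4096-byte pages and 1024-byte blocks (illustrative values):

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT 12U	/* assumed 4096-byte pages */

int main(void)
{
	unsigned int bsize_shift = 10;	/* assumed 1024-byte blocks */
	unsigned int shift = DEMO_PAGE_SHIFT - bsize_shift;
	uint64_t blkno = 4103;

	uint64_t index = blkno >> shift;	/* page that caches this block */
	unsigned int bufnum = (unsigned int)(blkno - (index << shift));

	printf("block %llu -> page %llu, buffer %u of %u\n",
	       (unsigned long long)blkno, (unsigned long long)index,
	       bufnum, 1U << shift);
	/* prints: block 4103 -> page 1025, buffer 3 of 4 */
	return 0;
}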
182static void meta_prep_new(struct buffer_head *bh)
183{
184 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
185
186 lock_buffer(bh);
187 clear_buffer_dirty(bh);
188 set_buffer_uptodate(bh);
189 unlock_buffer(bh);
190
191 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
192}
193
194/**
195 * gfs2_meta_new - Get a block
196 * @gl: The glock associated with this block
197 * @blkno: The block number
198 *
199 * Returns: The buffer
200 */
201
202struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
203{
204 struct buffer_head *bh;
205 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
206 meta_prep_new(bh);
207 return bh;
208}
209
210/**
211 * gfs2_meta_read - Read a block from disk
212 * @gl: The glock covering the block
213 * @blkno: The block number
214 * @flags: flags
215 * @bhp: the place where the buffer is returned (NULL on failure)
216 *
217 * Returns: errno
218 */
219
220int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
221 struct buffer_head **bhp)
222{
223 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
224 if (!buffer_uptodate(*bhp))
225 ll_rw_block(READ_META, 1, bhp);
226 if (flags & DIO_WAIT) {
227 int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
228 if (error) {
229 brelse(*bhp);
230 return error;
231 }
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_meta_wait - Wait for a block read to complete
239 * @sdp: the filesystem
240 * @bh: The block to wait for
241 *
242 * Returns: errno
243 */
244
245int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
246{
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 return -EIO;
249
250 wait_on_buffer(bh);
251
252 if (!buffer_uptodate(bh)) {
253 struct gfs2_trans *tr = current->journal_info;
254 if (tr && tr->tr_touched)
255 gfs2_io_error_bh(sdp, bh);
256 return -EIO;
257 }
258 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
259 return -EIO;
260
261 return 0;
262}
263
264/**
265 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
266 * @gl: the glock the buffer belongs to
267 * @bh: The buffer to be attached to
268 * @meta: Flag to indicate whether it is metadata or not
269 */
270
271void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
272 int meta)
273{
274 struct gfs2_bufdata *bd;
275
276 if (meta)
277 lock_page(bh->b_page);
278
279 if (bh->b_private) {
280 if (meta)
281 unlock_page(bh->b_page);
282 return;
283 }
284
285 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
286 memset(bd, 0, sizeof(struct gfs2_bufdata));
287 bd->bd_bh = bh;
288 bd->bd_gl = gl;
289
290 INIT_LIST_HEAD(&bd->bd_list_tr);
291 if (meta)
292 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
293 else
294 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
295 bh->b_private = bd;
296
297 if (meta)
298 unlock_page(bh->b_page);
299}
300
301/**
302 * gfs2_pin - Pin a buffer in memory
303 * @sdp: the filesystem the buffer belongs to
304 * @bh: The buffer to be pinned
305 *
306 */
307
308void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
309{
310 struct gfs2_bufdata *bd = bh->b_private;
311
312 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
313
314 if (test_set_buffer_pinned(bh))
315 gfs2_assert_withdraw(sdp, 0);
316
317 wait_on_buffer(bh);
318
319 /* If this buffer is in the AIL and it has already been written
320 to in-place disk block, remove it from the AIL. */
321
322 gfs2_log_lock(sdp);
323 if (bd->bd_ail && !buffer_in_io(bh))
324 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
325 gfs2_log_unlock(sdp);
326
327 clear_buffer_dirty(bh);
328 wait_on_buffer(bh);
329
330 if (!buffer_uptodate(bh))
331 gfs2_io_error_bh(sdp, bh);
332
333 get_bh(bh);
334}
335
336/**
337 * gfs2_unpin - Unpin a buffer
338 * @sdp: the filesystem the buffer belongs to
339 * @bh: The buffer to unpin
340 * @ai: the AIL entry to attach the buffer to
341 *
342 */
343
344void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
345 struct gfs2_ail *ai)
346{
347 struct gfs2_bufdata *bd = bh->b_private;
348
349 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
350
351 if (!buffer_pinned(bh))
352 gfs2_assert_withdraw(sdp, 0);
353
354 mark_buffer_dirty(bh);
355 clear_buffer_pinned(bh);
356
357 gfs2_log_lock(sdp);
358 if (bd->bd_ail) {
359 list_del(&bd->bd_ail_st_list);
360 brelse(bh);
361 } else {
362 struct gfs2_glock *gl = bd->bd_gl;
363 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
364 atomic_inc(&gl->gl_ail_count);
365 }
366 bd->bd_ail = ai;
367 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
368 gfs2_log_unlock(sdp);
369}
370
371/**
372 * gfs2_meta_wipe - make sure an inode's buffers are no longer dirty or pinned
373 * @ip: the inode who owns the buffers
374 * @bstart: the first buffer in the run
375 * @blen: the number of buffers in the run
376 *
377 */
378
379void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct inode *aspace = ip->i_gl->gl_aspace;
383 struct buffer_head *bh;
384
385 while (blen) {
386 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
387 if (bh) {
388 struct gfs2_bufdata *bd = bh->b_private;
389
390 if (test_clear_buffer_pinned(bh)) {
391 struct gfs2_trans *tr = current->journal_info;
392 gfs2_log_lock(sdp);
393 list_del_init(&bd->bd_le.le_list);
394 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
395 sdp->sd_log_num_buf--;
396 gfs2_log_unlock(sdp);
397 tr->tr_num_buf_rm++;
398 brelse(bh);
399 }
400 if (bd) {
401 gfs2_log_lock(sdp);
402 if (bd->bd_ail) {
403 u64 blkno = bh->b_blocknr;
404 bd->bd_ail = NULL;
405 list_del(&bd->bd_ail_st_list);
406 list_del(&bd->bd_ail_gl_list);
407 atomic_dec(&bd->bd_gl->gl_ail_count);
408 brelse(bh);
409 gfs2_log_unlock(sdp);
410 gfs2_trans_add_revoke(sdp, blkno);
411 } else
412 gfs2_log_unlock(sdp);
413 }
414
415 lock_buffer(bh);
416 clear_buffer_dirty(bh);
417 clear_buffer_uptodate(bh);
418 unlock_buffer(bh);
419
420 brelse(bh);
421 }
422
423 bstart++;
424 blen--;
425 }
426}
427
428/**
429 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
430 * @ip: The GFS2 inode
431 *
432 * This releases buffers that are in the most-recently-used array of
433 * blocks used for indirect block addressing for this inode.
434 */
435
436void gfs2_meta_cache_flush(struct gfs2_inode *ip)
437{
438 struct buffer_head **bh_slot;
439 unsigned int x;
440
441 spin_lock(&ip->i_spin);
442
443 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
444 bh_slot = &ip->i_cache[x];
445 if (!*bh_slot)
446 break;
447 brelse(*bh_slot);
448 *bh_slot = NULL;
449 }
450
451 spin_unlock(&ip->i_spin);
452}
453
454/**
455 * gfs2_meta_indirect_buffer - Get a metadata buffer
456 * @ip: The GFS2 inode
457 * @height: The level of this buf in the metadata (indir addr) tree (if any)
458 * @num: The block number (device relative) of the buffer
459 * @new: Non-zero if we may create a new buffer
460 * @bhp: the buffer is returned here
461 *
462 * Try to use the gfs2_inode's MRU metadata tree cache.
463 *
464 * Returns: errno
465 */
466
467int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
468 int new, struct buffer_head **bhp)
469{
470 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
471 struct gfs2_glock *gl = ip->i_gl;
472 struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
473 int in_cache = 0;
474
475 spin_lock(&ip->i_spin);
476 if (*bh_slot && (*bh_slot)->b_blocknr == num) {
477 bh = *bh_slot;
478 get_bh(bh);
479 in_cache = 1;
480 }
481 spin_unlock(&ip->i_spin);
482
483 if (!bh)
484 bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
485
486 if (!bh)
487 return -ENOBUFS;
488
489 if (new) {
490 if (gfs2_assert_warn(sdp, height))
491 goto err;
492 meta_prep_new(bh);
493 gfs2_trans_add_bh(ip->i_gl, bh, 1);
494 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
495 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
496 } else {
497 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
498 if (!buffer_uptodate(bh)) {
499 ll_rw_block(READ_META, 1, &bh);
500 if (gfs2_meta_wait(sdp, bh))
501 goto err;
502 }
503 if (gfs2_metatype_check(sdp, bh, mtype))
504 goto err;
505 }
506
507 if (!in_cache) {
508 spin_lock(&ip->i_spin);
509 if (*bh_slot)
510 brelse(*bh_slot);
511 *bh_slot = bh;
512 get_bh(bh);
513 spin_unlock(&ip->i_spin);
514 }
515
516 *bhp = bh;
517 return 0;
518err:
519 brelse(bh);
520 return -EIO;
521}
522
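gfs2_meta_indirect_buffer() consults a one-slot-per-height cache (ip->i_cache[]) before calling getbuf(). A toy model of that lookup, with block numbers standing in for buffer_head pointers (purely illustrative):

#include <stdio.h>
#include <stdint.h>

#define DEMO_MAX_META_HEIGHT 10	/* mirrors GFS2_MAX_META_HEIGHT */

/* One remembered block per metadata-tree height; 0 means empty. */
static uint64_t cache[DEMO_MAX_META_HEIGHT];

static int cache_lookup(int height, uint64_t num)
{
	if (cache[height] == num)
		return 1;	/* hit: the cached buffer can be reused */
	cache[height] = num;	/* miss: remember the newly read block */
	return 0;
}

int main(void)
{
	printf("%d\n", cache_lookup(1, 4103));	/* 0: first access misses */
	printf("%d\n", cache_lookup(1, 4103));	/* 1: repeat access hits */
	printf("%d\n", cache_lookup(1, 9999));	/* 0: new block replaces it */
	return 0;
}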
523/**
524 * gfs2_meta_ra - start readahead on an extent of a file
525 * @gl: the glock the blocks belong to
526 * @dblock: the starting disk block
527 * @extlen: the number of blocks in the extent
528 *
529 * Returns: the first buffer in the extent
530 */
531
532struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
533{
534 struct gfs2_sbd *sdp = gl->gl_sbd;
535 struct inode *aspace = gl->gl_aspace;
536 struct buffer_head *first_bh, *bh;
537 u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
538 sdp->sd_sb.sb_bsize_shift;
539
540 BUG_ON(!extlen);
541
542 if (max_ra < 1)
543 max_ra = 1;
544 if (extlen > max_ra)
545 extlen = max_ra;
546
547 first_bh = getbuf(sdp, aspace, dblock, CREATE);
548
549 if (buffer_uptodate(first_bh))
550 goto out;
551 if (!buffer_locked(first_bh))
552 ll_rw_block(READ_META, 1, &first_bh);
553
554 dblock++;
555 extlen--;
556
557 while (extlen) {
558 bh = getbuf(sdp, aspace, dblock, CREATE);
559
560 if (!buffer_uptodate(bh) && !buffer_locked(bh))
561 ll_rw_block(READA, 1, &bh);
562 brelse(bh);
563 dblock++;
564 extlen--;
565 if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
566 goto out;
567 }
568
569 wait_on_buffer(first_bh);
570out:
571 return first_bh;
572}
573
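The clamp at the top of gfs2_meta_ra() converts the byte-valued gt_max_readahead tunable into blocks and bounds the extent with it. A sketch of just that arithmetic (the tunable value below is an assumed example, not a GFS2 default):

#include <stdio.h>
#include <stdint.h>

static uint32_t clamp_extent(uint32_t extlen, uint32_t max_readahead_bytes,
			     unsigned int bsize_shift)
{
	uint32_t max_ra = max_readahead_bytes >> bsize_shift;

	if (max_ra < 1)
		max_ra = 1;	/* always read at least the first block */
	if (extlen > max_ra)
		extlen = max_ra;
	return extlen;
}

int main(void)
{
	/* 256 KiB of readahead with 4 KiB blocks allows 64 blocks. */
	printf("%u\n", clamp_extent(100, 256 * 1024, 12));	/* 64 */
	printf("%u\n", clamp_extent(10, 256 * 1024, 12));	/* 10 */
	return 0;
}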
574/**
575 * gfs2_meta_syncfs - sync all the buffers in a filesystem
576 * @sdp: the filesystem
577 *
578 */
579
580void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
581{
582 gfs2_log_flush(sdp, NULL);
583 for (;;) {
584 gfs2_ail1_start(sdp, DIO_ALL);
585 if (gfs2_ail1_empty(sdp, DIO_ALL))
586 break;
587 msleep(10);
588 }
589}
590
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13#include <linux/buffer_head.h>
14#include <linux/string.h>
15#include "incore.h"
16
17static inline void gfs2_buffer_clear(struct buffer_head *bh)
18{
19 memset(bh->b_data, 0, bh->b_size);
20}
21
22static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
23{
24 BUG_ON(head > bh->b_size);
25 memset(bh->b_data + head, 0, bh->b_size - head);
26}
27
28static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
29 int to_head,
30 struct buffer_head *from_bh,
31 int from_head)
32{
33 BUG_ON(from_head < to_head);
34 memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
35 from_bh->b_size - from_head);
36 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
37 0, from_head - to_head);
38}
39
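gfs2_buffer_copy_tail() copies everything past the source header to just past the destination's (shorter) header, then zeroes the slack so the destination is fully initialized. A userspace model with plain byte arrays, assuming equal-sized buffers as in the inline above:

#include <stdio.h>
#include <string.h>
#include <assert.h>

static void copy_tail(char *to, size_t size, int to_head,
		      const char *from, int from_head)
{
	assert(from_head >= to_head);
	memcpy(to + to_head, from + from_head, size - from_head);
	/* The copied tail is shorter than the space after to_head;
	   zero the remaining bytes at the end of the destination. */
	memset(to + size + to_head - from_head, 0, from_head - to_head);
}

int main(void)
{
	char from[16] = "HDRHDRpayload!!";	/* 6-byte header, then data */
	char to[16];

	memset(to, 'x', sizeof(to));
	copy_tail(to, sizeof(to), 3, from, 6);	/* destination header: 3 bytes */
	printf("%s\n", to + 3);	/* prints: payload!! */
	return 0;
}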
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
41void gfs2_aspace_put(struct inode *aspace);
42
43void gfs2_meta_inval(struct gfs2_glock *gl);
44void gfs2_meta_sync(struct gfs2_glock *gl);
45
46struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
48 int flags, struct buffer_head **bhp);
49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta);
53void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
54void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
55 struct gfs2_ail *ai);
56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58
59void gfs2_meta_cache_flush(struct gfs2_inode *ip);
60int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
61 int new, struct buffer_head **bhp);
62
63static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
64 struct buffer_head **bhp)
65{
66 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
67}
68
69struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
70void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
71
72#define buffer_busy(bh) \
73((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
74#define buffer_in_io(bh) \
75((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
76
77#endif /* __DIO_DOT_H__ */
78
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem superblock
27 * @data_arg: the mount options string
28 * @remount: non-zero if this is a remount
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
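gfs2_mount_args() tokenizes the option string on commas with strsep() and splits each token at '='. The same parsing skeleton in standalone form:

#define _GNU_SOURCE	/* for strsep() and strdup() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* strsep() modifies its argument, so parse a writable copy. */
	char *data = strdup("lockproto=lock_dlm,debug,quota=on");
	char *options = data, *o, *v;

	while ((o = strsep(&options, ",")) != NULL) {
		if (!*o)
			continue;	/* skip empty tokens such as ",," */

		v = strchr(o, '=');
		if (v)
			*v++ = '\0';	/* split "option=value" in place */

		printf("option '%s' value '%s'\n", o, v ? v : "(none)");
	}

	free(data);
	return 0;
}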
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13struct gfs2_sbd;
14
15int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
16
17#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
20 struct->member);
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
36{
37 const struct gfs2_inum *str = buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
44{
45 struct gfs2_inum *str = buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
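Every on-disk structure gets an _in (disk-to-cpu) and _out (cpu-to-disk) pair like gfs2_inum_in/gfs2_inum_out above. A userspace model of the round trip; be64toh()/htobe64() stand in for the kernel's be64_to_cpu()/cpu_to_be64(), and demo_inum is an illustrative struct, not the real on-disk layout:

#define _DEFAULT_SOURCE	/* for be64toh()/htobe64() on glibc */
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

/* Disk and cpu views share one layout; only byte order differs. */
struct demo_inum {
	uint64_t no_formal_ino;
	uint64_t no_addr;
};

static void demo_inum_in(struct demo_inum *no, const void *buf)
{
	const struct demo_inum *str = buf;

	no->no_formal_ino = be64toh(str->no_formal_ino);
	no->no_addr = be64toh(str->no_addr);
}

static void demo_inum_out(const struct demo_inum *no, void *buf)
{
	struct demo_inum *str = buf;

	str->no_formal_ino = htobe64(no->no_formal_ino);
	str->no_addr = htobe64(no->no_addr);
}

int main(void)
{
	struct demo_inum cpu = { 42, 123456 }, disk, round;

	demo_inum_out(&cpu, &disk);	/* cpu order -> big-endian buffer */
	demo_inum_in(&round, &disk);	/* big-endian buffer -> cpu order */
	printf("%llu %llu\n", (unsigned long long)round.no_formal_ino,
	       (unsigned long long)round.no_addr);	/* 42 123456 */
	return 0;
}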
51static void gfs2_inum_print(const struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
58{
59 const struct gfs2_meta_header *str = buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
67{
68 struct gfs2_meta_header *str = buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
83{
84 const struct gfs2_sb *str = buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
101{
102 const struct gfs2_rindex *str = buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(const struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
124{
125 const struct gfs2_rgrp *str = buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
135{
136 struct gfs2_rgrp *str = buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
148{
149 const struct gfs2_quota *str = buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
157{
158 const struct gfs2_dinode *str = buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, &str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
191{
192 struct gfs2_dinode *str = buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(const struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
255{
256 const struct gfs2_log_header *str = buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
267{
268 const struct gfs2_inum_range *str = buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
275{
276 struct gfs2_inum_range *str = buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
283{
284 const struct gfs2_statfs_change *str = buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
292{
293 struct gfs2_statfs_change *str = buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
301{
302 const struct gfs2_quota_change *str = buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..8d5963c7e123
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,793 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "quota.h"
31#include "trans.h"
32#include "rgrp.h"
33#include "ops_file.h"
34#include "util.h"
35#include "glops.h"
36
37
38static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
39 unsigned int from, unsigned int to)
40{
41 struct buffer_head *head = page_buffers(page);
42 unsigned int bsize = head->b_size;
43 struct buffer_head *bh;
44 unsigned int start, end;
45
46 for (bh = head, start = 0; bh != head || !start;
47 bh = bh->b_this_page, start = end) {
48 end = start + bsize;
49 if (end <= from || start >= to)
50 continue;
51 gfs2_trans_add_bh(ip->i_gl, bh, 0);
52 }
53}
54
55/**
56 * gfs2_get_block - Fills in a buffer head with details about a block
57 * @inode: The inode
58 * @lblock: The block number to look up
59 * @bh_result: The buffer head to return the result in
60 * @create: Non-zero if we may add block to the file
61 *
62 * Returns: errno
63 */
64
65int gfs2_get_block(struct inode *inode, sector_t lblock,
66 struct buffer_head *bh_result, int create)
67{
68 return gfs2_block_map(inode, lblock, create, bh_result);
69}
70
71/**
72 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
73 * @inode: The inode
74 * @lblock: The block number to look up
75 * @bh_result: The buffer head to return the result in
76 * @create: Non-zero if we may add block to the file
77 *
78 * Returns: errno
79 */
80
81static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
82 struct buffer_head *bh_result, int create)
83{
84 int error;
85
86 error = gfs2_block_map(inode, lblock, 0, bh_result);
87 if (error)
88 return error;
89 if (bh_result->b_blocknr == 0)
90 return -EIO;
91 return 0;
92}
93
94static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
95 struct buffer_head *bh_result, int create)
96{
97 return gfs2_block_map(inode, lblock, 0, bh_result);
98}
99
100/**
101 * gfs2_writepage - Write complete page
102 * @page: Page to write
103 *
104 * Returns: errno
105 *
106 * Some of this is copied from block_write_full_page() although we still
107 * call it to do most of the work.
108 */
109
110static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
111{
112 struct inode *inode = page->mapping->host;
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_sbd *sdp = GFS2_SB(inode);
115 loff_t i_size = i_size_read(inode);
116 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
117 unsigned offset;
118 int error;
119 int done_trans = 0;
120
121 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
122 unlock_page(page);
123 return -EIO;
124 }
125 if (current->journal_info)
126 goto out_ignore;
127
128 /* Is the page fully outside i_size? (truncate in progress) */
129 offset = i_size & (PAGE_CACHE_SIZE-1);
130 if (page->index > end_index || (page->index == end_index && !offset)) {
131 page->mapping->a_ops->invalidatepage(page, 0);
132 unlock_page(page);
133 return 0; /* don't care */
134 }
135
136 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
137 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
138 if (error)
139 goto out_ignore;
140 if (!page_has_buffers(page)) {
141 create_empty_buffers(page, inode->i_sb->s_blocksize,
142 (1 << BH_Dirty)|(1 << BH_Uptodate));
143 }
144 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
145 done_trans = 1;
146 }
147 error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
148 if (done_trans)
149 gfs2_trans_end(sdp);
150 gfs2_meta_cache_flush(ip);
151 return error;
152
153out_ignore:
154 redirty_page_for_writepage(wbc, page);
155 unlock_page(page);
156 return 0;
157}
158
159static int zero_readpage(struct page *page)
160{
161 void *kaddr;
162
163 kaddr = kmap_atomic(page, KM_USER0);
164 memset(kaddr, 0, PAGE_CACHE_SIZE);
165 kunmap_atomic(kaddr, KM_USER0);
166
167 SetPageUptodate(page);
168
169 return 0;
170}
171
172/**
173 * stuffed_readpage - Fill in a Linux page with stuffed file data
174 * @ip: the inode
175 * @page: the page
176 *
177 * Returns: errno
178 */
179
180static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
181{
182 struct buffer_head *dibh;
183 void *kaddr;
184 int error;
185
186 /* Only the first page of a stuffed file might contain data */
187 if (unlikely(page->index))
188 return zero_readpage(page);
189
190 error = gfs2_meta_inode_buffer(ip, &dibh);
191 if (error)
192 return error;
193
194 kaddr = kmap_atomic(page, KM_USER0);
195 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
196 ip->i_di.di_size);
197 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
198 kunmap_atomic(kaddr, KM_USER0);
199
200 brelse(dibh);
201
202 SetPageUptodate(page);
203
204 return 0;
205}
206
207
208/**
209 * gfs2_readpage - readpage with locking
210 * @file: The file to read a page for. N.B. This may be NULL if we are
211 * reading an internal file.
212 * @page: The page to read
213 *
214 * Returns: errno
215 */
216
217static int gfs2_readpage(struct file *file, struct page *page)
218{
219 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
220 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
221 struct gfs2_file *gf = NULL;
222 struct gfs2_holder gh;
223 int error;
224 int do_unlock = 0;
225
226 if (likely(file != &gfs2_internal_file_sentinel)) {
227 if (file) {
228 gf = file->private_data;
229 if (test_bit(GFF_EXLOCK, &gf->f_flags))
230 /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
231 goto skip_lock;
232 }
233 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
234 do_unlock = 1;
235 error = gfs2_glock_nq_m_atime(1, &gh);
236 if (unlikely(error))
237 goto out_unlock;
238 }
239
240skip_lock:
241 if (gfs2_is_stuffed(ip)) {
242 error = stuffed_readpage(ip, page);
243 unlock_page(page);
244 } else
245 error = mpage_readpage(page, gfs2_get_block);
246
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 error = -EIO;
249
250 if (do_unlock) {
251 gfs2_glock_dq_m(1, &gh);
252 gfs2_holder_uninit(&gh);
253 }
254out:
255 return error;
256out_unlock:
257 unlock_page(page);
258 if (do_unlock)
259 gfs2_holder_uninit(&gh);
260 goto out;
261}
262
263/**
264 * gfs2_readpages - Read a bunch of pages at once
265 *
266 * Some notes:
267 * 1. This is only for readahead, so we can simply ignore any things
268 * which are slightly inconvenient (such as locking conflicts between
269 * the page lock and the glock) and return having done no I/O. Its
270 * obviously not something we'd want to do on too regular a basis.
271 * Any I/O we ignore at this time will be done via readpage later.
272 * 2. We have to handle stuffed files here too.
273 * 3. mpage_readpages() does most of the heavy lifting in the common case.
274 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
275 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
276 * well as read-ahead.
277 */
278static int gfs2_readpages(struct file *file, struct address_space *mapping,
279 struct list_head *pages, unsigned nr_pages)
280{
281 struct inode *inode = mapping->host;
282 struct gfs2_inode *ip = GFS2_I(inode);
283 struct gfs2_sbd *sdp = GFS2_SB(inode);
284 struct gfs2_holder gh;
285 unsigned page_idx;
286 int ret;
287 int do_unlock = 0;
288
289 if (likely(file != &gfs2_internal_file_sentinel)) {
290 if (file) {
291 struct gfs2_file *gf = file->private_data;
292 if (test_bit(GFF_EXLOCK, &gf->f_flags))
293 goto skip_lock;
294 }
295 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
296 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
297 do_unlock = 1;
298 ret = gfs2_glock_nq_m_atime(1, &gh);
299 if (ret == GLR_TRYFAILED)
300 goto out_noerror;
301 if (unlikely(ret))
302 goto out_unlock;
303 }
304skip_lock:
305 if (gfs2_is_stuffed(ip)) {
306 struct pagevec lru_pvec;
307 pagevec_init(&lru_pvec, 0);
308 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
309 struct page *page = list_entry(pages->prev, struct page, lru);
310 prefetchw(&page->flags);
311 list_del(&page->lru);
312 if (!add_to_page_cache(page, mapping,
313 page->index, GFP_KERNEL)) {
314 ret = stuffed_readpage(ip, page);
315 unlock_page(page);
316 if (!pagevec_add(&lru_pvec, page))
317 __pagevec_lru_add(&lru_pvec);
318 } else {
319 page_cache_release(page);
320 }
321 }
322 pagevec_lru_add(&lru_pvec);
323 ret = 0;
324 } else {
325 /* What we really want to do .... */
326 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
327 }
328
329 if (do_unlock) {
330 gfs2_glock_dq_m(1, &gh);
331 gfs2_holder_uninit(&gh);
332 }
333out:
334 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
335 ret = -EIO;
336 return ret;
337out_noerror:
338 ret = 0;
339out_unlock:
340 /* unlock all pages, we can't do any I/O right now */
341 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
342 struct page *page = list_entry(pages->prev, struct page, lru);
343 list_del(&page->lru);
344 unlock_page(page);
345 page_cache_release(page);
346 }
347 if (do_unlock)
348 gfs2_holder_uninit(&gh);
349 goto out;
350}
351
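The notes above boil down to: readahead is best effort, so a failed try-lock means do no I/O and report success, and readpage() fills the pages in later. A distilled skeleton of that control flow (the types and names here are illustrative, not the GFS2 locking API):

#include <stdio.h>

enum { LOCK_OK, LOCK_TRYFAILED, LOCK_ERROR };

/* Stand-in for a trylock-style cluster lock acquisition. */
static int trylock_glock(int contended)
{
	return contended ? LOCK_TRYFAILED : LOCK_OK;
}

static int readahead(int contended, unsigned int nr_pages)
{
	int ret = trylock_glock(contended);

	if (ret == LOCK_TRYFAILED)
		return 0;	/* contention is not an error: just skip */
	if (ret != LOCK_OK)
		return -1;	/* a real locking failure */

	printf("issuing readahead for %u pages\n", nr_pages);
	/* ... submit the reads, then drop the lock ... */
	return 0;
}

int main(void)
{
	readahead(0, 16);	/* uncontended: readahead proceeds */
	readahead(1, 16);	/* contended: silently skipped */
	return 0;
}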
352/**
353 * gfs2_prepare_write - Prepare to write a page to a file
354 * @file: The file to write to
355 * @page: The page which is to be prepared for writing
356 * @from: From (byte range within page)
357 * @to: To (byte range within page)
358 *
359 * Returns: errno
360 */
361
362static int gfs2_prepare_write(struct file *file, struct page *page,
363 unsigned from, unsigned to)
364{
365 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
366 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
367 unsigned int data_blocks, ind_blocks, rblocks;
368 int alloc_required;
369 int error = 0;
370 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
371 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
372 struct gfs2_alloc *al;
373 unsigned int write_len = to - from;
374
375
376 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
377 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
378 if (error)
379 goto out_uninit;
380
381 gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
382
383 error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required);
384 if (error)
385 goto out_unlock;
386
387
388 ip->i_alloc.al_requested = 0;
389 if (alloc_required) {
390 al = gfs2_alloc_get(ip);
391
392 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
393 if (error)
394 goto out_alloc_put;
395
396 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
397 if (error)
398 goto out_qunlock;
399
400 al->al_requested = data_blocks + ind_blocks;
401 error = gfs2_inplace_reserve(ip);
402 if (error)
403 goto out_qunlock;
404 }
405
406 rblocks = RES_DINODE + ind_blocks;
407 if (gfs2_is_jdata(ip))
408 rblocks += data_blocks ? data_blocks : 1;
409 if (ind_blocks || data_blocks)
410 rblocks += RES_STATFS + RES_QUOTA;
411
412 error = gfs2_trans_begin(sdp, rblocks, 0);
413 if (error)
414 goto out;
415
416 if (gfs2_is_stuffed(ip)) {
417 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
418 error = gfs2_unstuff_dinode(ip, page);
419 if (error == 0)
420 goto prepare_write;
421 } else if (!PageUptodate(page))
422 error = stuffed_readpage(ip, page);
423 goto out;
424 }
425
426prepare_write:
427 error = block_prepare_write(page, from, to, gfs2_get_block);
428
429out:
430 if (error) {
431 gfs2_trans_end(sdp);
432 if (alloc_required) {
433 gfs2_inplace_release(ip);
434out_qunlock:
435 gfs2_quota_unlock(ip);
436out_alloc_put:
437 gfs2_alloc_put(ip);
438 }
439out_unlock:
440 gfs2_glock_dq_m(1, &ip->i_gh);
441out_uninit:
442 gfs2_holder_uninit(&ip->i_gh);
443 }
444
445 return error;
446}
447
448/**
449 * gfs2_commit_write - Commit write to a file
450 * @file: The file to write to
451 * @page: The page containing the data
452 * @from: From (byte range within page)
453 * @to: To (byte range within page)
454 *
455 * Returns: errno
456 */
457
458static int gfs2_commit_write(struct file *file, struct page *page,
459 unsigned from, unsigned to)
460{
461 struct inode *inode = page->mapping->host;
462 struct gfs2_inode *ip = GFS2_I(inode);
463 struct gfs2_sbd *sdp = GFS2_SB(inode);
464 int error = -EOPNOTSUPP;
465 struct buffer_head *dibh;
466 struct gfs2_alloc *al = &ip->i_alloc;
467 struct gfs2_dinode *di;
468
469 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
470 goto fail_nounlock;
471
472 error = gfs2_meta_inode_buffer(ip, &dibh);
473 if (error)
474 goto fail_endtrans;
475
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 di = (struct gfs2_dinode *)dibh->b_data;
478
479 if (gfs2_is_stuffed(ip)) {
480 u64 file_size;
481 void *kaddr;
482
483 file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
484
485 kaddr = kmap_atomic(page, KM_USER0);
486 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
487 kaddr + from, to - from);
488 kunmap_atomic(kaddr, KM_USER0);
489
490 SetPageUptodate(page);
491
492 if (inode->i_size < file_size)
493 i_size_write(inode, file_size);
494 } else {
495 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
496 gfs2_is_jdata(ip))
497 gfs2_page_add_databufs(ip, page, from, to);
498 error = generic_commit_write(file, page, from, to);
499 if (error)
500 goto fail;
501 }
502
503 if (ip->i_di.di_size < inode->i_size) {
504 ip->i_di.di_size = inode->i_size;
505 di->di_size = cpu_to_be64(inode->i_size);
506 }
507
508 di->di_mode = cpu_to_be32(inode->i_mode);
509 di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
510 di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
511 di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
512
513 brelse(dibh);
514 gfs2_trans_end(sdp);
515 if (al->al_requested) {
516 gfs2_inplace_release(ip);
517 gfs2_quota_unlock(ip);
518 gfs2_alloc_put(ip);
519 }
520 gfs2_glock_dq_m(1, &ip->i_gh);
521 gfs2_holder_uninit(&ip->i_gh);
522 return 0;
523
524fail:
525 brelse(dibh);
526fail_endtrans:
527 gfs2_trans_end(sdp);
528 if (al->al_requested) {
529 gfs2_inplace_release(ip);
530 gfs2_quota_unlock(ip);
531 gfs2_alloc_put(ip);
532 }
533 gfs2_glock_dq_m(1, &ip->i_gh);
534 gfs2_holder_uninit(&ip->i_gh);
535fail_nounlock:
536 ClearPageUptodate(page);
537 return error;
538}
539
540/**
541 * gfs2_bmap - Block map function
542 * @mapping: Address space info
543 * @lblock: The block to map
544 *
545 * Returns: The disk address for the block or 0 on hole or error
546 */
547
548static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
549{
550 struct gfs2_inode *ip = GFS2_I(mapping->host);
551 struct gfs2_holder i_gh;
552 sector_t dblock = 0;
553 int error;
554
555 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
556 if (error)
557 return 0;
558
559 if (!gfs2_is_stuffed(ip))
560 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
561
562 gfs2_glock_dq_uninit(&i_gh);
563
564 return dblock;
565}
566
567static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
568{
569 struct gfs2_bufdata *bd;
570
571 gfs2_log_lock(sdp);
572 bd = bh->b_private;
573 if (bd) {
574 bd->bd_bh = NULL;
575 bh->b_private = NULL;
576 }
577 gfs2_log_unlock(sdp);
578
579 lock_buffer(bh);
580 clear_buffer_dirty(bh);
581 bh->b_bdev = NULL;
582 clear_buffer_mapped(bh);
583 clear_buffer_req(bh);
584 clear_buffer_new(bh);
585 clear_buffer_delay(bh);
586 unlock_buffer(bh);
587}
588
589static void gfs2_invalidatepage(struct page *page, unsigned long offset)
590{
591 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
592 struct buffer_head *head, *bh, *next;
593 unsigned int curr_off = 0;
594
595 BUG_ON(!PageLocked(page));
596 if (!page_has_buffers(page))
597 return;
598
599 bh = head = page_buffers(page);
600 do {
601 unsigned int next_off = curr_off + bh->b_size;
602 next = bh->b_this_page;
603
604 if (offset <= curr_off)
605 discard_buffer(sdp, bh);
606
607 curr_off = next_off;
608 bh = next;
609 } while (bh != head);
610
611 if (!offset)
612 try_to_release_page(page, 0);
613
614 return;
615}
616
617static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
618 const struct iovec *iov, loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_mapping->host;
623 struct gfs2_inode *ip = GFS2_I(inode);
624 struct gfs2_holder gh;
625 int rv;
626
627 if (rw == READ)
628 mutex_lock(&inode->i_mutex);
629 /*
630 * Shared lock, even if it's a write, since we do no allocation
631 * on this path. All we need change is atime.
632 */
633 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
634 rv = gfs2_glock_nq_m_atime(1, &gh);
635 if (rv)
636 goto out;
637
638 if (offset > i_size_read(inode))
639 goto out;
640
641 /*
642 * Should we return an error here? I can't see that O_DIRECT for
643 * a journaled file makes any sense. For now we'll silently fall
644 * back to buffered I/O, likewise we do the same for stuffed
645 * files since they are (a) small and (b) unaligned.
646 */
647 if (gfs2_is_jdata(ip))
648 goto out;
649
650 if (gfs2_is_stuffed(ip))
651 goto out;
652
653 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
654 inode->i_sb->s_bdev,
655 iov, offset, nr_segs,
656 gfs2_get_block_direct, NULL);
657out:
658 gfs2_glock_dq_m(1, &gh);
659 gfs2_holder_uninit(&gh);
660 if (rw == READ)
661 mutex_unlock(&inode->i_mutex);
662
663 return rv;
664}
665
666/**
667 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
668 * @bh: the buffer we're stuck on
669 *
670 */
671
672static void stuck_releasepage(struct buffer_head *bh)
673{
674 struct inode *inode = bh->b_page->mapping->host;
675 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
676 struct gfs2_bufdata *bd = bh->b_private;
677 struct gfs2_glock *gl;
678 static unsigned limit = 0;
679
680 if (limit > 3)
681 return;
682 limit++;
683
684 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
685 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
686 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
687 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
688 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
689
690 if (!bd)
691 return;
692
693 gl = bd->bd_gl;
694
695 fs_warn(sdp, "gl = (%u, %llu)\n",
696 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
697
698 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
699 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
700 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
701
702 if (gl->gl_ops == &gfs2_inode_glops) {
703 struct gfs2_inode *ip = gl->gl_object;
704 unsigned int x;
705
706 if (!ip)
707 return;
708
709 fs_warn(sdp, "ip = %llu %llu\n",
710 (unsigned long long)ip->i_num.no_formal_ino,
711 (unsigned long long)ip->i_num.no_addr);
712
713 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
714 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
715 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
716 }
717}
718
719/**
720 * gfs2_releasepage - free the metadata associated with a page
721 * @page: the page that's being released
722 * @gfp_mask: passed from Linux VFS, ignored by us
723 *
724 * Call try_to_free_buffers() if the buffers in this page can be
725 * released.
726 *
727 * Returns: 0
728 */
729
730int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
731{
732 struct inode *aspace = page->mapping->host;
733 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
734 struct buffer_head *bh, *head;
735 struct gfs2_bufdata *bd;
736 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
737
738 if (!page_has_buffers(page))
739 goto out;
740
741 head = bh = page_buffers(page);
742 do {
743 while (atomic_read(&bh->b_count)) {
744 if (!atomic_read(&aspace->i_writecount))
745 return 0;
746
747 if (time_after_eq(jiffies, t)) {
748 stuck_releasepage(bh);
749 /* should we withdraw here? */
750 return 0;
751 }
752
753 yield();
754 }
755
756 gfs2_assert_warn(sdp, !buffer_pinned(bh));
757 gfs2_assert_warn(sdp, !buffer_dirty(bh));
758
759 gfs2_log_lock(sdp);
760 bd = bh->b_private;
761 if (bd) {
762 gfs2_assert_warn(sdp, bd->bd_bh == bh);
763 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
764 gfs2_assert_warn(sdp, !bd->bd_ail);
765 bd->bd_bh = NULL;
766 if (!list_empty(&bd->bd_le.le_list))
767 bd = NULL;
768 bh->b_private = NULL;
769 }
770 gfs2_log_unlock(sdp);
771 if (bd)
772 kmem_cache_free(gfs2_bufdata_cachep, bd);
773
774 bh = bh->b_this_page;
775 } while (bh != head);
776
777out:
778 return try_to_free_buffers(page);
779}
780
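gfs2_releasepage() above is a bounded busy-wait: it yields while a buffer is still referenced and, once gt_stall_secs elapses, gives up and dumps diagnostics instead of hanging. The same pattern as a standalone userspace sketch, with wall-clock time standing in for jiffies:

#include <stdio.h>
#include <time.h>
#include <sched.h>

/* Yield until busy() clears or the deadline passes; returns 0 if we
 * gave up, mirroring the stuck_releasepage() escape hatch above. */
static int wait_until_free(int (*busy)(void), time_t deadline)
{
	while (busy()) {
		if (time(NULL) >= deadline)
			return 0;
		sched_yield();
	}
	return 1;
}

static int always_busy(void)
{
	return 1;
}

int main(void)
{
	/* spins for ~2 seconds, then reports the stall */
	if (!wait_until_free(always_busy, time(NULL) + 2))
		fprintf(stderr, "stuck: would dump diagnostics here\n");
	return 0;
}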
781const struct address_space_operations gfs2_file_aops = {
782 .writepage = gfs2_writepage,
783 .readpage = gfs2_readpage,
784 .readpages = gfs2_readpages,
785 .sync_page = block_sync_page,
786 .prepare_write = gfs2_prepare_write,
787 .commit_write = gfs2_commit_write,
788 .bmap = gfs2_bmap,
789 .invalidatepage = gfs2_invalidatepage,
790 .releasepage = gfs2_releasepage,
791 .direct_IO = gfs2_direct_IO,
792};
793
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern const struct address_space_operations gfs2_file_aops;
18extern int gfs2_get_block(struct inode *inode, sector_t lblock,
19 struct buffer_head *bh_result, int create);
20extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
21
22#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the dentry to revalidate
30 * @nd: lookup intent data from the VFS (unused here)
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86valid:
87 dput(parent);
88 return 1;
89
90invalid_gunlock:
91 gfs2_glock_dq_uninit(&d_gh);
92invalid:
93 if (inode && S_ISDIR(inode->i_mode)) {
94 if (have_submounts(dentry))
95 goto valid;
96 shrink_dcache_parent(dentry);
97 }
98 d_drop(dentry);
99 dput(parent);
100 return 0;
101
102fail_gunlock:
103 gfs2_glock_dq_uninit(&d_gh);
104fail:
105 dput(parent);
106 return 0;
107}
108
109static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
110{
111 str->hash = gfs2_disk_hash(str->name, str->len);
112 return 0;
113}
114
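gfs2_dhash() stores GFS2's on-disk name hash into the qstr, so the dcache and the extendible directory hashing agree on every name. gfs2_disk_hash() appears to be a plain pre/post-inverted CRC32; assuming that holds, the hash of a name can be reproduced in userspace with zlib:

#include <stdio.h>
#include <string.h>
#include <zlib.h>		/* link with -lz */

int main(void)
{
	const char *name = "example";
	/* zlib's crc32() is the same ~0-seeded, ~0-finalised CRC32 */
	unsigned long hash = crc32(0L, (const unsigned char *)name,
				   strlen(name));

	printf("hash(\"%s\") = %#lx\n", name, hash);
	return 0;
}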
115struct dentry_operations gfs2_dops = {
116 .d_revalidate = gfs2_drevalidate,
117 .d_hash = gfs2_dhash,
118};
119
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case GFS2_LARGE_FH_SIZE:
49 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53 fh_obj.imode = be32_to_cpu(fh[8]);
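	/* fall through: a large handle embeds the small one */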
54 case GFS2_SMALL_FH_SIZE:
55 this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < GFS2_SMALL_FH_SIZE ||
76 (connectable && *len < GFS2_LARGE_FH_SIZE))
77 return 255;
78
79 fh[0] = ip->i_num.no_formal_ino >> 32;
80 fh[0] = cpu_to_be32(fh[0]);
81 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
82 fh[1] = cpu_to_be32(fh[1]);
83 fh[2] = ip->i_num.no_addr >> 32;
84 fh[2] = cpu_to_be32(fh[2]);
85 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
86 fh[3] = cpu_to_be32(fh[3]);
87 *len = GFS2_SMALL_FH_SIZE;
88
89 if (!connectable || inode == sb->s_root->d_inode)
90 return *len;
91
92 spin_lock(&dentry->d_lock);
93 inode = dentry->d_parent->d_inode;
94 ip = GFS2_I(inode);
95 igrab(inode);
96 spin_unlock(&dentry->d_lock);
97
98 fh[4] = ip->i_num.no_formal_ino >> 32;
99 fh[4] = cpu_to_be32(fh[4]);
100 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
101 fh[5] = cpu_to_be32(fh[5]);
102 fh[6] = ip->i_num.no_addr >> 32;
103 fh[6] = cpu_to_be32(fh[6]);
104 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
105 fh[7] = cpu_to_be32(fh[7]);
106
107 fh[8] = cpu_to_be32(inode->i_mode);
108 fh[9] = 0; /* pad to double word */
109 *len = GFS2_LARGE_FH_SIZE;
110
111 iput(inode);
112
113 return *len;
114}
115
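Both handle routines above split each 64-bit field (no_formal_ino, no_addr) into two big-endian 32-bit words. A self-contained sketch of that round trip:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl()/ntohl() for the BE words */

static void pack64(uint32_t *fh, uint64_t v)
{
	fh[0] = htonl(v >> 32);
	fh[1] = htonl(v & 0xFFFFFFFF);
}

static uint64_t unpack64(const uint32_t *fh)
{
	return ((uint64_t)ntohl(fh[0]) << 32) | ntohl(fh[1]);
}

int main(void)
{
	uint32_t fh[2];

	pack64(fh, 0x123456789abcdef0ULL);
	/* prints 123456789abcdef0: the value round-trips */
	printf("%llx\n", (unsigned long long)unpack64(fh));
	return 0;
}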
116struct get_name_filldir {
117 struct gfs2_inum inum;
118 char *name;
119};
120
121static int get_name_filldir(void *opaque, const char *name, unsigned int length,
122 u64 offset, struct gfs2_inum *inum,
123 unsigned int type)
124{
125 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
126
127 if (!gfs2_inum_equal(inum, &gnfd->inum))
128 return 0;
129
130 memcpy(gnfd->name, name, length);
131 gnfd->name[length] = 0;
132
133 return 1;
134}
135
136static int gfs2_get_name(struct dentry *parent, char *name,
137 struct dentry *child)
138{
139 struct inode *dir = parent->d_inode;
140 struct inode *inode = child->d_inode;
141 struct gfs2_inode *dip, *ip;
142 struct get_name_filldir gnfd;
143 struct gfs2_holder gh;
144 u64 offset = 0;
145 int error;
146
147 if (!dir)
148 return -EINVAL;
149
150 if (!S_ISDIR(dir->i_mode) || !inode)
151 return -EINVAL;
152
153 dip = GFS2_I(dir);
154 ip = GFS2_I(inode);
155
156 *name = 0;
157 gnfd.inum = ip->i_num;
158 gnfd.name = name;
159
160 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
161 if (error)
162 return error;
163
164 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
165
166 gfs2_glock_dq_uninit(&gh);
167
168 if (!error && !*name)
169 error = -ENOENT;
170
171 return error;
172}
173
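gfs2_get_name() is the reverse lookup NFS reconnection needs: walk the parent directory until an entry's inode number matches the child. Its rough userspace analogue with readdir(), plain d_ino standing in for GFS2's two-part gfs2_inum:

#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	DIR *d;
	struct dirent *de;
	ino_t target;

	if (argc < 3)
		return 2;
	d = opendir(argv[1]);
	if (!d)
		return 1;
	target = (ino_t)strtoull(argv[2], NULL, 0);
	/* scan until the inode number matches, as get_name_filldir does */
	while ((de = readdir(d)) != NULL) {
		if (de->d_ino == target) {
			printf("%s\n", de->d_name);
			break;
		}
	}
	closedir(d);
	return 0;
}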
174static struct dentry *gfs2_get_parent(struct dentry *child)
175{
176 struct qstr dotdot;
177 struct inode *inode;
178 struct dentry *dentry;
179
180 gfs2_str2qstr(&dotdot, "..");
181 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
182
183 if (!inode)
184 return ERR_PTR(-ENOENT);
185 /*
186 * On error, @inode carries the error encoded as a pointer,
187 * which we must propagate as an (invalid) dentry pointer.
188 */
189 if (IS_ERR(inode))
190 return ERR_PTR(PTR_ERR(inode));
191
192 dentry = d_alloc_anon(inode);
193 if (!dentry) {
194 iput(inode);
195 return ERR_PTR(-ENOMEM);
196 }
197
198 return dentry;
199}
200
201static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
202{
203 struct gfs2_sbd *sdp = sb->s_fs_info;
204 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
205 struct gfs2_inum *inum = &fh_obj->this;
206 struct gfs2_holder i_gh, ri_gh, rgd_gh;
207 struct gfs2_rgrpd *rgd;
208 struct inode *inode;
209 struct dentry *dentry;
210 int error;
211
212 /* System files? */
213
214 inode = gfs2_ilookup(sb, inum);
215 if (inode) {
216 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
217 iput(inode);
218 return ERR_PTR(-ESTALE);
219 }
220 goto out_inode;
221 }
222
223 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
224 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
225 &i_gh);
226 if (error)
227 return ERR_PTR(error);
228
229 error = gfs2_rindex_hold(sdp, &ri_gh);
230 if (error)
231 goto fail;
232
233 error = -EINVAL;
234 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
235 if (!rgd)
236 goto fail_rindex;
237
238 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
239 if (error)
240 goto fail_rindex;
241
242 error = -ESTALE;
243 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
244 goto fail_rgd;
245
246 gfs2_glock_dq_uninit(&rgd_gh);
247 gfs2_glock_dq_uninit(&ri_gh);
248
249 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
250 if (!inode)
251 goto fail;
252 if (IS_ERR(inode)) {
253 error = PTR_ERR(inode);
254 goto fail;
255 }
256
257 error = gfs2_inode_refresh(GFS2_I(inode));
258 if (error) {
259 iput(inode);
260 goto fail;
261 }
262
263 error = -EIO;
264 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
265 iput(inode);
266 goto fail;
267 }
268
269 gfs2_glock_dq_uninit(&i_gh);
270
271out_inode:
272 dentry = d_alloc_anon(inode);
273 if (!dentry) {
274 iput(inode);
275 return ERR_PTR(-ENOMEM);
276 }
277
278 return dentry;
279
280fail_rgd:
281 gfs2_glock_dq_uninit(&rgd_gh);
282
283fail_rindex:
284 gfs2_glock_dq_uninit(&ri_gh);
285
286fail:
287 gfs2_glock_dq_uninit(&i_gh);
288 return ERR_PTR(error);
289}
290
291struct export_operations gfs2_export_ops = {
292 .decode_fh = gfs2_decode_fh,
293 .encode_fh = gfs2_encode_fh,
294 .get_name = gfs2_get_name,
295 .get_parent = gfs2_get_parent,
296 .get_dentry = gfs2_get_dentry,
297};
298
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13#define GFS2_SMALL_FH_SIZE 4
14#define GFS2_LARGE_FH_SIZE 10
15
16extern struct export_operations gfs2_export_ops;
17struct gfs2_fh_obj {
18 struct gfs2_inum this;
19 __u32 imode;
20};
21
22#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/lm_interface.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "incore.h"
29#include "bmap.h"
30#include "dir.h"
31#include "glock.h"
32#include "glops.h"
33#include "inode.h"
34#include "lm.h"
35#include "log.h"
36#include "meta_io.h"
37#include "ops_file.h"
38#include "ops_vm.h"
39#include "quota.h"
40#include "rgrp.h"
41#include "trans.h"
42#include "util.h"
43#include "eaops.h"
44
45/* Used by the regular readdir path, as opposed to NFS's get_name */
46struct filldir_reg {
47 struct gfs2_sbd *fdr_sbd;
48 int fdr_prefetch;
49
50 filldir_t fdr_filldir;
51 void *fdr_opaque;
52};
53
54/*
55 * Most fields left uninitialised to catch anybody who tries to
56 * use them. f_flags set to prevent file_accessed() from touching
57 * any other part of this. Its use is purely as a flag so that we
58 * know (in readpage()) whether or not to do locking.
59 */
60struct file gfs2_internal_file_sentinel = {
61 .f_flags = O_NOATIME|O_RDONLY,
62};
63
64static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
65 unsigned long offset, unsigned long size)
66{
67 char *kaddr;
68 unsigned long count = desc->count;
69
70 if (size > count)
71 size = count;
72
73 kaddr = kmap(page);
74 memcpy(desc->arg.buf, kaddr + offset, size);
75 kunmap(page);
76
77 desc->count = count - size;
78 desc->written += size;
79 desc->arg.buf += size;
80 return size;
81}
82
83int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
84 char *buf, loff_t *pos, unsigned size)
85{
86 struct inode *inode = &ip->i_inode;
87 read_descriptor_t desc;
88 desc.written = 0;
89 desc.arg.buf = buf;
90 desc.count = size;
91 desc.error = 0;
92 do_generic_mapping_read(inode->i_mapping, ra_state,
93 &gfs2_internal_file_sentinel, pos, &desc,
94 gfs2_read_actor);
95 return desc.written ? desc.written : desc.error;
96}
97
98/**
99 * gfs2_llseek - seek to a location in a file
100 * @file: the file
101 * @offset: the offset
102 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
103 *
104 * SEEK_END requires the glock for the file because it references the
105 * file's size.
106 *
107 * Returns: The new offset, or errno
108 */
109
110static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
111{
112 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
113 struct gfs2_holder i_gh;
114 loff_t error;
115
116 if (origin == SEEK_END) {
117 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
118 &i_gh);
119 if (!error) {
120 error = remote_llseek(file, offset, origin);
121 gfs2_glock_dq_uninit(&i_gh);
122 }
123 } else
124 error = remote_llseek(file, offset, origin);
125
126 return error;
127}
128
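Any SEEK_END seek from userspace takes the shared-glock path above, since the file size must be current across the cluster before it can be used:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return 1;
	}
	/* SEEK_END: the kernel must have an up-to-date i_size */
	printf("size: %lld\n", (long long)lseek(fd, 0, SEEK_END));
	close(fd);
	return 0;
}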
129/**
130 * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
131 * @opaque: opaque data used by the function
132 * @name: the name of the directory entry
133 * @length: the length of the name
134 * @offset: the entry's offset in the directory
135 * @inum: the inode number the entry points to
136 * @type: the type of inode the entry points to
137 *
138 * Returns: 0 on success, 1 if buffer full
139 */
140
141static int filldir_func(void *opaque, const char *name, unsigned int length,
142 u64 offset, struct gfs2_inum *inum,
143 unsigned int type)
144{
145 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
146 struct gfs2_sbd *sdp = fdr->fdr_sbd;
147 int error;
148
149 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
150 inum->no_addr, type);
151 if (error)
152 return 1;
153
154 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
155 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
156 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
157 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
158 LM_ST_SHARED, LM_FLAG_TRY);
159 }
160
161 return 0;
162}
163
164/**
165 * gfs2_readdir - Read directory entries from a directory
166 * @file: The directory to read from
167 * @dirent: Buffer for dirents
168 * @filldir: Function used to do the copying
169 *
170 * Returns: errno
171 */
172
173static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
174{
175 struct inode *dir = file->f_mapping->host;
176 struct gfs2_inode *dip = GFS2_I(dir);
177 struct filldir_reg fdr;
178 struct gfs2_holder d_gh;
179 u64 offset = file->f_pos;
180 int error;
181
182 fdr.fdr_sbd = GFS2_SB(dir);
183 fdr.fdr_prefetch = 1;
184 fdr.fdr_filldir = filldir;
185 fdr.fdr_opaque = dirent;
186
187 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
188 error = gfs2_glock_nq_atime(&d_gh);
189 if (error) {
190 gfs2_holder_uninit(&d_gh);
191 return error;
192 }
193
194 error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
195
196 gfs2_glock_dq_uninit(&d_gh);
197
198 file->f_pos = offset;
199
200 return error;
201}
202
203/**
204 * fsflags_cvt
205 * @table: A table of 32 u32 flags
206 * @val: a 32 bit value to convert
207 *
208 * This function can be used to convert between fsflags values and
209 * GFS2's own flags values.
210 *
211 * Returns: the converted flags
212 */
213static u32 fsflags_cvt(const u32 *table, u32 val)
214{
215 u32 res = 0;
216 while(val) {
217 if (val & 1)
218 res |= *table;
219 table++;
220 val >>= 1;
221 }
222 return res;
223}
224
225static const u32 fsflags_to_gfs2[32] = {
226 [3] = GFS2_DIF_SYNC,
227 [4] = GFS2_DIF_IMMUTABLE,
228 [5] = GFS2_DIF_APPENDONLY,
229 [7] = GFS2_DIF_NOATIME,
230 [12] = GFS2_DIF_EXHASH,
231 [14] = GFS2_DIF_JDATA,
232 [20] = GFS2_DIF_DIRECTIO,
233};
234
235static const u32 gfs2_to_fsflags[32] = {
236 [gfs2fl_Sync] = FS_SYNC_FL,
237 [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
238 [gfs2fl_AppendOnly] = FS_APPEND_FL,
239 [gfs2fl_NoAtime] = FS_NOATIME_FL,
240 [gfs2fl_ExHash] = FS_INDEX_FL,
241 [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
242 [gfs2fl_Directio] = FS_DIRECTIO_FL,
243 [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
244 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
245};
246
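fsflags_cvt() walks @val one bit at a time and ORs in the table entry for each set bit, which is why the two tables above are indexed by source bit number. A standalone sketch with made-up table values, not the real GFS2_DIF_* constants:

#include <stdio.h>
#include <stdint.h>

static uint32_t fsflags_cvt(const uint32_t *table, uint32_t val)
{
	uint32_t res = 0;

	while (val) {
		if (val & 1)
			res |= *table;
		table++;
		val >>= 1;
	}
	return res;
}

int main(void)
{
	/* made-up mapping: input bit 3 -> 0x8, input bit 4 -> 0x1 */
	static const uint32_t demo[32] = { [3] = 0x8, [4] = 0x1 };

	printf("%#x\n", (unsigned)fsflags_cvt(demo, 1u << 3));	/* 0x8 */
	printf("%#x\n", (unsigned)fsflags_cvt(demo, (1u << 3) | (1u << 4)));	/* 0x9 */
	return 0;
}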
247static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
248{
249 struct inode *inode = filp->f_dentry->d_inode;
250 struct gfs2_inode *ip = GFS2_I(inode);
251 struct gfs2_holder gh;
252 int error;
253 u32 fsflags;
254
255 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
256 error = gfs2_glock_nq_m_atime(1, &gh);
257 if (error)
258 return error;
259
260 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
261 if (put_user(fsflags, ptr))
262 error = -EFAULT;
263
264 gfs2_glock_dq_m(1, &gh);
265 gfs2_holder_uninit(&gh);
266 return error;
267}
268
269/* Flags that can be set by user space */
270#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
271 GFS2_DIF_DIRECTIO| \
272 GFS2_DIF_IMMUTABLE| \
273 GFS2_DIF_APPENDONLY| \
274 GFS2_DIF_NOATIME| \
275 GFS2_DIF_SYNC| \
276 GFS2_DIF_SYSTEM| \
277 GFS2_DIF_INHERIT_DIRECTIO| \
278 GFS2_DIF_INHERIT_JDATA)
279
280/**
281 * do_gfs2_set_flags - set flags on an inode
282 * @filp: file pointer
283 * @reqflags: The flags to set
284 * @mask: Indicates which flags are valid
285 *
286 */
287static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
288{
289 struct inode *inode = filp->f_dentry->d_inode;
290 struct gfs2_inode *ip = GFS2_I(inode);
291 struct gfs2_sbd *sdp = GFS2_SB(inode);
292 struct buffer_head *bh;
293 struct gfs2_holder gh;
294 int error;
295 u32 new_flags, flags;
296
297 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
298 if (error)
299 return error;
300
301 flags = ip->i_di.di_flags;
302 new_flags = (flags & ~mask) | (reqflags & mask);
303 if ((new_flags ^ flags) == 0)
304 goto out;
305
306 if (S_ISDIR(inode->i_mode)) {
307 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
308 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
309 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
310 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
311 }
312
313 error = -EINVAL;
314 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
315 goto out;
316
317 error = -EPERM;
318 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
319 goto out;
320 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
321 goto out;
322 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
323 !capable(CAP_LINUX_IMMUTABLE))
324 goto out;
325 if (!IS_IMMUTABLE(inode)) {
326 error = permission(inode, MAY_WRITE, NULL);
327 if (error)
328 goto out;
329 }
330
331 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
332 if (error)
333 goto out;
334 error = gfs2_meta_inode_buffer(ip, &bh);
335 if (error)
336 goto out_trans_end;
337 gfs2_trans_add_bh(ip->i_gl, bh, 1);
338 ip->i_di.di_flags = new_flags;
339 gfs2_dinode_out(&ip->i_di, bh->b_data);
340 brelse(bh);
341out_trans_end:
342 gfs2_trans_end(sdp);
343out:
344 gfs2_glock_dq_uninit(&gh);
345 return error;
346}
347
348static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
349{
350 u32 fsflags, gfsflags;
351 if (get_user(fsflags, ptr))
352 return -EFAULT;
353 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
354 return do_gfs2_set_flags(filp, gfsflags, ~0);
355}
356
357static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
358{
359 switch(cmd) {
360 case FS_IOC_GETFLAGS:
361 return gfs2_get_flags(filp, (u32 __user *)arg);
362 case FS_IOC_SETFLAGS:
363 return gfs2_set_flags(filp, (u32 __user *)arg);
364 }
365 return -ENOTTY;
366}
367
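These are the ioctls behind lsattr(1) and chattr(1). A minimal userspace reader; the unsigned int matches the u32 that gfs2_get_flags() copies out:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_*_FL */

int main(int argc, char **argv)
{
	unsigned int flags;	/* the handler above copies out a u32 */
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 2;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	printf("flags: %#x%s\n", flags,
	       (flags & FS_APPEND_FL) ? " (append-only)" : "");
	close(fd);
	return 0;
}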
368
369/**
370 * gfs2_mmap - set up a memory mapping for a file
371 * @file: The file to map
372 * @vma: The VMA which describes the mapping
373 *
374 * Returns: 0 or error code
375 */
376
377static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
378{
379 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
380 struct gfs2_holder i_gh;
381 int error;
382
383 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
384 error = gfs2_glock_nq_atime(&i_gh);
385 if (error) {
386 gfs2_holder_uninit(&i_gh);
387 return error;
388 }
389
390 /* This is VM_MAYWRITE instead of VM_WRITE because a call
391 to mprotect() can turn on VM_WRITE later. */
392
393 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
394 (VM_MAYSHARE | VM_MAYWRITE))
395 vma->vm_ops = &gfs2_vm_ops_sharewrite;
396 else
397 vma->vm_ops = &gfs2_vm_ops_private;
398
399 gfs2_glock_dq_uninit(&i_gh);
400
401 return error;
402}
403
404/**
405 * gfs2_open - open a file
406 * @inode: the inode to open
407 * @file: the struct file for this opening
408 *
409 * Returns: errno
410 */
411
412static int gfs2_open(struct inode *inode, struct file *file)
413{
414 struct gfs2_inode *ip = GFS2_I(inode);
415 struct gfs2_holder i_gh;
416 struct gfs2_file *fp;
417 int error;
418
419 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
420 if (!fp)
421 return -ENOMEM;
422
423 mutex_init(&fp->f_fl_mutex);
424
425 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
426 file->private_data = fp;
427
428 if (S_ISREG(ip->i_di.di_mode)) {
429 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
430 &i_gh);
431 if (error)
432 goto fail;
433
434 if (!(file->f_flags & O_LARGEFILE) &&
435 ip->i_di.di_size > MAX_NON_LFS) {
436 error = -EFBIG;
437 goto fail_gunlock;
438 }
439
440 /* Listen to the Direct I/O flag */
441
442 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
443 file->f_flags |= O_DIRECT;
444
445 gfs2_glock_dq_uninit(&i_gh);
446 }
447
448 return 0;
449
450fail_gunlock:
451 gfs2_glock_dq_uninit(&i_gh);
452fail:
453 file->private_data = NULL;
454 kfree(fp);
455 return error;
456}
457
458/**
459 * gfs2_close - called to close a struct file
460 * @inode: the inode the struct file belongs to
461 * @file: the struct file being closed
462 *
463 * Returns: errno
464 */
465
466static int gfs2_close(struct inode *inode, struct file *file)
467{
468 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
469 struct gfs2_file *fp;
470
471 fp = file->private_data;
472 file->private_data = NULL;
473
474 if (gfs2_assert_warn(sdp, fp))
475 return -EIO;
476
477 kfree(fp);
478
479 return 0;
480}
481
482/**
483 * gfs2_fsync - sync the dirty data for a file (across the cluster)
484 * @file: the file that points to the dentry (we ignore this)
485 * @dentry: the dentry that points to the inode to sync
 * @datasync: non-zero for fdatasync() (ignored; the log is flushed regardless)
486 *
487 * Returns: errno
488 */
489
490static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
491{
492 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
493
494 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
495
496 return 0;
497}
498
499/**
500 * gfs2_lock - acquire/release a posix lock on a file
501 * @file: the file pointer
502 * @cmd: either modify or retrieve lock state, possibly wait
503 * @fl: type and range of lock
504 *
505 * Returns: errno
506 */
507
508static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
509{
510 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
511 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
512 struct lm_lockname name =
513 { .ln_number = ip->i_num.no_addr,
514 .ln_type = LM_TYPE_PLOCK };
515
516 if (!(fl->fl_flags & FL_POSIX))
517 return -ENOLCK;
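	/* setgid without group-execute means mandatory locking, unsupported */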
518 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
519 return -ENOLCK;
520
521 if (sdp->sd_args.ar_localflocks) {
522 if (IS_GETLK(cmd)) {
523 struct file_lock tmp;
524 int ret;
525 ret = posix_test_lock(file, fl, &tmp);
526 fl->fl_type = F_UNLCK;
527 if (ret)
528 memcpy(fl, &tmp, sizeof(struct file_lock));
529 return 0;
530 } else {
531 return posix_lock_file_wait(file, fl);
532 }
533 }
534
535 if (IS_GETLK(cmd))
536 return gfs2_lm_plock_get(sdp, &name, file, fl);
537 else if (fl->fl_type == F_UNLCK)
538 return gfs2_lm_punlock(sdp, &name, file, fl);
539 else
540 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
541}
542
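Userspace reaches gfs2_lock() through fcntl(2). A whole-file POSIX write lock which, without the localflocks mount option, becomes a cluster-wide plock via the lock module:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,	/* 0 means "to end of file" */
	};
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDWR)) < 0)
		return 2;
	if (fcntl(fd, F_SETLKW, &fl) < 0) {	/* wait for the lock */
		perror("F_SETLKW");
		return 1;
	}
	puts("got a cluster-wide POSIX write lock");
	fl.l_type = F_UNLCK;
	fcntl(fd, F_SETLK, &fl);
	close(fd);
	return 0;
}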
543static int do_flock(struct file *file, int cmd, struct file_lock *fl)
544{
545 struct gfs2_file *fp = file->private_data;
546 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
547 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
548 struct gfs2_glock *gl;
549 unsigned int state;
550 int flags;
551 int error = 0;
552
553 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
554 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
555
556 mutex_lock(&fp->f_fl_mutex);
557
558 gl = fl_gh->gh_gl;
559 if (gl) {
560 if (fl_gh->gh_state == state)
561 goto out;
562 gfs2_glock_hold(gl);
563 flock_lock_file_wait(file,
564 &(struct file_lock){.fl_type = F_UNLCK});
565 gfs2_glock_dq_uninit(fl_gh);
566 } else {
567 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
568 ip->i_num.no_addr, &gfs2_flock_glops,
569 CREATE, &gl);
570 if (error)
571 goto out;
572 }
573
574 gfs2_holder_init(gl, state, flags, fl_gh);
575 gfs2_glock_put(gl);
576
577 error = gfs2_glock_nq(fl_gh);
578 if (error) {
579 gfs2_holder_uninit(fl_gh);
580 if (error == GLR_TRYFAILED)
581 error = -EAGAIN;
582 } else {
583 error = flock_lock_file_wait(file, fl);
584 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
585 }
586
587out:
588 mutex_unlock(&fp->f_fl_mutex);
589 return error;
590}
591
592static void do_unflock(struct file *file, struct file_lock *fl)
593{
594 struct gfs2_file *fp = file->private_data;
595 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
596
597 mutex_lock(&fp->f_fl_mutex);
598 flock_lock_file_wait(file, fl);
599 if (fl_gh->gh_gl)
600 gfs2_glock_dq_uninit(fl_gh);
601 mutex_unlock(&fp->f_fl_mutex);
602}
603
604/**
605 * gfs2_flock - acquire/release a flock lock on a file
606 * @file: the file pointer
607 * @cmd: either modify or retrieve lock state, possibly wait
608 * @fl: type and range of lock
609 *
610 * Returns: errno
611 */
612
613static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
614{
615 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
616 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
617
618 if (!(fl->fl_flags & FL_FLOCK))
619 return -ENOLCK;
620 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
621 return -ENOLCK;
622
623 if (sdp->sd_args.ar_localflocks)
624 return flock_lock_file_wait(file, fl);
625
626 if (fl->fl_type == F_UNLCK) {
627 do_unflock(file, fl);
628 return 0;
629 } else {
630 return do_flock(file, cmd, fl);
631 }
632}
633
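And the flock(2) side, which do_flock() above maps onto a dedicated flock glock:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/file.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 2;
	if (flock(fd, LOCK_EX | LOCK_NB) < 0) {	/* LOCK_NB: the LM_FLAG_TRY path */
		perror("flock");
		return 1;
	}
	puts("got an exclusive flock");
	flock(fd, LOCK_UN);
	close(fd);
	return 0;
}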
634const struct file_operations gfs2_file_fops = {
635 .llseek = gfs2_llseek,
636 .read = do_sync_read,
637 .aio_read = generic_file_aio_read,
638 .write = do_sync_write,
639 .aio_write = generic_file_aio_write,
640 .unlocked_ioctl = gfs2_ioctl,
641 .mmap = gfs2_mmap,
642 .open = gfs2_open,
643 .release = gfs2_close,
644 .fsync = gfs2_fsync,
645 .lock = gfs2_lock,
646 .sendfile = generic_file_sendfile,
647 .flock = gfs2_flock,
648 .splice_read = generic_file_splice_read,
649 .splice_write = generic_file_splice_write,
650};
651
652const struct file_operations gfs2_dir_fops = {
653 .readdir = gfs2_readdir,
654 .unlocked_ioctl = gfs2_ioctl,
655 .open = gfs2_open,
656 .release = gfs2_close,
657 .fsync = gfs2_fsync,
658 .lock = gfs2_lock,
659 .flock = gfs2_flock,
660};
661
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12
13#include <linux/fs.h>
14struct gfs2_inode;
15
16extern struct file gfs2_internal_file_sentinel;
17extern int gfs2_internal_read(struct gfs2_inode *ip,
18 struct file_ra_state *ra_state,
19 char *buf, loff_t *pos, unsigned size);
20
21extern const struct file_operations gfs2_file_fops;
22extern const struct file_operations gfs2_dir_fops;
23
24#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..882873a6bd69
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,925 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/blkdev.h>
16#include <linux/kthread.h>
17#include <linux/namei.h>
18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "daemon.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "lm.h"
29#include "mount.h"
30#include "ops_export.h"
31#include "ops_fstype.h"
32#include "ops_super.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38
39#define DO 0
40#define UNDO 1
41
42extern struct dentry_operations gfs2_dops;
43
44static struct gfs2_sbd *init_sbd(struct super_block *sb)
45{
46 struct gfs2_sbd *sdp;
47
48 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
49 if (!sdp)
50 return NULL;
51
52 sb->s_fs_info = sdp;
53 sdp->sd_vfs = sb;
54
55 gfs2_tune_init(&sdp->sd_tune);
56
57 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
58 spin_lock_init(&sdp->sd_reclaim_lock);
59 init_waitqueue_head(&sdp->sd_reclaim_wq);
60
61 mutex_init(&sdp->sd_inum_mutex);
62 spin_lock_init(&sdp->sd_statfs_spin);
63 mutex_init(&sdp->sd_statfs_mutex);
64
65 spin_lock_init(&sdp->sd_rindex_spin);
66 mutex_init(&sdp->sd_rindex_mutex);
67 INIT_LIST_HEAD(&sdp->sd_rindex_list);
68 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
69 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
70
71 INIT_LIST_HEAD(&sdp->sd_jindex_list);
72 spin_lock_init(&sdp->sd_jindex_spin);
73 mutex_init(&sdp->sd_jindex_mutex);
74
75 INIT_LIST_HEAD(&sdp->sd_quota_list);
76 spin_lock_init(&sdp->sd_quota_spin);
77 mutex_init(&sdp->sd_quota_mutex);
78
79 spin_lock_init(&sdp->sd_log_lock);
80
81 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
82 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
83 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
84 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
85 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
86
87 mutex_init(&sdp->sd_log_reserve_mutex);
88 INIT_LIST_HEAD(&sdp->sd_ail1_list);
89 INIT_LIST_HEAD(&sdp->sd_ail2_list);
90
91 init_rwsem(&sdp->sd_log_flush_lock);
92 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
93
94 INIT_LIST_HEAD(&sdp->sd_revoke_list);
95
96 mutex_init(&sdp->sd_freeze_lock);
97
98 return sdp;
99}
100
101static void init_vfs(struct super_block *sb, unsigned noatime)
102{
103 struct gfs2_sbd *sdp = sb->s_fs_info;
104
105 sb->s_magic = GFS2_MAGIC;
106 sb->s_op = &gfs2_super_ops;
107 sb->s_export_op = &gfs2_export_ops;
108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109
110 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
111 set_bit(noatime, &sdp->sd_flags);
112
113 /* Don't let the VFS update atimes. GFS2 handles this itself. */
114 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
115}
116
117static int init_names(struct gfs2_sbd *sdp, int silent)
118{
119 struct page *page;
120 char *proto, *table;
121 int error = 0;
122
123 proto = sdp->sd_args.ar_lockproto;
124 table = sdp->sd_args.ar_locktable;
125
126 /* Try to autodetect */
127
128 if (!proto[0] || !table[0]) {
129 struct gfs2_sb *sb;
130 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
131 if (!page)
132 return -ENOBUFS;
133 sb = kmap(page);
134 gfs2_sb_in(&sdp->sd_sb, sb);
135 kunmap(page);
136 __free_page(page);
137
138 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
139 if (error)
140 goto out;
141
142 if (!proto[0])
143 proto = sdp->sd_sb.sb_lockproto;
144 if (!table[0])
145 table = sdp->sd_sb.sb_locktable;
146 }
147
148 if (!table[0])
149 table = sdp->sd_vfs->s_id;
150
151 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
152 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
153
154out:
155 return error;
156}
157
158static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
159 int undo)
160{
161 struct task_struct *p;
162 int error = 0;
163
164 if (undo)
165 goto fail_trans;
166
167 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
168 error = IS_ERR(p);
169 if (error) {
170 fs_err(sdp, "can't start scand thread: %d\n", error);
171 return error;
172 }
173 sdp->sd_scand_process = p;
174
175 for (sdp->sd_glockd_num = 0;
176 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
177 sdp->sd_glockd_num++) {
178 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
179 error = IS_ERR(p);
180 if (error) {
181 fs_err(sdp, "can't start glockd thread: %d\n", error);
182 goto fail;
183 }
184 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
185 }
186
187 error = gfs2_glock_nq_num(sdp,
188 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
189 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
190 mount_gh);
191 if (error) {
192 fs_err(sdp, "can't acquire mount glock: %d\n", error);
193 goto fail;
194 }
195
196 error = gfs2_glock_nq_num(sdp,
197 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
198 LM_ST_SHARED,
199 LM_FLAG_NOEXP | GL_EXACT,
200 &sdp->sd_live_gh);
201 if (error) {
202 fs_err(sdp, "can't acquire live glock: %d\n", error);
203 goto fail_mount;
204 }
205
206 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
207 CREATE, &sdp->sd_rename_gl);
208 if (error) {
209 fs_err(sdp, "can't create rename glock: %d\n", error);
210 goto fail_live;
211 }
212
213 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
214 CREATE, &sdp->sd_trans_gl);
215 if (error) {
216 fs_err(sdp, "can't create transaction glock: %d\n", error);
217 goto fail_rename;
218 }
219 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
220
221 return 0;
222
223fail_trans:
224 gfs2_glock_put(sdp->sd_trans_gl);
225fail_rename:
226 gfs2_glock_put(sdp->sd_rename_gl);
227fail_live:
228 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
229fail_mount:
230 gfs2_glock_dq_uninit(mount_gh);
231fail:
232 while (sdp->sd_glockd_num--)
233 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
234
235 kthread_stop(sdp->sd_scand_process);
236 return error;
237}
238
239static struct inode *gfs2_lookup_root(struct super_block *sb,
240 struct gfs2_inum *inum)
241{
242 return gfs2_inode_lookup(sb, inum, DT_DIR);
243}
244
245static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
246{
247 struct super_block *sb = sdp->sd_vfs;
248 struct gfs2_holder sb_gh;
249 struct gfs2_inum *inum;
250 struct inode *inode;
251 int error = 0;
252
253 if (undo) {
254 if (sb->s_root) {
255 dput(sb->s_root);
256 sb->s_root = NULL;
257 }
258 return 0;
259 }
260
261 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
262 LM_ST_SHARED, 0, &sb_gh);
263 if (error) {
264 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
265 return error;
266 }
267
268 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
269 if (error) {
270 fs_err(sdp, "can't read superblock: %d\n", error);
271 goto out;
272 }
273
274 /* Set up the buffer cache and SB for real */
275 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
276 error = -EINVAL;
277 fs_err(sdp, "FS block size (%u) is too small for device "
278 "block size (%u)\n",
279 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
280 goto out;
281 }
282 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
283 error = -EINVAL;
284 fs_err(sdp, "FS block size (%u) is too big for machine "
285 "page size (%u)\n",
286 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
287 goto out;
288 }
289 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
290
291 /* Get the root inode */
292 inum = &sdp->sd_sb.sb_root_dir;
293 if (sb->s_type == &gfs2meta_fs_type)
294 inum = &sdp->sd_sb.sb_master_dir;
295 inode = gfs2_lookup_root(sb, inum);
296 if (IS_ERR(inode)) {
297 error = PTR_ERR(inode);
298 fs_err(sdp, "can't read in root inode: %d\n", error);
299 goto out;
300 }
301
302 sb->s_root = d_alloc_root(inode);
303 if (!sb->s_root) {
304 fs_err(sdp, "can't get root dentry\n");
305 error = -ENOMEM;
306 iput(inode);
307 } else
308 sb->s_root->d_op = &gfs2_dops;
309out:
310 gfs2_glock_dq_uninit(&sb_gh);
311 return error;
312}
313
314static int init_journal(struct gfs2_sbd *sdp, int undo)
315{
316 struct gfs2_holder ji_gh;
317 struct task_struct *p;
318 struct gfs2_inode *ip;
319 int jindex = 1;
320 int error = 0;
321
322 if (undo) {
323 jindex = 0;
324 goto fail_recoverd;
325 }
326
327 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
328 if (IS_ERR(sdp->sd_jindex)) {
329 fs_err(sdp, "can't lookup journal index: %d\n", (int)PTR_ERR(sdp->sd_jindex));
330 return PTR_ERR(sdp->sd_jindex);
331 }
332 ip = GFS2_I(sdp->sd_jindex);
333 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
334
335 /* Load in the journal index special file */
336
337 error = gfs2_jindex_hold(sdp, &ji_gh);
338 if (error) {
339 fs_err(sdp, "can't read journal index: %d\n", error);
340 goto fail;
341 }
342
343 error = -EINVAL;
344 if (!gfs2_jindex_size(sdp)) {
345 fs_err(sdp, "no journals!\n");
346 goto fail_jindex;
347 }
348
349 if (sdp->sd_args.ar_spectator) {
350 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
351 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
352 } else {
353 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
354 fs_err(sdp, "can't mount journal #%u\n",
355 sdp->sd_lockstruct.ls_jid);
356 fs_err(sdp, "there are only %u journals (0 - %u)\n",
357 gfs2_jindex_size(sdp),
358 gfs2_jindex_size(sdp) - 1);
359 goto fail_jindex;
360 }
361 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
362
363 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
364 &gfs2_journal_glops,
365 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
366 &sdp->sd_journal_gh);
367 if (error) {
368 fs_err(sdp, "can't acquire journal glock: %d\n", error);
369 goto fail_jindex;
370 }
371
372 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
373 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
374 LM_FLAG_NOEXP | GL_EXACT,
375 &sdp->sd_jinode_gh);
376 if (error) {
377 fs_err(sdp, "can't acquire journal inode glock: %d\n",
378 error);
379 goto fail_journal_gh;
380 }
381
382 error = gfs2_jdesc_check(sdp->sd_jdesc);
383 if (error) {
384 fs_err(sdp, "my journal (%u) is bad: %d\n",
385 sdp->sd_jdesc->jd_jid, error);
386 goto fail_jinode_gh;
387 }
388 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
389 }
390
391 if (sdp->sd_lockstruct.ls_first) {
392 unsigned int x;
393 for (x = 0; x < sdp->sd_journals; x++) {
394 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
395 if (error) {
396 fs_err(sdp, "error recovering journal %u: %d\n",
397 x, error);
398 goto fail_jinode_gh;
399 }
400 }
401
402 gfs2_lm_others_may_mount(sdp);
403 } else if (!sdp->sd_args.ar_spectator) {
404 error = gfs2_recover_journal(sdp->sd_jdesc);
405 if (error) {
406 fs_err(sdp, "error recovering my journal: %d\n", error);
407 goto fail_jinode_gh;
408 }
409 }
410
411 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
412 gfs2_glock_dq_uninit(&ji_gh);
413 jindex = 0;
414
415 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
416 error = IS_ERR(p);
417 if (error) {
418 fs_err(sdp, "can't start recoverd thread: %d\n", error);
419 goto fail_jinode_gh;
420 }
421 sdp->sd_recoverd_process = p;
422
423 return 0;
424
425fail_recoverd:
426 kthread_stop(sdp->sd_recoverd_process);
427fail_jinode_gh:
428 if (!sdp->sd_args.ar_spectator)
429 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
430fail_journal_gh:
431 if (!sdp->sd_args.ar_spectator)
432 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
433fail_jindex:
434 gfs2_jindex_free(sdp);
435 if (jindex)
436 gfs2_glock_dq_uninit(&ji_gh);
437fail:
438 iput(sdp->sd_jindex);
439 return error;
440}
441
442
443static int init_inodes(struct gfs2_sbd *sdp, int undo)
444{
445 int error = 0;
446 struct gfs2_inode *ip;
447 struct inode *inode;
448
449 if (undo)
450 goto fail_qinode;
451
452 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
453 if (IS_ERR(inode)) {
454 error = PTR_ERR(inode);
455 fs_err(sdp, "can't read in master directory: %d\n", error);
456 goto fail;
457 }
458 sdp->sd_master_dir = inode;
459
460 error = init_journal(sdp, undo);
461 if (error)
462 goto fail_master;
463
464 /* Read in the master inode number inode */
465 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
466 if (IS_ERR(sdp->sd_inum_inode)) {
467 error = PTR_ERR(sdp->sd_inum_inode);
468 fs_err(sdp, "can't read in inum inode: %d\n", error);
469 goto fail_journal;
470 }
471
472
473 /* Read in the master statfs inode */
474 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
475 if (IS_ERR(sdp->sd_statfs_inode)) {
476 error = PTR_ERR(sdp->sd_statfs_inode);
477 fs_err(sdp, "can't read in statfs inode: %d\n", error);
478 goto fail_inum;
479 }
480
481 /* Read in the resource index inode */
482 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
483 if (IS_ERR(sdp->sd_rindex)) {
484 error = PTR_ERR(sdp->sd_rindex);
485 fs_err(sdp, "can't get resource index inode: %d\n", error);
486 goto fail_statfs;
487 }
488 ip = GFS2_I(sdp->sd_rindex);
489 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
490 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
491
492 /* Read in the quota inode */
493 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
494 if (IS_ERR(sdp->sd_quota_inode)) {
495 error = PTR_ERR(sdp->sd_quota_inode);
496 fs_err(sdp, "can't get quota file inode: %d\n", error);
497 goto fail_rindex;
498 }
499 return 0;
500
501fail_qinode:
502 iput(sdp->sd_quota_inode);
503fail_rindex:
504 gfs2_clear_rgrpd(sdp);
505 iput(sdp->sd_rindex);
506fail_statfs:
507 iput(sdp->sd_statfs_inode);
508fail_inum:
509 iput(sdp->sd_inum_inode);
510fail_journal:
511 init_journal(sdp, UNDO);
512fail_master:
513 iput(sdp->sd_master_dir);
514fail:
515 return error;
516}
517
518static int init_per_node(struct gfs2_sbd *sdp, int undo)
519{
520 struct inode *pn = NULL;
521 char buf[30];
522 int error = 0;
523 struct gfs2_inode *ip;
524
525 if (sdp->sd_args.ar_spectator)
526 return 0;
527
528 if (undo)
529 goto fail_qc_gh;
530
531 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
532 if (IS_ERR(pn)) {
533 error = PTR_ERR(pn);
534 fs_err(sdp, "can't find per_node directory: %d\n", error);
535 return error;
536 }
537
538 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
539 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
540 if (IS_ERR(sdp->sd_ir_inode)) {
541 error = PTR_ERR(sdp->sd_ir_inode);
542 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
543 goto fail;
544 }
545
546 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
547 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
548 if (IS_ERR(sdp->sd_sc_inode)) {
549 error = PTR_ERR(sdp->sd_sc_inode);
550 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
551 goto fail_ir_i;
552 }
553
554 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
555 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
556 if (IS_ERR(sdp->sd_qc_inode)) {
557 error = PTR_ERR(sdp->sd_qc_inode);
558 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
559 goto fail_ut_i;
560 }
561
562 iput(pn);
563 pn = NULL;
564
565 ip = GFS2_I(sdp->sd_ir_inode);
566 error = gfs2_glock_nq_init(ip->i_gl,
567 LM_ST_EXCLUSIVE, 0,
568 &sdp->sd_ir_gh);
569 if (error) {
570 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
571 goto fail_qc_i;
572 }
573
574 ip = GFS2_I(sdp->sd_sc_inode);
575 error = gfs2_glock_nq_init(ip->i_gl,
576 LM_ST_EXCLUSIVE, 0,
577 &sdp->sd_sc_gh);
578 if (error) {
579 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
580 goto fail_ir_gh;
581 }
582
583 ip = GFS2_I(sdp->sd_qc_inode);
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_EXCLUSIVE, 0,
586 &sdp->sd_qc_gh);
587 if (error) {
588 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
589 goto fail_ut_gh;
590 }
591
592 return 0;
593
594fail_qc_gh:
595 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
596fail_ut_gh:
597 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
598fail_ir_gh:
599 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
600fail_qc_i:
601 iput(sdp->sd_qc_inode);
602fail_ut_i:
603 iput(sdp->sd_sc_inode);
604fail_ir_i:
605 iput(sdp->sd_ir_inode);
606fail:
607 if (pn)
608 iput(pn);
609 return error;
610}
611
612static int init_threads(struct gfs2_sbd *sdp, int undo)
613{
614 struct task_struct *p;
615 int error = 0;
616
617 if (undo)
618 goto fail_quotad;
619
620 sdp->sd_log_flush_time = jiffies;
621 sdp->sd_jindex_refresh_time = jiffies;
622
623 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
624 error = IS_ERR(p);
625 if (error) {
626 fs_err(sdp, "can't start logd thread: %d\n", error);
627 return error;
628 }
629 sdp->sd_logd_process = p;
630
631 sdp->sd_statfs_sync_time = jiffies;
632 sdp->sd_quota_sync_time = jiffies;
633
634 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
635 error = IS_ERR(p);
636 if (error) {
637 fs_err(sdp, "can't start quotad thread: %d\n", error);
638 goto fail;
639 }
640 sdp->sd_quotad_process = p;
641
642 return 0;
643
644
645fail_quotad:
646 kthread_stop(sdp->sd_quotad_process);
647fail:
648 kthread_stop(sdp->sd_logd_process);
649 return error;
650}
651
652/**
653 * fill_super - Read in superblock
654 * @sb: The VFS superblock
655 * @data: Mount options
656 * @silent: Don't complain if it's not a GFS2 filesystem
657 *
658 * Returns: errno
659 */
660
661static int fill_super(struct super_block *sb, void *data, int silent)
662{
663 struct gfs2_sbd *sdp;
664 struct gfs2_holder mount_gh;
665 int error;
666
667 sdp = init_sbd(sb);
668 if (!sdp) {
669 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
670 return -ENOMEM;
671 }
672
673 error = gfs2_mount_args(sdp, (char *)data, 0);
674 if (error) {
675 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
676 goto fail;
677 }
678
679 init_vfs(sb, SDF_NOATIME);
680
681 /* Set up the buffer cache and fill in some fake block size values
682 to allow us to read-in the on-disk superblock. */
683 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
684 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
685 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
686 GFS2_BASIC_BLOCK_SHIFT;
687 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
688
689 error = init_names(sdp, silent);
690 if (error)
691 goto fail;
692
693 error = gfs2_sys_fs_add(sdp);
694 if (error)
695 goto fail;
696
697 error = gfs2_lm_mount(sdp, silent);
698 if (error)
699 goto fail_sys;
700
701 error = init_locking(sdp, &mount_gh, DO);
702 if (error)
703 goto fail_lm;
704
705 error = init_sb(sdp, silent, DO);
706 if (error)
707 goto fail_locking;
708
709 error = init_inodes(sdp, DO);
710 if (error)
711 goto fail_sb;
712
713 error = init_per_node(sdp, DO);
714 if (error)
715 goto fail_inodes;
716
717 error = gfs2_statfs_init(sdp);
718 if (error) {
719 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
720 goto fail_per_node;
721 }
722
723 error = init_threads(sdp, DO);
724 if (error)
725 goto fail_per_node;
726
727 if (!(sb->s_flags & MS_RDONLY)) {
728 error = gfs2_make_fs_rw(sdp);
729 if (error) {
730 fs_err(sdp, "can't make FS RW: %d\n", error);
731 goto fail_threads;
732 }
733 }
734
735 gfs2_glock_dq_uninit(&mount_gh);
736
737 return 0;
738
739fail_threads:
740 init_threads(sdp, UNDO);
741fail_per_node:
742 init_per_node(sdp, UNDO);
743fail_inodes:
744 init_inodes(sdp, UNDO);
745fail_sb:
746 init_sb(sdp, 0, UNDO);
747fail_locking:
748 init_locking(sdp, &mount_gh, UNDO);
749fail_lm:
750 gfs2_gl_hash_clear(sdp, WAIT);
751 gfs2_lm_unmount(sdp);
752 while (invalidate_inodes(sb))
753 yield();
754fail_sys:
755 gfs2_sys_fs_del(sdp);
756fail:
757 kfree(sdp);
758 sb->s_fs_info = NULL;
759 return error;
760}
761
762static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
763 const char *dev_name, void *data, struct vfsmount *mnt)
764{
765 struct super_block *sb;
766 struct gfs2_sbd *sdp;
767 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
768 if (error)
769 goto out;
770 sb = mnt->mnt_sb;
771 sdp = sb->s_fs_info;
772 sdp->sd_gfs2mnt = mnt;
773out:
774 return error;
775}
776
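For reference, a mount(2) call that would drive this path (run as root). The device, mount point and lock table below are hypothetical; lockproto= and locktable= are among the options gfs2_mount_args() parses via fill_super():

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* hypothetical device, target and cluster:fsname lock table */
	if (mount("/dev/sdb1", "/mnt/gfs2", "gfs2", MS_NOATIME,
		  "lockproto=lock_dlm,locktable=mycluster:myfs") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}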
777static int fill_super_meta(struct super_block *sb, struct super_block *new,
778 void *data, int silent)
779{
780 struct gfs2_sbd *sdp = sb->s_fs_info;
781 struct inode *inode;
782 int error = 0;
783
784 new->s_fs_info = sdp;
785 sdp->sd_vfs_meta = sb;
786
787 init_vfs(new, SDF_NOATIME);
788
789 /* Get the master inode */
790 inode = igrab(sdp->sd_master_dir);
791
792 new->s_root = d_alloc_root(inode);
793 if (!new->s_root) {
794 fs_err(sdp, "can't get root dentry\n");
795 error = -ENOMEM;
796 iput(inode);
797 } else
798 new->s_root->d_op = &gfs2_dops;
799
800 return error;
801}
802
803static int set_bdev_super(struct super_block *s, void *data)
804{
805 s->s_bdev = data;
806 s->s_dev = s->s_bdev->bd_dev;
807 return 0;
808}
809
810static int test_bdev_super(struct super_block *s, void *data)
811{
812 return s->s_bdev == data;
813}
814
815static struct super_block *get_gfs2_sb(const char *dev_name)
816{
817 struct kstat stat;
818 struct nameidata nd;
819 struct file_system_type *fstype;
820 struct super_block *sb = NULL, *s;
821 struct list_head *l;
822 int error;
823
824 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
825 if (error) {
826 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
827 dev_name, error);
828 goto out;
829 }
830 error = vfs_getattr(nd.mnt, nd.dentry, &stat);
831
832 fstype = get_fs_type("gfs2");
833 list_for_each(l, &fstype->fs_supers) {
834 s = list_entry(l, struct super_block, s_instances);
835 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
836 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
837 sb = s;
838 goto free_nd;
839 }
840 }
841
842 printk(KERN_WARNING "GFS2: Unrecognized block device or "
843 "mount point %s", dev_name);
844
845free_nd:
846 path_release(&nd);
847out:
848 return sb;
849}
850
851static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
852 const char *dev_name, void *data, struct vfsmount *mnt)
853{
854 int error = 0;
855 struct super_block *sb = NULL, *new;
856 struct gfs2_sbd *sdp;
857
858 sb = get_gfs2_sb(dev_name);
859 if (!sb) {
860 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
861 error = -ENOENT;
862 goto error;
863 }
864 sdp = sb->s_fs_info;
865 if (sdp->sd_vfs_meta) {
866 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
867 error = -EBUSY;
868 goto error;
869 }
870 mutex_lock(&sb->s_bdev->bd_mount_mutex);
871 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
872 mutex_unlock(&sb->s_bdev->bd_mount_mutex);
873 if (IS_ERR(new)) {
874 error = PTR_ERR(new);
875 goto error;
876 }
877 module_put(fs_type->owner);
878 new->s_flags = flags;
879 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
880 sb_set_blocksize(new, sb->s_blocksize);
881 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
882 if (error) {
883 up_write(&new->s_umount);
884 deactivate_super(new);
885 goto error;
886 }
887
888 new->s_flags |= MS_ACTIVE;
889
890 /* Grab a reference to the gfs2 mount point */
891 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
892 return simple_set_mnt(mnt, new);
893error:
894 return error;
895}
896
897static void gfs2_kill_sb(struct super_block *sb)
898{
899 kill_block_super(sb);
900}
901
902static void gfs2_kill_sb_meta(struct super_block *sb)
903{
904 struct gfs2_sbd *sdp = sb->s_fs_info;
905 generic_shutdown_super(sb);
906 sdp->sd_vfs_meta = NULL;
907 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
908}
909
910struct file_system_type gfs2_fs_type = {
911 .name = "gfs2",
912 .fs_flags = FS_REQUIRES_DEV,
913 .get_sb = gfs2_get_sb,
914 .kill_sb = gfs2_kill_sb,
915 .owner = THIS_MODULE,
916};
917
918struct file_system_type gfs2meta_fs_type = {
919 .name = "gfs2meta",
920 .fs_flags = FS_REQUIRES_DEV,
921 .get_sb = gfs2_get_sb_meta,
922 .kill_sb = gfs2_kill_sb_meta,
923 .owner = THIS_MODULE,
924};
925
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17
18#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23#include <asm/uaccess.h>
24
25#include "gfs2.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "quota.h"
38#include "rgrp.h"
39#include "trans.h"
40#include "util.h"
41
42/**
43 * gfs2_create - Create a file
44 * @dir: The directory in which to create the file
45 * @dentry: The dentry of the new file
46 * @mode: The mode of the new file
47 * @nd: The nameidata passed from the VFS (the open intent flags are used)
48 * Returns: errno
49 */
50
51static int gfs2_create(struct inode *dir, struct dentry *dentry,
52 int mode, struct nameidata *nd)
53{
54 struct gfs2_inode *dip = GFS2_I(dir);
55 struct gfs2_sbd *sdp = GFS2_SB(dir);
56 struct gfs2_holder ghs[2];
57 struct inode *inode;
58
59 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
60
61 for (;;) {
62 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
63 if (!IS_ERR(inode)) {
64 gfs2_trans_end(sdp);
65 if (dip->i_alloc.al_rgd)
66 gfs2_inplace_release(dip);
67 gfs2_quota_unlock(dip);
68 gfs2_alloc_put(dip);
69 gfs2_glock_dq_uninit_m(2, ghs);
70 mark_inode_dirty(inode);
71 break;
72 } else if (PTR_ERR(inode) != -EEXIST ||
73 (nd->intent.open.flags & O_EXCL)) {
74 gfs2_holder_uninit(ghs);
75 return PTR_ERR(inode);
76 }
77
78 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
79 if (inode) {
80 if (!IS_ERR(inode)) {
81 gfs2_holder_uninit(ghs);
82 break;
83 } else {
84 gfs2_holder_uninit(ghs);
85 return PTR_ERR(inode);
86 }
87 }
88 }
89
90 d_instantiate(dentry, inode);
91
92 return 0;
93}
94
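The loop in gfs2_create() handles a cluster-wide create/create race: if gfs2_createi() fails with -EEXIST and the caller did not ask for O_EXCL, the code falls back to a lookup; if the racing creator backed out again before the lookup completes, the whole sequence is retried. A self-contained user-space sketch of that retry shape, with create_once() and lookup_once() as hypothetical stand-ins for gfs2_createi() and gfs2_lookupi():

	#include <errno.h>
	#include <stdio.h>

	static int slot = -1;			/* -1: no inode, >= 0: "inode number" */

	static int create_once(void)
	{
		if (slot >= 0)
			return -EEXIST;		/* lost the race to another creator */
		slot = 42;
		return slot;
	}

	static int lookup_once(void)
	{
		return slot;			/* -1 if the racing creator backed out */
	}

	static int create_or_get(int excl)
	{
		for (;;) {
			int ret = create_once();
			if (ret >= 0)
				return ret;	/* we created it ourselves */
			if (ret != -EEXIST || excl)
				return ret;	/* O_EXCL makes the race fatal */
			ret = lookup_once();
			if (ret >= 0)
				return ret;	/* use the inode the racer made */
			/* creator backed out between create and lookup: retry */
		}
	}

	int main(void)
	{
		printf("inode %d\n", create_or_get(0));
		return 0;
	}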
95/**
96 * gfs2_lookup - Look up a filename in a directory and return its inode
97 * @dir: The directory inode
98 * @dentry: The dentry of the new inode
99 * @nd: passed from Linux VFS, ignored by us
100 *
101 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
102 *
103 * Returns: errno
104 */
105
106static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
107 struct nameidata *nd)
108{
109 struct inode *inode = NULL;
110
111 dentry->d_op = &gfs2_dops;
112
113 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
114 if (inode && IS_ERR(inode))
115 return ERR_PTR(PTR_ERR(inode));
116
117 if (inode)
118 return d_splice_alias(inode, dentry);
119 d_add(dentry, inode);
120
121 return NULL;
122}
123
124/**
125 * gfs2_link - Link to a file
126 * @old_dentry: The inode to link
127 * @dir: Add link to this directory
128 * @dentry: The name of the link
129 *
130 * Link the inode in "old_dentry" into the directory "dir" with the
131 * name in "dentry".
132 *
133 * Returns: errno
134 */
135
136static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
137 struct dentry *dentry)
138{
139 struct gfs2_inode *dip = GFS2_I(dir);
140 struct gfs2_sbd *sdp = GFS2_SB(dir);
141 struct inode *inode = old_dentry->d_inode;
142 struct gfs2_inode *ip = GFS2_I(inode);
143 struct gfs2_holder ghs[2];
144 int alloc_required;
145 int error;
146
147 if (S_ISDIR(ip->i_di.di_mode))
148 return -EPERM;
149
150 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
151 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
152
153 error = gfs2_glock_nq_m(2, ghs);
154 if (error)
155 goto out;
156
157 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
158 if (error)
159 goto out_gunlock;
160
161 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
162 switch (error) {
163 case -ENOENT:
164 break;
165 case 0:
166 error = -EEXIST;
167 default:
168 goto out_gunlock;
169 }
170
171 error = -EINVAL;
172 if (!dip->i_di.di_nlink)
173 goto out_gunlock;
174 error = -EFBIG;
175 if (dip->i_di.di_entries == (u32)-1)
176 goto out_gunlock;
177 error = -EPERM;
178 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
179 goto out_gunlock;
180 error = -EINVAL;
181 if (!ip->i_di.di_nlink)
182 goto out_gunlock;
183 error = -EMLINK;
184 if (ip->i_di.di_nlink == (u32)-1)
185 goto out_gunlock;
186
187 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
188 if (error < 0)
189 goto out_gunlock;
190 error = 0;
191
192 if (alloc_required) {
193 struct gfs2_alloc *al = gfs2_alloc_get(dip);
194
195 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
196 if (error)
197 goto out_alloc;
198
199 error = gfs2_quota_check(dip, dip->i_di.di_uid,
200 dip->i_di.di_gid);
201 if (error)
202 goto out_gunlock_q;
203
204 al->al_requested = sdp->sd_max_dirres;
205
206 error = gfs2_inplace_reserve(dip);
207 if (error)
208 goto out_gunlock_q;
209
210 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
211 al->al_rgd->rd_ri.ri_length +
212 2 * RES_DINODE + RES_STATFS +
213 RES_QUOTA, 0);
214 if (error)
215 goto out_ipres;
216 } else {
217 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
218 if (error)
219 goto out_ipres;
220 }
221
222 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
223 IF2DT(ip->i_di.di_mode));
224 if (error)
225 goto out_end_trans;
226
227 error = gfs2_change_nlink(ip, +1);
228
229out_end_trans:
230 gfs2_trans_end(sdp);
231out_ipres:
232 if (alloc_required)
233 gfs2_inplace_release(dip);
234out_gunlock_q:
235 if (alloc_required)
236 gfs2_quota_unlock(dip);
237out_alloc:
238 if (alloc_required)
239 gfs2_alloc_put(dip);
240out_gunlock:
241 gfs2_glock_dq_m(2, ghs);
242out:
243 gfs2_holder_uninit(ghs);
244 gfs2_holder_uninit(ghs + 1);
245 if (!error) {
246 atomic_inc(&inode->i_count);
247 d_instantiate(dentry, inode);
248 mark_inode_dirty(inode);
249 }
250 return error;
251}
252
253/**
254 * gfs2_unlink - Unlink a file
255 * @dir: The inode of the directory containing the file to unlink
256 * @dentry: The file itself
257 *
258 * Unlink a file. Remove the directory entry and decrement the link count.
259 *
260 * Returns: errno
261 */
262
263static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
264{
265 struct gfs2_inode *dip = GFS2_I(dir);
266 struct gfs2_sbd *sdp = GFS2_SB(dir);
267 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
268 struct gfs2_holder ghs[2];
269 int error;
270
271 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
272 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
273
274 error = gfs2_glock_nq_m(2, ghs);
275 if (error)
276 goto out;
277
278 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
279 if (error)
280 goto out_gunlock;
281
282 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
283 if (error)
284 goto out_gunlock;
285
286 error = gfs2_dir_del(dip, &dentry->d_name);
287 if (error)
288 goto out_end_trans;
289
290 error = gfs2_change_nlink(ip, -1);
291
292out_end_trans:
293 gfs2_trans_end(sdp);
294out_gunlock:
295 gfs2_glock_dq_m(2, ghs);
296out:
297 gfs2_holder_uninit(ghs);
298 gfs2_holder_uninit(ghs + 1);
299 return error;
300}
301
302/**
303 * gfs2_symlink - Create a symlink
304 * @dir: The directory to create the symlink in
305 * @dentry: The dentry to put the symlink in
306 * @symname: The thing which the link points to
307 *
308 * Returns: errno
309 */
310
311static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
312 const char *symname)
313{
314 struct gfs2_inode *dip = GFS2_I(dir), *ip;
315 struct gfs2_sbd *sdp = GFS2_SB(dir);
316 struct gfs2_holder ghs[2];
317 struct inode *inode;
318 struct buffer_head *dibh;
319 int size;
320 int error;
321
322 /* Must be stuffed with a null terminator for gfs2_follow_link() */
323 size = strlen(symname);
324 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
325 return -ENAMETOOLONG;
326
327 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
328
329 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
330 if (IS_ERR(inode)) {
331 gfs2_holder_uninit(ghs);
332 return PTR_ERR(inode);
333 }
334
335 ip = ghs[1].gh_gl->gl_object;
336
337 ip->i_di.di_size = size;
338
339 error = gfs2_meta_inode_buffer(ip, &dibh);
340
341 if (!gfs2_assert_withdraw(sdp, !error)) {
342 gfs2_dinode_out(&ip->i_di, dibh->b_data);
343 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
344 size);
345 brelse(dibh);
346 }
347
348 gfs2_trans_end(sdp);
349 if (dip->i_alloc.al_rgd)
350 gfs2_inplace_release(dip);
351 gfs2_quota_unlock(dip);
352 gfs2_alloc_put(dip);
353
354 gfs2_glock_dq_uninit_m(2, ghs);
355
356 d_instantiate(dentry, inode);
357 mark_inode_dirty(inode);
358
359 return 0;
360}
361
362/**
363 * gfs2_mkdir - Make a directory
364 * @dir: The parent directory of the new one
365 * @dentry: The dentry of the new directory
366 * @mode: The mode of the new directory
367 *
368 * Returns: errno
369 */
370
371static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
372{
373 struct gfs2_inode *dip = GFS2_I(dir), *ip;
374 struct gfs2_sbd *sdp = GFS2_SB(dir);
375 struct gfs2_holder ghs[2];
376 struct inode *inode;
377 struct buffer_head *dibh;
378 int error;
379
380 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
381
382 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
383 if (IS_ERR(inode)) {
384 gfs2_holder_uninit(ghs);
385 return PTR_ERR(inode);
386 }
387
388 ip = ghs[1].gh_gl->gl_object;
389
390 ip->i_di.di_nlink = 2;
391 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
392 ip->i_di.di_flags |= GFS2_DIF_JDATA;
393 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
394 ip->i_di.di_entries = 2;
395
396 error = gfs2_meta_inode_buffer(ip, &dibh);
397
398 if (!gfs2_assert_withdraw(sdp, !error)) {
399 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
400 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
401 struct qstr str;
402
403 gfs2_str2qstr(&str, ".");
404 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
405 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
406 dent->de_inum = di->di_num; /* already GFS2 endian */
407 dent->de_type = cpu_to_be16(DT_DIR);
408 di->di_entries = cpu_to_be32(1);
409
410 gfs2_str2qstr(&str, "..");
411 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
412 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
413
414 gfs2_inum_out(&dip->i_num, &dent->de_inum);
415 dent->de_type = cpu_to_be16(DT_DIR);
416
417 gfs2_dinode_out(&ip->i_di, di);
418
419 brelse(dibh);
420 }
421
422 error = gfs2_change_nlink(dip, +1);
423 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
424
425 gfs2_trans_end(sdp);
426 if (dip->i_alloc.al_rgd)
427 gfs2_inplace_release(dip);
428 gfs2_quota_unlock(dip);
429 gfs2_alloc_put(dip);
430
431 gfs2_glock_dq_uninit_m(2, ghs);
432
433 d_instantiate(dentry, inode);
434 mark_inode_dirty(inode);
435
436 return 0;
437}
438
439/**
440 * gfs2_rmdir - Remove a directory
441 * @dir: The parent directory of the directory to be removed
442 * @dentry: The dentry of the directory to remove
443 *
444 * Remove a directory. Call gfs2_rmdiri()
445 *
446 * Returns: errno
447 */
448
449static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
450{
451 struct gfs2_inode *dip = GFS2_I(dir);
452 struct gfs2_sbd *sdp = GFS2_SB(dir);
453 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
454 struct gfs2_holder ghs[2];
455 int error;
456
457 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
458 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
459
460 error = gfs2_glock_nq_m(2, ghs);
461 if (error)
462 goto out;
463
464 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
465 if (error)
466 goto out_gunlock;
467
468 if (ip->i_di.di_entries < 2) {
469 if (gfs2_consist_inode(ip))
470 gfs2_dinode_print(&ip->i_di);
471 error = -EIO;
472 goto out_gunlock;
473 }
474 if (ip->i_di.di_entries > 2) {
475 error = -ENOTEMPTY;
476 goto out_gunlock;
477 }
478
479 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
480 if (error)
481 goto out_gunlock;
482
483 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
484
485 gfs2_trans_end(sdp);
486
487out_gunlock:
488 gfs2_glock_dq_m(2, ghs);
489out:
490 gfs2_holder_uninit(ghs);
491 gfs2_holder_uninit(ghs + 1);
492 return error;
493}
494
495/**
496 * gfs2_mknod - Make a special file
497 * @dir: The directory in which the special file will reside
498 * @dentry: The dentry of the special file
499 * @mode: The mode of the special file
500 * @dev: The device specification of the special file
501 * Returns: errno
502 */
503
504static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
505 dev_t dev)
506{
507 struct gfs2_inode *dip = GFS2_I(dir), *ip;
508 struct gfs2_sbd *sdp = GFS2_SB(dir);
509 struct gfs2_holder ghs[2];
510 struct inode *inode;
511 struct buffer_head *dibh;
512 u32 major = 0, minor = 0;
513 int error;
514
515 switch (mode & S_IFMT) {
516 case S_IFBLK:
517 case S_IFCHR:
518 major = MAJOR(dev);
519 minor = MINOR(dev);
520 break;
521 case S_IFIFO:
522 case S_IFSOCK:
523 break;
524 default:
525 return -EOPNOTSUPP;
526	}
527
528 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
529
530 inode = gfs2_createi(ghs, &dentry->d_name, mode);
531 if (IS_ERR(inode)) {
532 gfs2_holder_uninit(ghs);
533 return PTR_ERR(inode);
534 }
535
536 ip = ghs[1].gh_gl->gl_object;
537
538 ip->i_di.di_major = major;
539 ip->i_di.di_minor = minor;
540
541 error = gfs2_meta_inode_buffer(ip, &dibh);
542
543 if (!gfs2_assert_withdraw(sdp, !error)) {
544 gfs2_dinode_out(&ip->i_di, dibh->b_data);
545 brelse(dibh);
546 }
547
548 gfs2_trans_end(sdp);
549 if (dip->i_alloc.al_rgd)
550 gfs2_inplace_release(dip);
551 gfs2_quota_unlock(dip);
552 gfs2_alloc_put(dip);
553
554 gfs2_glock_dq_uninit_m(2, ghs);
555
556 d_instantiate(dentry, inode);
557 mark_inode_dirty(inode);
558
559 return 0;
560}
561
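gfs2_mknod() only stores the MAJOR()/MINOR() halves of the device number for block and character nodes, so a node created with e.g. "mknod /mnt/gfs2/null c 1 3" ends up with di_major = 1 and di_minor = 3. A quick user-space check of the same split (the mknod path above is illustrative):

	#include <stdio.h>
	#include <sys/sysmacros.h>
	#include <sys/types.h>

	int main(void)
	{
		dev_t dev = makedev(1, 3);	/* the classic /dev/null numbers */

		printf("major %u minor %u\n", major(dev), minor(dev));
		return 0;
	}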
562/**
563 * gfs2_rename - Rename a file
564 * @odir: Parent directory of old file name
565 * @odentry: The old dentry of the file
566 * @ndir: Parent directory of new file name
567 * @ndentry: The new dentry of the file
568 *
569 * Returns: errno
570 */
571
572static int gfs2_rename(struct inode *odir, struct dentry *odentry,
573 struct inode *ndir, struct dentry *ndentry)
574{
575 struct gfs2_inode *odip = GFS2_I(odir);
576 struct gfs2_inode *ndip = GFS2_I(ndir);
577 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
578 struct gfs2_inode *nip = NULL;
579 struct gfs2_sbd *sdp = GFS2_SB(odir);
580 struct gfs2_holder ghs[4], r_gh;
581 unsigned int num_gh;
582 int dir_rename = 0;
583 int alloc_required;
584 unsigned int x;
585 int error;
586
587 if (ndentry->d_inode) {
588 nip = GFS2_I(ndentry->d_inode);
589 if (ip == nip)
590 return 0;
591 }
592
593	/* Make sure we aren't trying to move a directory into its subdir */
594
595 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
596 dir_rename = 1;
597
598 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
599 LM_ST_EXCLUSIVE, 0,
600 &r_gh);
601 if (error)
602 goto out;
603
604 error = gfs2_ok_to_move(ip, ndip);
605 if (error)
606 goto out_gunlock_r;
607 }
608
609 num_gh = 1;
610 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
611 if (odip != ndip) {
612 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
613 num_gh++;
614 }
615 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
616 num_gh++;
617
618 if (nip) {
619 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
620 num_gh++;
621 }
622
623 error = gfs2_glock_nq_m(num_gh, ghs);
624 if (error)
625 goto out_uninit;
626
627 /* Check out the old directory */
628
629 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
630 if (error)
631 goto out_gunlock;
632
633 /* Check out the new directory */
634
635 if (nip) {
636 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
637 if (error)
638 goto out_gunlock;
639
640 if (S_ISDIR(nip->i_di.di_mode)) {
641 if (nip->i_di.di_entries < 2) {
642 if (gfs2_consist_inode(nip))
643 gfs2_dinode_print(&nip->i_di);
644 error = -EIO;
645 goto out_gunlock;
646 }
647 if (nip->i_di.di_entries > 2) {
648 error = -ENOTEMPTY;
649 goto out_gunlock;
650 }
651 }
652 } else {
653 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
654 if (error)
655 goto out_gunlock;
656
657 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
658 switch (error) {
659 case -ENOENT:
660 error = 0;
661 break;
662 case 0:
663 error = -EEXIST;
664 default:
665 goto out_gunlock;
666		}
667
668 if (odip != ndip) {
669 if (!ndip->i_di.di_nlink) {
670 error = -EINVAL;
671 goto out_gunlock;
672 }
673 if (ndip->i_di.di_entries == (u32)-1) {
674 error = -EFBIG;
675 goto out_gunlock;
676 }
677 if (S_ISDIR(ip->i_di.di_mode) &&
678 ndip->i_di.di_nlink == (u32)-1) {
679 error = -EMLINK;
680 goto out_gunlock;
681 }
682 }
683 }
684
685 /* Check out the dir to be renamed */
686
687 if (dir_rename) {
688 error = permission(odentry->d_inode, MAY_WRITE, NULL);
689 if (error)
690 goto out_gunlock;
691 }
692
693 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
694 if (error < 0)
695 goto out_gunlock;
696 error = 0;
697
698 if (alloc_required) {
699 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
700
701 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
702 if (error)
703 goto out_alloc;
704
705 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
706 ndip->i_di.di_gid);
707 if (error)
708 goto out_gunlock_q;
709
710 al->al_requested = sdp->sd_max_dirres;
711
712 error = gfs2_inplace_reserve(ndip);
713 if (error)
714 goto out_gunlock_q;
715
716 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
717 al->al_rgd->rd_ri.ri_length +
718 4 * RES_DINODE + 4 * RES_LEAF +
719 RES_STATFS + RES_QUOTA, 0);
720 if (error)
721 goto out_ipreserv;
722 } else {
723 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
724 5 * RES_LEAF, 0);
725 if (error)
726 goto out_gunlock;
727 }
728
729 /* Remove the target file, if it exists */
730
731 if (nip) {
732 if (S_ISDIR(nip->i_di.di_mode))
733 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
734 else {
735 error = gfs2_dir_del(ndip, &ndentry->d_name);
736 if (error)
737 goto out_end_trans;
738 error = gfs2_change_nlink(nip, -1);
739 }
740 if (error)
741 goto out_end_trans;
742 }
743
744 if (dir_rename) {
745 struct qstr name;
746 gfs2_str2qstr(&name, "..");
747
748 error = gfs2_change_nlink(ndip, +1);
749 if (error)
750 goto out_end_trans;
751 error = gfs2_change_nlink(odip, -1);
752 if (error)
753 goto out_end_trans;
754
755 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
756 if (error)
757 goto out_end_trans;
758 } else {
759 struct buffer_head *dibh;
760 error = gfs2_meta_inode_buffer(ip, &dibh);
761 if (error)
762 goto out_end_trans;
763 ip->i_di.di_ctime = get_seconds();
764 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
765 gfs2_dinode_out(&ip->i_di, dibh->b_data);
766 brelse(dibh);
767 }
768
769 error = gfs2_dir_del(odip, &odentry->d_name);
770 if (error)
771 goto out_end_trans;
772
773 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
774 IF2DT(ip->i_di.di_mode));
775 if (error)
776 goto out_end_trans;
777
778out_end_trans:
779 gfs2_trans_end(sdp);
780out_ipreserv:
781 if (alloc_required)
782 gfs2_inplace_release(ndip);
783out_gunlock_q:
784 if (alloc_required)
785 gfs2_quota_unlock(ndip);
786out_alloc:
787 if (alloc_required)
788 gfs2_alloc_put(ndip);
789out_gunlock:
790 gfs2_glock_dq_m(num_gh, ghs);
791out_uninit:
792 for (x = 0; x < num_gh; x++)
793 gfs2_holder_uninit(ghs + x);
794out_gunlock_r:
795 if (dir_rename)
796 gfs2_glock_dq_uninit(&r_gh);
797out:
798 return error;
799}
800
801/**
802 * gfs2_readlink - Read the value of a symlink
803 * @dentry: the symlink
804 * @user_buf: the user buffer to read the symlink data into
805 * @user_size: the size of the user buffer
806 *
807 * Returns: errno
808 */
809
810static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
811 int user_size)
812{
813 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
814 char array[GFS2_FAST_NAME_SIZE], *buf = array;
815 unsigned int len = GFS2_FAST_NAME_SIZE;
816 int error;
817
818 error = gfs2_readlinki(ip, &buf, &len);
819 if (error)
820 return error;
821
822 if (user_size > len - 1)
823 user_size = len - 1;
824
825 if (copy_to_user(user_buf, buf, user_size))
826 error = -EFAULT;
827 else
828 error = user_size;
829
830 if (buf != array)
831 kfree(buf);
832
833 return error;
834}
835
836/**
837 * gfs2_follow_link - Follow a symbolic link
838 * @dentry: The dentry of the link
839 * @nd: Data that we pass to vfs_follow_link()
840 *
841 * This can handle symlinks of any size. It is optimised for symlinks
842 * under GFS2_FAST_NAME_SIZE.
843 *
844 * Returns: 0 on success or error code
845 */
846
847static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
848{
849 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
850 char array[GFS2_FAST_NAME_SIZE], *buf = array;
851 unsigned int len = GFS2_FAST_NAME_SIZE;
852 int error;
853
854 error = gfs2_readlinki(ip, &buf, &len);
855 if (!error) {
856 error = vfs_follow_link(nd, buf);
857 if (buf != array)
858 kfree(buf);
859 }
860
861 return ERR_PTR(error);
862}
863
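gfs2_readlink() and gfs2_follow_link() share a buffer idiom: start with a small on-stack array, let gfs2_readlinki() swap in a kmalloc'd buffer only when the target is longer than GFS2_FAST_NAME_SIZE, and free the buffer only if the pointer moved. A minimal user-space sketch of the same idiom, with fill_name() as a hypothetical stand-in for gfs2_readlinki():

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define FAST_NAME_SIZE 8		/* stands in for GFS2_FAST_NAME_SIZE */

	static int fill_name(char **buf, unsigned int *len, const char *src)
	{
		if (strlen(src) + 1 > *len) {
			*len = strlen(src) + 1;
			*buf = malloc(*len);	/* caller's array is too small */
			if (!*buf)
				return -1;
		}
		strcpy(*buf, src);
		return 0;
	}

	int main(void)
	{
		char array[FAST_NAME_SIZE], *buf = array;
		unsigned int len = FAST_NAME_SIZE;

		if (fill_name(&buf, &len, "a/rather/long/target") == 0)
			printf("%s\n", buf);
		if (buf != array)		/* free only what fill_name allocated */
			free(buf);
		return 0;
	}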
864/**
865 * gfs2_permission - Check permissions on an inode
866 * @inode: The inode to check
867 * @mask: The permissions requested (MAY_READ, MAY_WRITE and/or MAY_EXEC)
868 * @nd: passed from Linux VFS, ignored by us
869 *
870 * Returns: errno
871 */
872
873static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
874{
875 struct gfs2_inode *ip = GFS2_I(inode);
876 struct gfs2_holder i_gh;
877 int error;
878
879 if (ip->i_vn == ip->i_gl->gl_vn)
880 return generic_permission(inode, mask, gfs2_check_acl);
881
882 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
883 if (!error) {
884 error = generic_permission(inode, mask, gfs2_check_acl_locked);
885 gfs2_glock_dq_uninit(&i_gh);
886 }
887
888 return error;
889}
890
891static int setattr_size(struct inode *inode, struct iattr *attr)
892{
893 struct gfs2_inode *ip = GFS2_I(inode);
894 int error;
895
896 if (attr->ia_size != ip->i_di.di_size) {
897 error = vmtruncate(inode, attr->ia_size);
898 if (error)
899 return error;
900 }
901
902 error = gfs2_truncatei(ip, attr->ia_size);
903 if (error)
904 return error;
905
906	return 0;
907}
908
909static int setattr_chown(struct inode *inode, struct iattr *attr)
910{
911 struct gfs2_inode *ip = GFS2_I(inode);
912 struct gfs2_sbd *sdp = GFS2_SB(inode);
913 struct buffer_head *dibh;
914 u32 ouid, ogid, nuid, ngid;
915 int error;
916
917 ouid = ip->i_di.di_uid;
918 ogid = ip->i_di.di_gid;
919 nuid = attr->ia_uid;
920 ngid = attr->ia_gid;
921
922 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
923 ouid = nuid = NO_QUOTA_CHANGE;
924 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
925 ogid = ngid = NO_QUOTA_CHANGE;
926
927 gfs2_alloc_get(ip);
928
929 error = gfs2_quota_lock(ip, nuid, ngid);
930 if (error)
931 goto out_alloc;
932
933 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
934 error = gfs2_quota_check(ip, nuid, ngid);
935 if (error)
936 goto out_gunlock_q;
937 }
938
939 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
940 if (error)
941 goto out_gunlock_q;
942
943 error = gfs2_meta_inode_buffer(ip, &dibh);
944 if (error)
945 goto out_end_trans;
946
947 error = inode_setattr(inode, attr);
948 gfs2_assert_warn(sdp, !error);
949 gfs2_inode_attr_out(ip);
950
951 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
952 gfs2_dinode_out(&ip->i_di, dibh->b_data);
953 brelse(dibh);
954
955 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
956 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
957 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
958 }
959
960out_end_trans:
961 gfs2_trans_end(sdp);
962out_gunlock_q:
963 gfs2_quota_unlock(ip);
964out_alloc:
965 gfs2_alloc_put(ip);
966 return error;
967}
968
969/**
970 * gfs2_setattr - Change attributes on an inode
971 * @dentry: The dentry which is changing
972 * @attr: The structure describing the change
973 *
974 * The VFS layer wants to change one or more of an inodes attributes. Write
975 * that change out to disk.
976 *
977 * Returns: errno
978 */
979
980static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
981{
982 struct inode *inode = dentry->d_inode;
983 struct gfs2_inode *ip = GFS2_I(inode);
984 struct gfs2_holder i_gh;
985 int error;
986
987 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
988 if (error)
989 return error;
990
991 error = -EPERM;
992 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
993 goto out;
994
995 error = inode_change_ok(inode, attr);
996 if (error)
997 goto out;
998
999 if (attr->ia_valid & ATTR_SIZE)
1000 error = setattr_size(inode, attr);
1001 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1002 error = setattr_chown(inode, attr);
1003 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1004 error = gfs2_acl_chmod(ip, attr);
1005 else
1006 error = gfs2_setattr_simple(ip, attr);
1007
1008out:
1009 gfs2_glock_dq_uninit(&i_gh);
1010 if (!error)
1011 mark_inode_dirty(inode);
1012 return error;
1013}
1014
1015/**
1016 * gfs2_getattr - Read out an inode's attributes
1017 * @mnt: The vfsmount the inode is being accessed from
1018 * @dentry: The dentry to stat
1019 * @stat: The inode's stats
1020 *
1021 * Returns: errno
1022 */
1023
1024static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1025 struct kstat *stat)
1026{
1027 struct inode *inode = dentry->d_inode;
1028 struct gfs2_inode *ip = GFS2_I(inode);
1029 struct gfs2_holder gh;
1030 int error;
1031
1032 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1033 if (!error) {
1034 generic_fillattr(inode, stat);
1035 gfs2_glock_dq_uninit(&gh);
1036 }
1037
1038 return error;
1039}
1040
1041static int gfs2_setxattr(struct dentry *dentry, const char *name,
1042 const void *data, size_t size, int flags)
1043{
1044 struct inode *inode = dentry->d_inode;
1045 struct gfs2_ea_request er;
1046
1047 memset(&er, 0, sizeof(struct gfs2_ea_request));
1048 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1049 if (er.er_type == GFS2_EATYPE_UNUSED)
1050 return -EOPNOTSUPP;
1051 er.er_data = (char *)data;
1052 er.er_name_len = strlen(er.er_name);
1053 er.er_data_len = size;
1054 er.er_flags = flags;
1055
1056 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1057
1058 return gfs2_ea_set(GFS2_I(inode), &er);
1059}
1060
1061static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1062 void *data, size_t size)
1063{
1064 struct gfs2_ea_request er;
1065
1066 memset(&er, 0, sizeof(struct gfs2_ea_request));
1067 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1068 if (er.er_type == GFS2_EATYPE_UNUSED)
1069 return -EOPNOTSUPP;
1070 er.er_data = data;
1071 er.er_name_len = strlen(er.er_name);
1072 er.er_data_len = size;
1073
1074 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1075}
1076
1077static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1078{
1079 struct gfs2_ea_request er;
1080
1081 memset(&er, 0, sizeof(struct gfs2_ea_request));
1082 er.er_data = (size) ? buffer : NULL;
1083 er.er_data_len = size;
1084
1085 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1086}
1087
1088static int gfs2_removexattr(struct dentry *dentry, const char *name)
1089{
1090 struct gfs2_ea_request er;
1091
1092 memset(&er, 0, sizeof(struct gfs2_ea_request));
1093 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1094 if (er.er_type == GFS2_EATYPE_UNUSED)
1095 return -EOPNOTSUPP;
1096 er.er_name_len = strlen(er.er_name);
1097
1098 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1099}
1100
1101struct inode_operations gfs2_file_iops = {
1102 .permission = gfs2_permission,
1103 .setattr = gfs2_setattr,
1104 .getattr = gfs2_getattr,
1105 .setxattr = gfs2_setxattr,
1106 .getxattr = gfs2_getxattr,
1107 .listxattr = gfs2_listxattr,
1108 .removexattr = gfs2_removexattr,
1109};
1110
1111struct inode_operations gfs2_dev_iops = {
1112 .permission = gfs2_permission,
1113 .setattr = gfs2_setattr,
1114 .getattr = gfs2_getattr,
1115 .setxattr = gfs2_setxattr,
1116 .getxattr = gfs2_getxattr,
1117 .listxattr = gfs2_listxattr,
1118 .removexattr = gfs2_removexattr,
1119};
1120
1121struct inode_operations gfs2_dir_iops = {
1122 .create = gfs2_create,
1123 .lookup = gfs2_lookup,
1124 .link = gfs2_link,
1125 .unlink = gfs2_unlink,
1126 .symlink = gfs2_symlink,
1127 .mkdir = gfs2_mkdir,
1128 .rmdir = gfs2_rmdir,
1129 .mknod = gfs2_mknod,
1130 .rename = gfs2_rename,
1131 .permission = gfs2_permission,
1132 .setattr = gfs2_setattr,
1133 .getattr = gfs2_getattr,
1134 .setxattr = gfs2_setxattr,
1135 .getxattr = gfs2_getxattr,
1136 .listxattr = gfs2_listxattr,
1137 .removexattr = gfs2_removexattr,
1138};
1139
1140struct inode_operations gfs2_symlink_iops = {
1141 .readlink = gfs2_readlink,
1142 .follow_link = gfs2_follow_link,
1143 .permission = gfs2_permission,
1144 .setattr = gfs2_setattr,
1145 .getattr = gfs2_getattr,
1146 .setxattr = gfs2_setxattr,
1147 .getxattr = gfs2_getxattr,
1148 .listxattr = gfs2_listxattr,
1149 .removexattr = gfs2_removexattr,
1150};
1151
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct inode_operations gfs2_file_iops;
16extern struct inode_operations gfs2_dir_iops;
17extern struct inode_operations gfs2_symlink_iops;
18extern struct inode_operations gfs2_dev_iops;
19
20#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "lm.h"
29#include "log.h"
30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38#include "trans.h"
39#include "dir.h"
40#include "eattr.h"
41#include "bmap.h"
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54
55 /* Check this is a "normal" inode */
56 if (inode->i_private) {
57 if (current->flags & PF_MEMALLOC)
58 return 0;
59 if (sync)
60 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
61 }
62
63 return 0;
64}
65
66/**
67 * gfs2_put_super - Unmount the filesystem
68 * @sb: The VFS superblock
69 *
70 */
71
72static void gfs2_put_super(struct super_block *sb)
73{
74 struct gfs2_sbd *sdp = sb->s_fs_info;
75 int error;
76
77 if (!sdp)
78 return;
79
80 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
81 return; /* Nothing to do */
82
83 /* Unfreeze the filesystem, if we need to */
84
85 mutex_lock(&sdp->sd_freeze_lock);
86 if (sdp->sd_freeze_count)
87 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
88 mutex_unlock(&sdp->sd_freeze_lock);
89
90 kthread_stop(sdp->sd_quotad_process);
91 kthread_stop(sdp->sd_logd_process);
92 kthread_stop(sdp->sd_recoverd_process);
93 while (sdp->sd_glockd_num--)
94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
95 kthread_stop(sdp->sd_scand_process);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 error = gfs2_make_fs_ro(sdp);
99 if (error)
100 gfs2_io_error(sdp);
101 }
102 /* At this point, we're through modifying the disk */
103
104 /* Release stuff */
105
106 iput(sdp->sd_master_dir);
107 iput(sdp->sd_jindex);
108 iput(sdp->sd_inum_inode);
109 iput(sdp->sd_statfs_inode);
110 iput(sdp->sd_rindex);
111 iput(sdp->sd_quota_inode);
112
113 gfs2_glock_put(sdp->sd_rename_gl);
114 gfs2_glock_put(sdp->sd_trans_gl);
115
116 if (!sdp->sd_args.ar_spectator) {
117 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
118 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
119 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
120 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
121 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
122 iput(sdp->sd_ir_inode);
123 iput(sdp->sd_sc_inode);
124 iput(sdp->sd_qc_inode);
125 }
126
127 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
128 gfs2_clear_rgrpd(sdp);
129 gfs2_jindex_free(sdp);
130 /* Take apart glock structures and buffer lists */
131 gfs2_gl_hash_clear(sdp, WAIT);
132 /* Unmount the locking protocol */
133 gfs2_lm_unmount(sdp);
134
135 /* At this point, we're through participating in the lockspace */
136 gfs2_sys_fs_del(sdp);
137 kfree(sdp);
138}
139
140/**
141 * gfs2_write_super - disk commit all incore transactions
142 * @sb: the filesystem
143 *
144 * This function is called every time sync(2) is called.
145 * After this exits, all dirty buffers are synced.
146 */
147
148static void gfs2_write_super(struct super_block *sb)
149{
150 gfs2_log_flush(sb->s_fs_info, NULL);
151}
152
153/**
154 * gfs2_write_super_lockfs - prevent further writes to the filesystem
155 * @sb: the VFS structure for the filesystem
156 *
157 */
158
159static void gfs2_write_super_lockfs(struct super_block *sb)
160{
161 struct gfs2_sbd *sdp = sb->s_fs_info;
162 int error;
163
164 for (;;) {
165 error = gfs2_freeze_fs(sdp);
166 if (!error)
167 break;
168
169 switch (error) {
170 case -EBUSY:
171 fs_err(sdp, "waiting for recovery before freeze\n");
172 break;
173
174 default:
175 fs_err(sdp, "error freezing FS: %d\n", error);
176 break;
177 }
178
179 fs_err(sdp, "retrying...\n");
180 msleep(1000);
181 }
182}
183
184/**
185 * gfs2_unlockfs - reallow writes to the filesystem
186 * @sb: the VFS structure for the filesystem
187 *
188 */
189
190static void gfs2_unlockfs(struct super_block *sb)
191{
192 gfs2_unfreeze_fs(sb->s_fs_info);
193}
194
195/**
196 * gfs2_statfs - Gather and return stats about the filesystem
197 * @dentry: A dentry on the filesystem to be queried
198 * @buf: The kstatfs buffer to fill
199 *
200 * Returns: 0 on success or error code
201 */
202
203static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
204{
205 struct super_block *sb = dentry->d_inode->i_sb;
206 struct gfs2_sbd *sdp = sb->s_fs_info;
207 struct gfs2_statfs_change sc;
208 int error;
209
210 if (gfs2_tune_get(sdp, gt_statfs_slow))
211 error = gfs2_statfs_slow(sdp, &sc);
212 else
213 error = gfs2_statfs_i(sdp, &sc);
214
215 if (error)
216 return error;
217
218 buf->f_type = GFS2_MAGIC;
219 buf->f_bsize = sdp->sd_sb.sb_bsize;
220 buf->f_blocks = sc.sc_total;
221 buf->f_bfree = sc.sc_free;
222 buf->f_bavail = sc.sc_free;
223 buf->f_files = sc.sc_dinodes + sc.sc_free;
224 buf->f_ffree = sc.sc_free;
225 buf->f_namelen = GFS2_FNAMESIZE;
226
227 return 0;
228}
229
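What gfs2_statfs() fills in is visible from user space through statfs(2); a small sketch, assuming an illustrative mount point (GFS2_MAGIC, 0x01161970, is the value that shows up in f_type):

	#include <stdio.h>
	#include <sys/vfs.h>

	int main(void)
	{
		struct statfs st;

		if (statfs("/mnt/gfs2", &st) != 0) {	/* path is illustrative */
			perror("statfs");
			return 1;
		}
		printf("type=0x%lx bsize=%ld blocks=%ld free=%ld\n",
		       (long)st.f_type, (long)st.f_bsize,
		       (long)st.f_blocks, (long)st.f_bfree);
		return 0;
	}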
230/**
231 * gfs2_remount_fs - called when the FS is remounted
232 * @sb: the filesystem
233 * @flags: the remount flags
234 * @data: extra data passed in (not used right now)
235 *
236 * Returns: errno
237 */
238
239static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
240{
241 struct gfs2_sbd *sdp = sb->s_fs_info;
242 int error;
243
244 error = gfs2_mount_args(sdp, data, 1);
245 if (error)
246 return error;
247
248 if (sdp->sd_args.ar_spectator)
249 *flags |= MS_RDONLY;
250 else {
251 if (*flags & MS_RDONLY) {
252 if (!(sb->s_flags & MS_RDONLY))
253 error = gfs2_make_fs_ro(sdp);
254 } else if (!(*flags & MS_RDONLY) &&
255 (sb->s_flags & MS_RDONLY)) {
256 error = gfs2_make_fs_rw(sdp);
257 }
258 }
259
260 if (*flags & (MS_NOATIME | MS_NODIRATIME))
261 set_bit(SDF_NOATIME, &sdp->sd_flags);
262 else
263 clear_bit(SDF_NOATIME, &sdp->sd_flags);
264
265 /* Don't let the VFS update atimes. GFS2 handles this itself. */
266 *flags |= MS_NOATIME | MS_NODIRATIME;
267
268 return error;
269}
270
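In practice gfs2_remount_fs() is driven by remount requests, for example (mount point illustrative):

	mount -o remount,ro /mnt/gfs2	# transitions via gfs2_make_fs_ro()
	mount -o remount,rw /mnt/gfs2	# transitions via gfs2_make_fs_rw()

Whatever the caller asks for, the function forces MS_NOATIME | MS_NODIRATIME back on before returning, since GFS2 handles atime updates itself.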
271/**
272 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
273 * @inode: The VFS inode
274 *
275 */
276
277static void gfs2_clear_inode(struct inode *inode)
278{
279	/* This tells us it's a "real" inode and not one which only
280 * serves to contain an address space (see rgrp.c, meta_io.c)
281 * which therefore doesn't have its own glocks.
282 */
283 if (inode->i_private) {
284 struct gfs2_inode *ip = GFS2_I(inode);
285 gfs2_glock_inode_squish(inode);
286 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
287 ip->i_gl->gl_object = NULL;
288 gfs2_glock_schedule_for_reclaim(ip->i_gl);
289 gfs2_glock_put(ip->i_gl);
290 ip->i_gl = NULL;
291 if (ip->i_iopen_gh.gh_gl)
292 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
293 }
294}
295
296/**
297 * gfs2_show_options - Show mount options for /proc/mounts
298 * @s: seq_file structure
299 * @mnt: vfsmount
300 *
301 * Returns: 0 on success or error code
302 */
303
304static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
305{
306 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
307 struct gfs2_args *args = &sdp->sd_args;
308
309 if (args->ar_lockproto[0])
310 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
311 if (args->ar_locktable[0])
312 seq_printf(s, ",locktable=%s", args->ar_locktable);
313 if (args->ar_hostdata[0])
314 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
315 if (args->ar_spectator)
316 seq_printf(s, ",spectator");
317 if (args->ar_ignore_local_fs)
318 seq_printf(s, ",ignore_local_fs");
319 if (args->ar_localflocks)
320 seq_printf(s, ",localflocks");
321 if (args->ar_localcaching)
322 seq_printf(s, ",localcaching");
323 if (args->ar_debug)
324 seq_printf(s, ",debug");
325 if (args->ar_upgrade)
326 seq_printf(s, ",upgrade");
327 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
328 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
329 if (args->ar_posix_acl)
330 seq_printf(s, ",acl");
331 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
332 char *state;
333 switch (args->ar_quota) {
334 case GFS2_QUOTA_OFF:
335 state = "off";
336 break;
337 case GFS2_QUOTA_ACCOUNT:
338 state = "account";
339 break;
340 case GFS2_QUOTA_ON:
341 state = "on";
342 break;
343 default:
344 state = "unknown";
345 break;
346 }
347 seq_printf(s, ",quota=%s", state);
348 }
349 if (args->ar_suiddir)
350 seq_printf(s, ",suiddir");
351 if (args->ar_data != GFS2_DATA_DEFAULT) {
352 char *state;
353 switch (args->ar_data) {
354 case GFS2_DATA_WRITEBACK:
355 state = "writeback";
356 break;
357 case GFS2_DATA_ORDERED:
358 state = "ordered";
359 break;
360 default:
361 state = "unknown";
362 break;
363 }
364 seq_printf(s, ",data=%s", state);
365 }
366
367 return 0;
368}
369
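Given the handling above, the options column of a gfs2 entry in /proc/mounts is built from comma-prefixed fragments, and only non-default settings are emitted (a plain mount can therefore show an empty set). With illustrative values it might read:

	lockproto=lock_dlm,locktable=mycluster:fs1,acl,quota=on,data=writeback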
370/*
371 * We have to (at the moment) hold the inode's main lock to cover
372 * the gap between unlocking the shared lock on the iopen lock and
373 * taking the exclusive lock. I'd rather do a shared -> exclusive
374 * conversion on the iopen lock, but we can change that later. This
375 * is safe, just less efficient.
376 */
377static void gfs2_delete_inode(struct inode *inode)
378{
379 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
380 struct gfs2_inode *ip = GFS2_I(inode);
381 struct gfs2_holder gh;
382 int error;
383
384 if (!inode->i_private)
385 goto out;
386
387 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
388 if (unlikely(error)) {
389 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
390 goto out;
391 }
392
393 gfs2_glock_dq(&ip->i_iopen_gh);
394 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
395 error = gfs2_glock_nq(&ip->i_iopen_gh);
396 if (error)
397 goto out_uninit;
398
399 if (S_ISDIR(ip->i_di.di_mode) &&
400 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
401 error = gfs2_dir_exhash_dealloc(ip);
402 if (error)
403 goto out_unlock;
404 }
405
406 if (ip->i_di.di_eattr) {
407 error = gfs2_ea_dealloc(ip);
408 if (error)
409 goto out_unlock;
410 }
411
412 if (!gfs2_is_stuffed(ip)) {
413 error = gfs2_file_dealloc(ip);
414 if (error)
415 goto out_unlock;
416 }
417
418 error = gfs2_dinode_dealloc(ip);
419
420out_unlock:
421 gfs2_glock_dq(&ip->i_iopen_gh);
422out_uninit:
423 gfs2_holder_uninit(&ip->i_iopen_gh);
424 gfs2_glock_dq_uninit(&gh);
425 if (error)
426 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
427out:
428 truncate_inode_pages(&inode->i_data, 0);
429 clear_inode(inode);
430}
431
432
433
434static struct inode *gfs2_alloc_inode(struct super_block *sb)
435{
436 struct gfs2_sbd *sdp = sb->s_fs_info;
437 struct gfs2_inode *ip;
438
439 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
440 if (ip) {
441 ip->i_flags = 0;
442 ip->i_gl = NULL;
443 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
444 ip->i_last_pfault = jiffies;
445 }
446	return ip ? &ip->i_inode : NULL;
447}
448
449static void gfs2_destroy_inode(struct inode *inode)
450{
451 kmem_cache_free(gfs2_inode_cachep, inode);
452}
453
454struct super_operations gfs2_super_ops = {
455 .alloc_inode = gfs2_alloc_inode,
456 .destroy_inode = gfs2_destroy_inode,
457 .write_inode = gfs2_write_inode,
458 .delete_inode = gfs2_delete_inode,
459 .put_super = gfs2_put_super,
460 .write_super = gfs2_write_super,
461 .write_super_lockfs = gfs2_write_super_lockfs,
462 .unlockfs = gfs2_unlockfs,
463 .statfs = gfs2_statfs,
464 .remount_fs = gfs2_remount_fs,
465 .clear_inode = gfs2_clear_inode,
466 .show_options = gfs2_show_options,
467};
468
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "util.h"
30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type)
47{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct page *result;
50
51 set_bit(GIF_PAGED, &ip->i_flags);
52
53 result = filemap_nopage(area, address, type);
54
55 if (result && result != NOPAGE_OOM)
56 pfault_be_greedy(ip);
57
58 return result;
59}
60
61static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 unsigned long index = page->index;
65 u64 lblock = index << (PAGE_CACHE_SHIFT -
66 sdp->sd_sb.sb_bsize_shift);
67 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
68 struct gfs2_alloc *al;
69 unsigned int data_blocks, ind_blocks;
70 unsigned int x;
71 int error;
72
73 al = gfs2_alloc_get(ip);
74
75 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
76 if (error)
77 goto out;
78
79 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
80 if (error)
81 goto out_gunlock_q;
82
83 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
84
85 al->al_requested = data_blocks + ind_blocks;
86
87 error = gfs2_inplace_reserve(ip);
88 if (error)
89 goto out_gunlock_q;
90
91 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
92 ind_blocks + RES_DINODE +
93 RES_STATFS + RES_QUOTA, 0);
94 if (error)
95 goto out_ipres;
96
97 if (gfs2_is_stuffed(ip)) {
98 error = gfs2_unstuff_dinode(ip, NULL);
99 if (error)
100 goto out_trans;
101 }
102
103 for (x = 0; x < blocks; ) {
104 u64 dblock;
105 unsigned int extlen;
106 int new = 1;
107
108 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
109 if (error)
110 goto out_trans;
111
112 lblock += extlen;
113 x += extlen;
114 }
115
116 gfs2_assert_warn(sdp, al->al_alloced);
117
118out_trans:
119 gfs2_trans_end(sdp);
120out_ipres:
121 gfs2_inplace_release(ip);
122out_gunlock_q:
123 gfs2_quota_unlock(ip);
124out:
125 gfs2_alloc_put(ip);
126 return error;
127}
128
129static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
130 unsigned long address, int *type)
131{
132 struct file *file = area->vm_file;
133 struct gfs2_file *gf = file->private_data;
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 struct page *result = NULL;
137 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
138 area->vm_pgoff;
139 int alloc_required;
140 int error;
141
142 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
143 if (error)
144 return NULL;
145
146 set_bit(GIF_PAGED, &ip->i_flags);
147 set_bit(GIF_SW_PAGED, &ip->i_flags);
148
149 error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
150 PAGE_CACHE_SIZE, &alloc_required);
151 if (error)
152 goto out;
153
154 set_bit(GFF_EXLOCK, &gf->f_flags);
155 result = filemap_nopage(area, address, type);
156 clear_bit(GFF_EXLOCK, &gf->f_flags);
157 if (!result || result == NOPAGE_OOM)
158 goto out;
159
160 if (alloc_required) {
161 error = alloc_page_backing(ip, result);
162 if (error) {
163 page_cache_release(result);
164 result = NULL;
165 goto out;
166 }
167 set_page_dirty(result);
168 }
169
170 pfault_be_greedy(ip);
171out:
172 gfs2_glock_dq_uninit(&i_gh);
173
174 return result;
175}
176
177struct vm_operations_struct gfs2_vm_ops_private = {
178 .nopage = gfs2_private_nopage,
179};
180
181struct vm_operations_struct gfs2_vm_ops_sharewrite = {
182 .nopage = gfs2_sharewrite_nopage,
183};
184
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13#include <linux/mm.h>
14
15extern struct vm_operations_struct gfs2_vm_ops_private;
16extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
17
18#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..a3deae7416c9
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1228 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota
19 * check program to be run after node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with an infinite number of nodes and infinite bandwidth) to twice the
30 * user's limit. (In practice, the maximum overrun you see should be much
31 * less.) A "quota_scale" number greater than one makes quota syncs more
32 * frequent and reduces the maximum overrun. Numbers less than one (but
33 * greater than zero) make quota syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h>
49
50#include "gfs2.h"
51#include "incore.h"
52#include "bmap.h"
53#include "glock.h"
54#include "glops.h"
55#include "log.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
69static u64 qd2offset(struct gfs2_quota_data *qd)
70{
71 u64 offset;
72
73 offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
223
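slot_get() scans sd_quota_chunks pages of bitmap for the first byte with a clear bit, then the first clear bit within that byte; the slot number folds the chunk, byte and bit indices back together. A self-contained user-space sketch of the same scan, shrunk to two 4-byte chunks:

	#include <stdio.h>

	#define CHUNKS 2
	#define CHUNK  4		/* stands in for PAGE_SIZE */

	static unsigned char bitmap[CHUNKS][CHUNK] = {
		{ 0xFF, 0xFF, 0xFF, 0xFF },
		{ 0xFF, 0x0F, 0x00, 0x00 },
	};

	static int first_free_slot(void)
	{
		unsigned int c, o, b;

		for (c = 0; c < CHUNKS; c++)
			for (o = 0; o < CHUNK; o++) {
				unsigned char byte = bitmap[c][o];
				if (byte == 0xFF)
					continue;
				for (b = 0; b < 8; b++)
					if (!(byte & (1 << b)))
						break;
				bitmap[c][o] |= 1 << b;		/* claim it */
				return c * (8 * CHUNK) + o * 8 + b;
			}
		return -1;
	}

	int main(void)
	{
		printf("slot %d\n", first_free_slot());	/* 32 + 8 + 4 = 44 */
		return 0;
	}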
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 struct buffer_head *bh;
253 int error;
254 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
255
256 mutex_lock(&sdp->sd_quota_mutex);
257
258 if (qd->qd_bh_count++) {
259 mutex_unlock(&sdp->sd_quota_mutex);
260 return 0;
261 }
262
263 block = qd->qd_slot / sdp->sd_qc_per_block;
264	offset = qd->qd_slot % sdp->sd_qc_per_block;
265
266 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
267 error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
268 if (error)
269 goto fail;
270 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
271 if (error)
272 goto fail;
273 error = -EIO;
274 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
275 goto fail_brelse;
276
277 qd->qd_bh = bh;
278 qd->qd_bh_qc = (struct gfs2_quota_change *)
279 (bh->b_data + sizeof(struct gfs2_meta_header) +
280 offset * sizeof(struct gfs2_quota_change));
281
282	mutex_unlock(&sdp->sd_quota_mutex);
283
284 return 0;
285
286fail_brelse:
287 brelse(bh);
288fail:
289 qd->qd_bh_count--;
290 mutex_unlock(&sdp->sd_quota_mutex);
291 return error;
292}
293
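bh_get() turns a quota slot number into a (block, offset) pair with a simple div/mod by sd_qc_per_block, i.e. the number of gfs2_quota_change records that fit in one quota-change block. The per-block count is not derived in this hunk; assuming, purely for illustration, 4096-byte blocks, a 24-byte metadata header and 16-byte records, it would be (4096 - 24) / 16 = 254:

	#include <stdio.h>

	int main(void)
	{
		unsigned int qc_per_block = (4096 - 24) / 16;	/* 254, illustrative */
		unsigned int slot = 300;

		printf("block %u offset %u\n",
		       slot / qc_per_block, slot % qc_per_block);	/* block 1, offset 46 */
		return 0;
	}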
294static void bh_put(struct gfs2_quota_data *qd)
295{
296 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
297
298 mutex_lock(&sdp->sd_quota_mutex);
299 gfs2_assert(sdp, qd->qd_bh_count);
300 if (!--qd->qd_bh_count) {
301 brelse(qd->qd_bh);
302 qd->qd_bh = NULL;
303 qd->qd_bh_qc = NULL;
304 }
305 mutex_unlock(&sdp->sd_quota_mutex);
306}
307
308static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
309{
310 struct gfs2_quota_data *qd = NULL;
311 int error;
312 int found = 0;
313
314 *qdp = NULL;
315
316 if (sdp->sd_vfs->s_flags & MS_RDONLY)
317 return 0;
318
319 spin_lock(&sdp->sd_quota_spin);
320
321 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
322 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
323 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
324 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
325 continue;
326
327 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
328
329 set_bit(QDF_LOCKED, &qd->qd_flags);
330 gfs2_assert_warn(sdp, qd->qd_count);
331 qd->qd_count++;
332 qd->qd_change_sync = qd->qd_change;
333 gfs2_assert_warn(sdp, qd->qd_slot_count);
334 qd->qd_slot_count++;
335 found = 1;
336
337 break;
338 }
339
340 if (!found)
341 qd = NULL;
342
343 spin_unlock(&sdp->sd_quota_spin);
344
345 if (qd) {
346 gfs2_assert_warn(sdp, qd->qd_change_sync);
347 error = bh_get(qd);
348 if (error) {
349 clear_bit(QDF_LOCKED, &qd->qd_flags);
350 slot_put(qd);
351 qd_put(qd);
352 return error;
353 }
354 }
355
356 *qdp = qd;
357
358 return 0;
359}
360
361static int qd_trylock(struct gfs2_quota_data *qd)
362{
363 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
364
365 if (sdp->sd_vfs->s_flags & MS_RDONLY)
366 return 0;
367
368 spin_lock(&sdp->sd_quota_spin);
369
370 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
371 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
372 spin_unlock(&sdp->sd_quota_spin);
373 return 0;
374 }
375
376 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
377
378 set_bit(QDF_LOCKED, &qd->qd_flags);
379 gfs2_assert_warn(sdp, qd->qd_count);
380 qd->qd_count++;
381 qd->qd_change_sync = qd->qd_change;
382 gfs2_assert_warn(sdp, qd->qd_slot_count);
383 qd->qd_slot_count++;
384
385 spin_unlock(&sdp->sd_quota_spin);
386
387 gfs2_assert_warn(sdp, qd->qd_change_sync);
388 if (bh_get(qd)) {
389 clear_bit(QDF_LOCKED, &qd->qd_flags);
390 slot_put(qd);
391 qd_put(qd);
392 return 0;
393 }
394
395 return 1;
396}
397
398static void qd_unlock(struct gfs2_quota_data *qd)
399{
400 gfs2_assert_warn(qd->qd_gl->gl_sbd,
401 test_bit(QDF_LOCKED, &qd->qd_flags));
402 clear_bit(QDF_LOCKED, &qd->qd_flags);
403 bh_put(qd);
404 slot_put(qd);
405 qd_put(qd);
406}
407
408static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
409 struct gfs2_quota_data **qdp)
410{
411 int error;
412
413 error = qd_get(sdp, user, id, create, qdp);
414 if (error)
415 return error;
416
417 error = slot_get(*qdp);
418 if (error)
419 goto fail;
420
421 error = bh_get(*qdp);
422 if (error)
423 goto fail_slot;
424
425 return 0;
426
427fail_slot:
428 slot_put(*qdp);
429fail:
430 qd_put(*qdp);
431 return error;
432}
433
434static void qdsb_put(struct gfs2_quota_data *qd)
435{
436 bh_put(qd);
437 slot_put(qd);
438 qd_put(qd);
439}
440
441int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
442{
443 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
444 struct gfs2_alloc *al = &ip->i_alloc;
445 struct gfs2_quota_data **qd = al->al_qd;
446 int error;
447
448 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
449 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
450 return -EIO;
451
452 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
453 return 0;
454
455 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
456 if (error)
457 goto out;
458 al->al_qd_num++;
459 qd++;
460
461 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
462 if (error)
463 goto out;
464 al->al_qd_num++;
465 qd++;
466
467 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
468 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
469 if (error)
470 goto out;
471 al->al_qd_num++;
472 qd++;
473 }
474
475 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
476 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
477 if (error)
478 goto out;
479 al->al_qd_num++;
480 qd++;
481 }
482
483out:
484 if (error)
485 gfs2_quota_unhold(ip);
486 return error;
487}
488
489void gfs2_quota_unhold(struct gfs2_inode *ip)
490{
491 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
492 struct gfs2_alloc *al = &ip->i_alloc;
493 unsigned int x;
494
495 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
496
497 for (x = 0; x < al->al_qd_num; x++) {
498 qdsb_put(al->al_qd[x]);
499 al->al_qd[x] = NULL;
500 }
501 al->al_qd_num = 0;
502}
503
504static int sort_qd(const void *a, const void *b)
505{
506 const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
507 const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
508
509 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
510 !test_bit(QDF_USER, &qd_b->qd_flags)) {
511 if (test_bit(QDF_USER, &qd_a->qd_flags))
512 return -1;
513 else
514 return 1;
515 }
516 if (qd_a->qd_id < qd_b->qd_id)
517 return -1;
518 if (qd_a->qd_id > qd_b->qd_id)
519 return 1;
520
521 return 0;
522}
523
524static void do_qc(struct gfs2_quota_data *qd, s64 change)
525{
526 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
527 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
528 struct gfs2_quota_change *qc = qd->qd_bh_qc;
529 s64 x;
530
531 mutex_lock(&sdp->sd_quota_mutex);
532 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
533
534 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
535 qc->qc_change = 0;
536 qc->qc_flags = 0;
537 if (test_bit(QDF_USER, &qd->qd_flags))
538 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
539 qc->qc_id = cpu_to_be32(qd->qd_id);
540 }
541
542 x = qc->qc_change;
543 x = be64_to_cpu(x) + change;
544 qc->qc_change = cpu_to_be64(x);
545
546 spin_lock(&sdp->sd_quota_spin);
547 qd->qd_change = x;
548 spin_unlock(&sdp->sd_quota_spin);
549
550 if (!x) {
551 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
552 clear_bit(QDF_CHANGE, &qd->qd_flags);
553 qc->qc_flags = 0;
554 qc->qc_id = 0;
555 slot_put(qd);
556 qd_put(qd);
557 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
558 qd_hold(qd);
559 slot_hold(qd);
560 }
561
562 mutex_unlock(&sdp->sd_quota_mutex);
563}
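do_qc() above keeps the object's lifetime in lockstep with its on-disk change counter: the first transition away from zero sets QDF_CHANGE and takes an extra reference plus a slot hold, and the transition back to zero drops both, so a gfs2_quota_data stays pinned exactly while it carries an unsynced change. A loose sketch of that pin-while-dirty pattern; the struct and helper below are hypothetical, not GFS2 code.

/* Pin an object while its pending-change counter is non-zero; the
 * transitions here mirror the hold/put pairing in do_qc(). Locking
 * is omitted for brevity. */
struct pinned_counter {
        long long pending;      /* accumulated unsynced change */
        int refs;               /* holds pinning the object */
};

static void apply_change(struct pinned_counter *pc, long long change)
{
        long long old = pc->pending;

        pc->pending += change;
        if (old == 0 && pc->pending != 0)
                pc->refs++;             /* became dirty: pin */
        else if (old != 0 && pc->pending == 0)
                pc->refs--;             /* clean again: unpin */
}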
564
565/**
566 * gfs2_adjust_quota
567 *
568 * This function was mostly borrowed from gfs2_block_truncate_page which was
569 * in turn mostly borrowed from ext3
570 */
571static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
572 s64 change, struct gfs2_quota_data *qd)
573{
574 struct inode *inode = &ip->i_inode;
575 struct address_space *mapping = inode->i_mapping;
576 unsigned long index = loc >> PAGE_CACHE_SHIFT;
577 unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
578 unsigned blocksize, iblock, pos;
579 struct buffer_head *bh;
580 struct page *page;
581 void *kaddr;
582 __be64 *ptr;
583 s64 value;
584 int err = -EIO;
585
586 page = grab_cache_page(mapping, index);
587 if (!page)
588 return -ENOMEM;
589
590 blocksize = inode->i_sb->s_blocksize;
591 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
592
593 if (!page_has_buffers(page))
594 create_empty_buffers(page, blocksize, 0);
595
596 bh = page_buffers(page);
597 pos = blocksize;
598 while (offset >= pos) {
599 bh = bh->b_this_page;
600 iblock++;
601 pos += blocksize;
602 }
603
604 if (!buffer_mapped(bh)) {
605 gfs2_get_block(inode, iblock, bh, 1);
606 if (!buffer_mapped(bh))
607 goto unlock;
608 }
609
610 if (PageUptodate(page))
611 set_buffer_uptodate(bh);
612
613 if (!buffer_uptodate(bh)) {
614 ll_rw_block(READ_META, 1, &bh);
615 wait_on_buffer(bh);
616 if (!buffer_uptodate(bh))
617 goto unlock;
618 }
619
620 gfs2_trans_add_bh(ip->i_gl, bh, 0);
621
622 kaddr = kmap_atomic(page, KM_USER0);
623 ptr = kaddr + offset;
624 value = (s64)be64_to_cpu(*ptr) + change;
625 *ptr = cpu_to_be64(value);
626 flush_dcache_page(page);
627 kunmap_atomic(kaddr, KM_USER0);
628 err = 0;
629 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
630 qd->qd_qb.qb_value = cpu_to_be64(value);
631unlock:
632 unlock_page(page);
633 page_cache_release(page);
634 return err;
635}
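The page arithmetic in gfs2_adjust_quota() splits the quota-file offset into a page index (loc >> PAGE_CACHE_SHIFT) and an in-page byte offset (loc & (PAGE_CACHE_SIZE - 1)), then walks the page's buffer ring to the buffer covering that offset. A stand-alone sketch of the same math, assuming 4096-byte pages and 1024-byte blocks; every name here is hypothetical. The kernel function finds iblock incrementally while walking b_this_page, which is equivalent to the closed form below.

#include <stdio.h>

int main(void)
{
        const unsigned int page_shift = 12;     /* 4096-byte pages */
        const unsigned int block_bits = 10;     /* 1024-byte blocks */
        unsigned long long loc = 0x3456;        /* example file offset */

        unsigned long index = loc >> page_shift;                /* page 3 */
        unsigned int offset = loc & ((1u << page_shift) - 1);   /* 0x456 */
        unsigned long iblock = (index << (page_shift - block_bits)) +
                               (offset >> block_bits);          /* block 13 */

        printf("page %lu, in-page offset 0x%x, logical block %lu\n",
               index, offset, iblock);
        return 0;
}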
636
637static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
638{
639 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
640 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
641 unsigned int data_blocks, ind_blocks;
642 struct gfs2_holder *ghs, i_gh;
643 unsigned int qx, x;
644 struct gfs2_quota_data *qd;
645 loff_t offset;
646 unsigned int nalloc = 0;
647 struct gfs2_alloc *al = NULL;
648 int error;
649
650 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
651 &data_blocks, &ind_blocks);
652
653 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
654 if (!ghs)
655 return -ENOMEM;
656
657 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
658 for (qx = 0; qx < num_qd; qx++) {
659 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
660 LM_ST_EXCLUSIVE,
661 GL_NOCACHE, &ghs[qx]);
662 if (error)
663 goto out;
664 }
665
666 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
667 if (error)
668 goto out;
669
670 for (x = 0; x < num_qd; x++) {
671 int alloc_required;
672
673 offset = qd2offset(qda[x]);
674 error = gfs2_write_alloc_required(ip, offset,
675 sizeof(struct gfs2_quota),
676 &alloc_required);
677 if (error)
678 goto out_gunlock;
679 if (alloc_required)
680 nalloc++;
681 }
682
683 if (nalloc) {
684 al = gfs2_alloc_get(ip);
685
686 al->al_requested = nalloc * (data_blocks + ind_blocks);
687
688 error = gfs2_inplace_reserve(ip);
689 if (error)
690 goto out_alloc;
691
692 error = gfs2_trans_begin(sdp,
693 al->al_rgd->rd_ri.ri_length +
694 num_qd * data_blocks +
695 nalloc * ind_blocks +
696 RES_DINODE + num_qd +
697 RES_STATFS, 0);
698 if (error)
699 goto out_ipres;
700 } else {
701 error = gfs2_trans_begin(sdp,
702 num_qd * data_blocks +
703 RES_DINODE + num_qd, 0);
704 if (error)
705 goto out_gunlock;
706 }
707
708 for (x = 0; x < num_qd; x++) {
709 qd = qda[x];
710 offset = qd2offset(qd);
711 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
712 (struct gfs2_quota_data *)
713 qd->qd_gl->gl_lvb);
714 if (error)
715 goto out_end_trans;
716
717 do_qc(qd, -qd->qd_change_sync);
718 }
719
720 error = 0;
721
722out_end_trans:
723 gfs2_trans_end(sdp);
724out_ipres:
725 if (nalloc)
726 gfs2_inplace_release(ip);
727out_alloc:
728 if (nalloc)
729 gfs2_alloc_put(ip);
730out_gunlock:
731 gfs2_glock_dq_uninit(&i_gh);
732out:
733 while (qx--)
734 gfs2_glock_dq_uninit(&ghs[qx]);
735 kfree(ghs);
736 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
737 return error;
738}
739
740static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
741 struct gfs2_holder *q_gh)
742{
743 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
744 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
745 struct gfs2_holder i_gh;
746 struct gfs2_quota q;
747 char buf[sizeof(struct gfs2_quota)];
748 struct file_ra_state ra_state;
749 int error;
750 struct gfs2_quota_lvb *qlvb;
751
752 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
753restart:
754 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
755 if (error)
756 return error;
757
758 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
759
760 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
761 loff_t pos;
762 gfs2_glock_dq_uninit(q_gh);
763 error = gfs2_glock_nq_init(qd->qd_gl,
764 LM_ST_EXCLUSIVE, GL_NOCACHE,
765 q_gh);
766 if (error)
767 return error;
768
769 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
770 if (error)
771 goto fail;
772
773 memset(buf, 0, sizeof(struct gfs2_quota));
774 pos = qd2offset(qd);
775 error = gfs2_internal_read(ip, &ra_state, buf,
776 &pos, sizeof(struct gfs2_quota));
777 if (error < 0)
778 goto fail_gunlock;
779
780 gfs2_glock_dq_uninit(&i_gh);
781
782
783 gfs2_quota_in(&q, buf);
784 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
785 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
786 qlvb->__pad = 0;
787 qlvb->qb_limit = cpu_to_be64(q.qu_limit);
788 qlvb->qb_warn = cpu_to_be64(q.qu_warn);
789 qlvb->qb_value = cpu_to_be64(q.qu_value);
790 qd->qd_qb = *qlvb;
791
792 if (gfs2_glock_is_blocking(qd->qd_gl)) {
793 gfs2_glock_dq_uninit(q_gh);
794 force_refresh = 0;
795 goto restart;
796 }
797 }
798
799 return 0;
800
801fail_gunlock:
802 gfs2_glock_dq_uninit(&i_gh);
803fail:
804 gfs2_glock_dq_uninit(q_gh);
805 return error;
806}
807
808int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
809{
810 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
811 struct gfs2_alloc *al = &ip->i_alloc;
812 unsigned int x;
813 int error = 0;
814
815 gfs2_quota_hold(ip, uid, gid);
816
817 if (capable(CAP_SYS_RESOURCE) ||
818 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
819 return 0;
820
821 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
822 sort_qd, NULL);
823
824 for (x = 0; x < al->al_qd_num; x++) {
825 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
826 if (error)
827 break;
828 }
829
830 if (!error)
831 set_bit(GIF_QD_LOCKED, &ip->i_flags);
832 else {
833 while (x--)
834 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
835 gfs2_quota_unhold(ip);
836 }
837
838 return error;
839}
840
841static int need_sync(struct gfs2_quota_data *qd)
842{
843 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
844 struct gfs2_tune *gt = &sdp->sd_tune;
845 s64 value;
846 unsigned int num, den;
847 int do_sync = 1;
848
849 if (!qd->qd_qb.qb_limit)
850 return 0;
851
852 spin_lock(&sdp->sd_quota_spin);
853 value = qd->qd_change;
854 spin_unlock(&sdp->sd_quota_spin);
855
856 spin_lock(&gt->gt_spin);
857 num = gt->gt_quota_scale_num;
858 den = gt->gt_quota_scale_den;
859 spin_unlock(&gt->gt_spin);
860
861 if (value < 0)
862 do_sync = 0;
863 else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
864 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
865 do_sync = 0;
866 else {
867 value *= gfs2_jindex_size(sdp) * num;
868 do_div(value, den);
869 value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
870 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
871 do_sync = 0;
872 }
873
874 return do_sync;
875}
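need_sync() above is a heuristic, not an exact test: a negative local delta never forces a sync, a quota already at or past its limit gains nothing from syncing, and otherwise the local delta is scaled by the journal count (every node may be accumulating a similar unsynced change) and by the gt_quota_scale_num/den tunable before being compared with the limit. A worked example with made-up numbers:

/* Illustrative numbers only: 100 blocks changed locally, 4 journals,
 * scale 1/1, last synced value 9700 against a limit of 10000.
 * Projected usage 100 * 4 * 1 / 1 + 9700 = 10100 >= 10000, so the
 * change is worth pushing to the master quota file now. */
static int need_sync_example(void)
{
        long long local_change = 100;   /* qd->qd_change */
        long long journals = 4;         /* gfs2_jindex_size(sdp) */
        long long num = 1, den = 1;     /* gt_quota_scale_num/_den */
        long long lvb_value = 9700;     /* be64_to_cpu(qb_value) */
        long long limit = 10000;        /* be64_to_cpu(qb_limit) */

        long long projected = local_change * journals * num / den + lvb_value;

        return projected >= limit;      /* 1: sync now */
}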
876
877void gfs2_quota_unlock(struct gfs2_inode *ip)
878{
879 struct gfs2_alloc *al = &ip->i_alloc;
880 struct gfs2_quota_data *qda[4];
881 unsigned int count = 0;
882 unsigned int x;
883
884 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
885 goto out;
886
887 for (x = 0; x < al->al_qd_num; x++) {
888 struct gfs2_quota_data *qd;
889 int sync;
890
891 qd = al->al_qd[x];
892 sync = need_sync(qd);
893
894 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
895
896 if (sync && qd_trylock(qd))
897 qda[count++] = qd;
898 }
899
900 if (count) {
901 do_sync(count, qda);
902 for (x = 0; x < count; x++)
903 qd_unlock(qda[x]);
904 }
905
906out:
907 gfs2_quota_unhold(ip);
908}
909
910#define MAX_LINE 256
911
912static int print_message(struct gfs2_quota_data *qd, char *type)
913{
914 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
915
916 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
917 sdp->sd_fsname, type,
918 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
919 qd->qd_id);
920
921 return 0;
922}
923
924int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
925{
926 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
927 struct gfs2_alloc *al = &ip->i_alloc;
928 struct gfs2_quota_data *qd;
929 s64 value;
930 unsigned int x;
931 int error = 0;
932
933 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
934 return 0;
935
936 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
937 return 0;
938
939 for (x = 0; x < al->al_qd_num; x++) {
940 qd = al->al_qd[x];
941
942 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
943 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
944 continue;
945
946 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
947 spin_lock(&sdp->sd_quota_spin);
948 value += qd->qd_change;
949 spin_unlock(&sdp->sd_quota_spin);
950
951 if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
952 print_message(qd, "exceeded");
953 error = -EDQUOT;
954 break;
955 } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
956 (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
957 time_after_eq(jiffies, qd->qd_last_warn +
958 gfs2_tune_get(sdp,
959 gt_quota_warn_period) * HZ)) {
960 error = print_message(qd, "warning");
961 qd->qd_last_warn = jiffies;
962 }
963 }
964
965 return error;
966}
967
968void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
969 u32 uid, u32 gid)
970{
971 struct gfs2_alloc *al = &ip->i_alloc;
972 struct gfs2_quota_data *qd;
973 unsigned int x;
974 unsigned int found = 0;
975
976 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
977 return;
978 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
979 return;
980
981 for (x = 0; x < al->al_qd_num; x++) {
982 qd = al->al_qd[x];
983
984 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
985 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
986 do_qc(qd, change);
987 found++;
988 }
989 }
990}
991
992int gfs2_quota_sync(struct gfs2_sbd *sdp)
993{
994 struct gfs2_quota_data **qda;
995 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
996 unsigned int num_qd;
997 unsigned int x;
998 int error = 0;
999
1000 sdp->sd_quota_sync_gen++;
1001
1002 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1003 if (!qda)
1004 return -ENOMEM;
1005
1006 do {
1007 num_qd = 0;
1008
1009 for (;;) {
1010 error = qd_fish(sdp, qda + num_qd);
1011 if (error || !qda[num_qd])
1012 break;
1013 if (++num_qd == max_qd)
1014 break;
1015 }
1016
1017 if (num_qd) {
1018 if (!error)
1019 error = do_sync(num_qd, qda);
1020 if (!error)
1021 for (x = 0; x < num_qd; x++)
1022 qda[x]->qd_sync_gen =
1023 sdp->sd_quota_sync_gen;
1024
1025 for (x = 0; x < num_qd; x++)
1026 qd_unlock(qda[x]);
1027 }
1028 } while (!error && num_qd == max_qd);
1029
1030 kfree(qda);
1031
1032 return error;
1033}
1034
1035int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1036{
1037 struct gfs2_quota_data *qd;
1038 struct gfs2_holder q_gh;
1039 int error;
1040
1041 error = qd_get(sdp, user, id, CREATE, &qd);
1042 if (error)
1043 return error;
1044
1045 error = do_glock(qd, FORCE, &q_gh);
1046 if (!error)
1047 gfs2_glock_dq_uninit(&q_gh);
1048
1049 qd_put(qd);
1050
1051 return error;
1052}
1053
1054int gfs2_quota_init(struct gfs2_sbd *sdp)
1055{
1056 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1057 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1058 unsigned int x, slot = 0;
1059 unsigned int found = 0;
1060 u64 dblock;
1061 u32 extlen = 0;
1062 int error;
1063
1064 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
1065 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1066 gfs2_consist_inode(ip);
1067 return -EIO;
1068 }
1069 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1070 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1071
1072 error = -ENOMEM;
1073
1074 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1075 sizeof(unsigned char *), GFP_KERNEL);
1076 if (!sdp->sd_quota_bitmap)
1077 return error;
1078
1079 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1080 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1081 if (!sdp->sd_quota_bitmap[x])
1082 goto fail;
1083 }
1084
1085 for (x = 0; x < blocks; x++) {
1086 struct buffer_head *bh;
1087 unsigned int y;
1088
1089 if (!extlen) {
1090 int new = 0;
1091 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1092 if (error)
1093 goto fail;
1094 }
1095 error = -EIO;
1096 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
1097 if (!bh)
1098 goto fail;
1099 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1100 brelse(bh);
1101 goto fail;
1102 }
1103
1104 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1105 y++, slot++) {
1106 struct gfs2_quota_change qc;
1107 struct gfs2_quota_data *qd;
1108
1109 gfs2_quota_change_in(&qc, bh->b_data +
1110 sizeof(struct gfs2_meta_header) +
1111 y * sizeof(struct gfs2_quota_change));
1112 if (!qc.qc_change)
1113 continue;
1114
1115 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1116 qc.qc_id, &qd);
1117 if (error) {
1118 brelse(bh);
1119 goto fail;
1120 }
1121
1122 set_bit(QDF_CHANGE, &qd->qd_flags);
1123 qd->qd_change = qc.qc_change;
1124 qd->qd_slot = slot;
1125 qd->qd_slot_count = 1;
1126 qd->qd_last_touched = jiffies;
1127
1128 spin_lock(&sdp->sd_quota_spin);
1129 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1130 list_add(&qd->qd_list, &sdp->sd_quota_list);
1131 atomic_inc(&sdp->sd_quota_count);
1132 spin_unlock(&sdp->sd_quota_spin);
1133
1134 found++;
1135 }
1136
1137 brelse(bh);
1138 dblock++;
1139 extlen--;
1140 }
1141
1142 if (found)
1143 fs_info(sdp, "found %u quota changes\n", found);
1144
1145 return 0;
1146
1147fail:
1148 gfs2_quota_cleanup(sdp);
1149 return error;
1150}
1151
1152void gfs2_quota_scan(struct gfs2_sbd *sdp)
1153{
1154 struct gfs2_quota_data *qd, *safe;
1155 LIST_HEAD(dead);
1156
1157 spin_lock(&sdp->sd_quota_spin);
1158 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1159 if (!qd->qd_count &&
1160 time_after_eq(jiffies, qd->qd_last_touched +
1161 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1162 list_move(&qd->qd_list, &dead);
1163 gfs2_assert_warn(sdp,
1164 atomic_read(&sdp->sd_quota_count) > 0);
1165 atomic_dec(&sdp->sd_quota_count);
1166 }
1167 }
1168 spin_unlock(&sdp->sd_quota_spin);
1169
1170 while (!list_empty(&dead)) {
1171 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1172 list_del(&qd->qd_list);
1173
1174 gfs2_assert_warn(sdp, !qd->qd_change);
1175 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1176 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1177
1178 gfs2_lvb_unhold(qd->qd_gl);
1179 kfree(qd);
1180 }
1181}
1182
1183void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1184{
1185 struct list_head *head = &sdp->sd_quota_list;
1186 struct gfs2_quota_data *qd;
1187 unsigned int x;
1188
1189 spin_lock(&sdp->sd_quota_spin);
1190 while (!list_empty(head)) {
1191 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1192
1193 if (qd->qd_count > 1 ||
1194 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1195 list_move(&qd->qd_list, head);
1196 spin_unlock(&sdp->sd_quota_spin);
1197 schedule();
1198 spin_lock(&sdp->sd_quota_spin);
1199 continue;
1200 }
1201
1202 list_del(&qd->qd_list);
1203 atomic_dec(&sdp->sd_quota_count);
1204 spin_unlock(&sdp->sd_quota_spin);
1205
1206 if (!qd->qd_count) {
1207 gfs2_assert_warn(sdp, !qd->qd_change);
1208 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1209 } else
1210 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1211 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1212
1213 gfs2_lvb_unhold(qd->qd_gl);
1214 kfree(qd);
1215
1216 spin_lock(&sdp->sd_quota_spin);
1217 }
1218 spin_unlock(&sdp->sd_quota_spin);
1219
1220 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1221
1222 if (sdp->sd_quota_bitmap) {
1223 for (x = 0; x < sdp->sd_quota_chunks; x++)
1224 kfree(sdp->sd_quota_bitmap[x]);
1225 kfree(sdp->sd_quota_bitmap);
1226 }
1227}
1228
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13struct gfs2_inode;
14struct gfs2_sbd;
15
16#define NO_QUOTA_CHANGE ((u32)-1)
17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip);
20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip);
23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid);
27
28int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30
31int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
34
35#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..62cd223819b7
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,571 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 u64 dblock;
39 u32 extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 *bh = gfs2_meta_ra(gl, dblock, extlen);
51
52 return error;
53}
54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
56{
57 struct list_head *head = &sdp->sd_revoke_list;
58 struct gfs2_revoke_replay *rr;
59 int found = 0;
60
61 list_for_each_entry(rr, head, rr_list) {
62 if (rr->rr_blkno == blkno) {
63 found = 1;
64 break;
65 }
66 }
67
68 if (found) {
69 rr->rr_where = where;
70 return 0;
71 }
72
73 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
74 if (!rr)
75 return -ENOMEM;
76
77 rr->rr_blkno = blkno;
78 rr->rr_where = where;
79 list_add(&rr->rr_list, head);
80
81 return 1;
82}
83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
85{
86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke;
88 int found = 0;
89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) {
92 found = 1;
93 break;
94 }
95 }
96
97 if (!found)
98 return 0;
99
100 wrap = (rr->rr_where < sdp->sd_replay_tail);
101 a = (sdp->sd_replay_tail < where);
102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b);
104
105 return revoke;
106}
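The wrap/a/b expression in gfs2_revoke_check() is the standard membership test for an interval on a ring: the block is revoked when its replay position `where` lies strictly between the replay tail and the recorded revoke position in circular log order, with the interval possibly wrapping past block 0. Restated as a stand-alone helper (hypothetical name):

/* Is `where` inside the circular interval (tail, rr_where) on a ring
 * of journal blocks? Without wrap the interval is contiguous (a && b);
 * with wrap it is the union of its two ends (a || b). */
static int in_circular_interval(unsigned int tail, unsigned int rr_where,
                                unsigned int where)
{
        int wrap = rr_where < tail;     /* interval crosses block 0 */
        int a = tail < where;
        int b = where < rr_where;

        return wrap ? (a || b) : (a && b);
}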
107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp)
109{
110 struct list_head *head = &sdp->sd_revoke_list;
111 struct gfs2_revoke_replay *rr;
112
113 while (!list_empty(head)) {
114 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
115 list_del(&rr->rr_list);
116 kfree(rr);
117 }
118}
119
120/**
121 * get_log_header - read the log header for a given segment
122 * @jd: the journal
123 * @blk: the block to look at
124 * @lh: the log header to return
125 *
126 * Read the log header for a given segment in a given journal. Do a few
127 * sanity checks on it.
128 *
129 * Returns: 0 on success,
130 * 1 if the header was invalid or incomplete,
131 * errno on error
132 */
133
134static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
135 struct gfs2_log_header *head)
136{
137 struct buffer_head *bh;
138 struct gfs2_log_header lh;
139 u32 hash;
140 int error;
141
142 error = gfs2_replay_read_block(jd, blk, &bh);
143 if (error)
144 return error;
145
146 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
147 lh.lh_hash = 0;
148 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
149 gfs2_log_header_in(&lh, bh->b_data);
150
151 brelse(bh);
152
153 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
154 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
155 lh.lh_blkno != blk || lh.lh_hash != hash)
156 return 1;
157
158 *head = lh;
159
160 return 0;
161}
162
163/**
164 * find_good_lh - find a good log header
165 * @jd: the journal
166 * @blk: the segment to start searching from
167 * @lh: the log header to fill in
168 * @forward: if true search forward in the log, else search backward
169 *
170 * Call get_log_header() to get a log header for a segment, but if the
171 * segment is bad, either scan forward or backward until we find a good one.
172 *
173 * Returns: errno
174 */
175
176static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
177 struct gfs2_log_header *head)
178{
179 unsigned int orig_blk = *blk;
180 int error;
181
182 for (;;) {
183 error = get_log_header(jd, *blk, head);
184 if (error <= 0)
185 return error;
186
187 if (++*blk == jd->jd_blocks)
188 *blk = 0;
189
190 if (*blk == orig_blk) {
191 gfs2_consist_inode(GFS2_I(jd->jd_inode));
192 return -EIO;
193 }
194 }
195}
196
197/**
198 * jhead_scan - make sure we've found the head of the log
199 * @jd: the journal
200 * @head: this is filled in with the log descriptor of the head
201 *
202 * At this point, seg and lh should be either the head of the log or just
203 * before. Scan forward until we find the head.
204 *
205 * Returns: errno
206 */
207
208static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
209{
210 unsigned int blk = head->lh_blkno;
211 struct gfs2_log_header lh;
212 int error;
213
214 for (;;) {
215 if (++blk == jd->jd_blocks)
216 blk = 0;
217
218 error = get_log_header(jd, blk, &lh);
219 if (error < 0)
220 return error;
221 if (error == 1)
222 continue;
223
224 if (lh.lh_sequence == head->lh_sequence) {
225 gfs2_consist_inode(GFS2_I(jd->jd_inode));
226 return -EIO;
227 }
228 if (lh.lh_sequence < head->lh_sequence)
229 break;
230
231 *head = lh;
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_find_jhead - find the head of a log
239 * @jd: the journal
240 * @head: the log descriptor for the head of the log is returned here
241 *
242 * Do a binary search of a journal and find the valid log entry with the
243 * highest sequence number. (i.e. the log head)
244 *
245 * Returns: errno
246 */
247
248int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
249{
250 struct gfs2_log_header lh_1, lh_m;
251 u32 blk_1, blk_2, blk_m;
252 int error;
253
254 blk_1 = 0;
255 blk_2 = jd->jd_blocks - 1;
256
257 for (;;) {
258 blk_m = (blk_1 + blk_2) / 2;
259
260 error = find_good_lh(jd, &blk_1, &lh_1);
261 if (error)
262 return error;
263
264 error = find_good_lh(jd, &blk_m, &lh_m);
265 if (error)
266 return error;
267
268 if (blk_1 == blk_m || blk_m == blk_2)
269 break;
270
271 if (lh_1.lh_sequence <= lh_m.lh_sequence)
272 blk_1 = blk_m;
273 else
274 blk_2 = blk_m;
275 }
276
277 error = jhead_scan(jd, &lh_1);
278 if (error)
279 return error;
280
281 *head = lh_1;
282
283 return error;
284}
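Sequence numbers grow monotonically around the journal and fall back at the head, so the on-disk log looks like a rotated sorted sequence; the bisection in gfs2_find_jhead() narrows in on the rotation point by comparing the midpoint's sequence with the left endpoint. A user-space analogue over a plain array (hypothetical helper; the kernel version additionally skips corrupt headers via find_good_lh() and finishes with a linear jhead_scan()):

/* Index of the maximum in a rotated strictly increasing sequence of
 * n >= 1 entries. */
static unsigned int find_max_rotated(const unsigned long long *seq,
                                     unsigned int n)
{
        unsigned int lo = 0, hi = n - 1;

        while (lo < hi) {
                unsigned int mid = (lo + hi) / 2;

                if (mid == lo)
                        break;
                if (seq[lo] <= seq[mid])
                        lo = mid;       /* head is at mid or beyond */
                else
                        hi = mid;       /* wrap point precedes mid */
        }
        return seq[lo] >= seq[hi] ? lo : hi;
}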
285
286/**
287 * foreach_descriptor - go through the active part of the log
288 * @jd: the journal
289 * @start: the first log header in the active region
290 * @end: the last log header (don't process the contents of this entry)
291 *
292 * Call a given function once for every log descriptor in the active
293 * portion of the log.
294 *
295 * Returns: errno
296 */
297
298static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
299 unsigned int end, int pass)
300{
301 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
302 struct buffer_head *bh;
303 struct gfs2_log_descriptor *ld;
304 int error = 0;
305 u32 length;
306 __be64 *ptr;
307 unsigned int offset = sizeof(struct gfs2_log_descriptor);
308 offset += sizeof(__be64) - 1;
309 offset &= ~(sizeof(__be64) - 1);
310
311 while (start != end) {
312 error = gfs2_replay_read_block(jd, start, &bh);
313 if (error)
314 return error;
315 if (gfs2_meta_check(sdp, bh)) {
316 brelse(bh);
317 return -EIO;
318 }
319 ld = (struct gfs2_log_descriptor *)bh->b_data;
320 length = be32_to_cpu(ld->ld_length);
321
322 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
323 struct gfs2_log_header lh;
324 error = get_log_header(jd, start, &lh);
325 if (!error) {
326 gfs2_replay_incr_blk(sdp, &start);
327 brelse(bh);
328 continue;
329 }
330 if (error == 1) {
331 gfs2_consist_inode(GFS2_I(jd->jd_inode));
332 error = -EIO;
333 }
334 brelse(bh);
335 return error;
336 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
337 brelse(bh);
338 return -EIO;
339 }
340 ptr = (__be64 *)(bh->b_data + offset);
341 error = lops_scan_elements(jd, start, ld, ptr, pass);
342 if (error) {
343 brelse(bh);
344 return error;
345 }
346
347 while (length--)
348 gfs2_replay_incr_blk(sdp, &start);
349
350 brelse(bh);
351 }
352
353 return 0;
354}
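The three `offset` statements at the top of foreach_descriptor() are the usual add-then-mask round-up, aligning the payload pointer to the next sizeof(__be64) boundary; the mask trick is valid only because the alignment is a power of two. As a generic helper:

/* Round `offset` up to the next multiple of `align`; `align` must be
 * a power of two. */
static unsigned int align_up(unsigned int offset, unsigned int align)
{
        return (offset + align - 1) & ~(align - 1);
}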
355
356/**
357 * clean_journal - mark a dirty journal as being clean
358 * @sdp: the filesystem
359 * @jd: the journal
360 * @gl: the journal's glock
361 * @head: the head journal to start from
362 *
363 * Returns: errno
364 */
365
366static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
367{
368 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
369 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
370 unsigned int lblock;
371 struct gfs2_log_header *lh;
372 u32 hash;
373 struct buffer_head *bh;
374 int error;
375 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
376
377 lblock = head->lh_blkno;
378 gfs2_replay_incr_blk(sdp, &lblock);
379 bh_map.b_size = 1 << ip->i_inode.i_blkbits;
380 error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
381 if (error)
382 return error;
383 if (!bh_map.b_blocknr) {
384 gfs2_consist_inode(ip);
385 return -EIO;
386 }
387
388 bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
389 lock_buffer(bh);
390 memset(bh->b_data, 0, bh->b_size);
391 set_buffer_uptodate(bh);
392 clear_buffer_dirty(bh);
393 unlock_buffer(bh);
394
395 lh = (struct gfs2_log_header *)bh->b_data;
396 memset(lh, 0, sizeof(struct gfs2_log_header));
397 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
398 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
399 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
400 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
401 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
402 lh->lh_blkno = cpu_to_be32(lblock);
403 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
404 lh->lh_hash = cpu_to_be32(hash);
405
406 set_buffer_dirty(bh);
407 if (sync_dirty_buffer(bh))
408 gfs2_io_error_bh(sdp, bh);
409 brelse(bh);
410
411 return error;
412}
413
414/**
415 * gfs2_recover_journal - recover a given journal
416 * @jd: the struct gfs2_jdesc describing the journal
417 *
418 * Acquire the journal's lock, check to see if the journal is clean, and
419 * do recovery if necessary.
420 *
421 * Returns: errno
422 */
423
424int gfs2_recover_journal(struct gfs2_jdesc *jd)
425{
426 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
427 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
428 struct gfs2_log_header head;
429 struct gfs2_holder j_gh, ji_gh, t_gh;
430 unsigned long t;
431 int ro = 0;
432 unsigned int pass;
433 int error;
434
435 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
436 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
437 jd->jd_jid);
438
439 /* Acquire the journal lock so we can do recovery */
440
441 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
442 LM_ST_EXCLUSIVE,
443 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
444 &j_gh);
445 switch (error) {
446 case 0:
447 break;
448
449 case GLR_TRYFAILED:
450 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
451 error = 0;
452 /* fall through */
453 default:
454 goto fail;
455 }
456
457 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
458 LM_FLAG_NOEXP, &ji_gh);
459 if (error)
460 goto fail_gunlock_j;
461 } else {
462 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
463 }
464
465 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
466
467 error = gfs2_jdesc_check(jd);
468 if (error)
469 goto fail_gunlock_ji;
470
471 error = gfs2_find_jhead(jd, &head);
472 if (error)
473 goto fail_gunlock_ji;
474
475 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
476 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
477 jd->jd_jid);
478
479 t = jiffies;
480
481 /* Acquire a shared hold on the transaction lock */
482
483 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
484 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
485 GL_NOCANCEL | GL_NOCACHE, &t_gh);
486 if (error)
487 goto fail_gunlock_ji;
488
489 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
490 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
491 ro = 1;
492 } else {
493 if (sdp->sd_vfs->s_flags & MS_RDONLY)
494 ro = 1;
495 }
496
497 if (ro) {
498 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
499 jd->jd_jid);
500 error = -EROFS;
501 goto fail_gunlock_tr;
502 }
503
504 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
505
506 for (pass = 0; pass < 2; pass++) {
507 lops_before_scan(jd, &head, pass);
508 error = foreach_descriptor(jd, head.lh_tail,
509 head.lh_blkno, pass);
510 lops_after_scan(jd, error, pass);
511 if (error)
512 goto fail_gunlock_tr;
513 }
514
515 error = clean_journal(jd, &head);
516 if (error)
517 goto fail_gunlock_tr;
518
519 gfs2_glock_dq_uninit(&t_gh);
520 t = DIV_ROUND_UP(jiffies - t, HZ);
521 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
522 jd->jd_jid, t);
523 }
524
525 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
526 gfs2_glock_dq_uninit(&ji_gh);
527
528 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
529
530 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
531 gfs2_glock_dq_uninit(&j_gh);
532
533 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
534 return 0;
535
536fail_gunlock_tr:
537 gfs2_glock_dq_uninit(&t_gh);
538fail_gunlock_ji:
539 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
540 gfs2_glock_dq_uninit(&ji_gh);
541fail_gunlock_j:
542 gfs2_glock_dq_uninit(&j_gh);
543 }
544
545 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
546
547fail:
548 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
549 return error;
550}
551
552/**
553 * gfs2_check_journals - Recover any dirty journals
554 * @sdp: the filesystem
555 *
556 */
557
558void gfs2_check_journals(struct gfs2_sbd *sdp)
559{
560 struct gfs2_jdesc *jd;
561
562 for (;;) {
563 jd = gfs2_jdesc_find_dirty(sdp);
564 if (!jd)
565 break;
566
567 if (jd != sdp->sd_jdesc)
568 gfs2_recover_journal(jd);
569 }
570}
571
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13#include "incore.h"
14
15static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
16{
17 if (++*blk == sdp->sd_jdesc->jd_blocks)
18 *blk = 0;
19}
20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh);
23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27
28int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp);
32
33#endif /* __RECOVERY_DOT_H__ */
34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT ((u32)~0)
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
41 * 2 = Unlinked (still in use) inode
42 * 3 = Used (metadata)
43 */
44
45static const char valid_change[16] = {
46 /* current */
47 /* n */ 0, 1, 1, 1,
48 /* e */ 1, 0, 0, 0,
49 /* w */ 0, 0, 0, 1,
50 1, 0, 0, 0
51};
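With two bits per block, one byte covers GFS2_NBBY (4) blocks, and valid_change is indexed as new_state * 4 + cur_state to whitelist legal transitions. A minimal sketch of the two-bit packing itself; the constants and helpers are illustrative stand-ins (gfs2_setbit() below performs the update with an XOR/OR pair after consulting valid_change):

/* Two-bit block states packed four to a byte; pair i of a byte lives
 * at bit position 2 * i. Illustrative only. */
#define EX_NBBY         4
#define EX_BIT_SIZE     2
#define EX_BIT_MASK     0x3

static unsigned char get_state(const unsigned char *buf, unsigned int block)
{
        unsigned int bit = (block % EX_NBBY) * EX_BIT_SIZE;

        return (buf[block / EX_NBBY] >> bit) & EX_BIT_MASK;
}

static void set_state(unsigned char *buf, unsigned int block,
                      unsigned char state)
{
        unsigned int bit = (block % EX_NBBY) * EX_BIT_SIZE;
        unsigned int idx = block / EX_NBBY;

        buf[idx] = (buf[idx] & ~(EX_BIT_MASK << bit)) | (state << bit);
}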
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, u32 block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, u32 block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
122 * Return: the block number (bitmap buffer scope) that was found
123 */
124
125static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, u32 goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 u32 blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
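The `(*byte & 0x55) == alloc` test in gfs2_bitfit() is a whole-byte fast path: 0x55 (binary 01010101) selects bit 0, the allocated/free bit, of all four bit-pairs at once, so a byte in which every pair's allocated bit disagrees with the state being searched for is skipped without examining pairs one by one. Restated as a predicate (hypothetical name):

/* Can this byte possibly contain a pair in `old_state`? When hunting
 * for a free state (bit 0 clear), a byte whose four allocated bits are
 * all set cannot match; when hunting for an allocated state, a byte
 * whose four allocated bits are all clear cannot match. */
static int byte_may_match(unsigned char byte, unsigned char old_state)
{
        unsigned char alloc = (old_state & 1) ? 0 : 0x55;

        return (byte & 0x55) != alloc;
}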
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 u32 count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
198 * @sdp: the filesystem
199 * @rgd: the rgrp
200 *
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 u32 length = rgd->rd_ri.ri_length;
208 u32 count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(u32));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
236 count[1], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
257{
258 u64 first = ri->ri_data0;
259 u64 last = first + ri->ri_data;
260 return first <= block && block < last;
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
266 * @n: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
358 * compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 u32 bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
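compute_bitstructs() slices one resource group's bitmap across ri_length blocks: in the first block the bitmap starts after the full struct gfs2_rgrp header, in later blocks after a plain struct gfs2_meta_header, and the last block takes whatever bytes remain. A stand-alone sketch of the same layout computation, with made-up struct and size names:

/* Compute (offset-in-block, start-in-bitmap, length) for each of the
 * `length` blocks holding `bitbytes` of bitmap. All names here are
 * illustrative; in GFS2 the headers are struct gfs2_rgrp and
 * struct gfs2_meta_header. */
struct bm_slice {
        unsigned int offset;    /* where the bitmap starts in the block */
        unsigned int start;     /* byte offset within the whole bitmap */
        unsigned int len;       /* bitmap bytes held by this block */
};

static void layout_bitmap(struct bm_slice *s, unsigned int length,
                          unsigned int bsize, unsigned int rg_hdr,
                          unsigned int mh_hdr, unsigned int bitbytes)
{
        unsigned int x, bytes, left = bitbytes;

        for (x = 0; x < length; x++) {
                unsigned int hdr = (x == 0) ? rg_hdr : mh_hdr;

                bytes = (x + 1 == length) ? left : bsize - hdr;
                s[x].offset = hdr;
                s[x].start = bitbytes - left;
                s[x].len = bytes;
                left -= bytes;
        }
}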
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
436 * @ip: the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 u64 junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
517 * filesystem (via gfs2_grow utility), which adds new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
551 * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
579 if (error)
580 goto fail;
581 }
582
583 for (y = length; y--;) {
584 bi = rgd->rd_bits + y;
585 error = gfs2_meta_wait(sdp, bi->bi_bh);
586 if (error)
587 goto fail;
588 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
589 GFS2_METATYPE_RG)) {
590 error = -EIO;
591 goto fail;
592 }
593 }
594
595 if (rgd->rd_rg_vn != gl->gl_vn) {
596 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
597 rgd->rd_rg_vn = gl->gl_vn;
598 }
599
600 spin_lock(&sdp->sd_rindex_spin);
601 rgd->rd_free_clone = rgd->rd_rg.rg_free;
602 rgd->rd_bh_count++;
603 spin_unlock(&sdp->sd_rindex_spin);
604
605 mutex_unlock(&rgd->rd_mutex);
606
607 return 0;
608
609fail:
610 while (x--) {
611 bi = rgd->rd_bits + x;
612 brelse(bi->bi_bh);
613 bi->bi_bh = NULL;
614 gfs2_assert_warn(sdp, !bi->bi_clone);
615 }
616 mutex_unlock(&rgd->rd_mutex);
617
618 return error;
619}
620
621void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
622{
623 struct gfs2_sbd *sdp = rgd->rd_sbd;
624
625 spin_lock(&sdp->sd_rindex_spin);
626 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
627 rgd->rd_bh_count++;
628 spin_unlock(&sdp->sd_rindex_spin);
629}
630
631/**
632 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
633 * @rgd: the struct gfs2_rgrpd describing the RG to read in
634 *
635 */
636
637void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
638{
639 struct gfs2_sbd *sdp = rgd->rd_sbd;
640 int x, length = rgd->rd_ri.ri_length;
641
642 spin_lock(&sdp->sd_rindex_spin);
643 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
644 if (--rgd->rd_bh_count) {
645 spin_unlock(&sdp->sd_rindex_spin);
646 return;
647 }
648
649 for (x = 0; x < length; x++) {
650 struct gfs2_bitmap *bi = rgd->rd_bits + x;
651 kfree(bi->bi_clone);
652 bi->bi_clone = NULL;
653 brelse(bi->bi_bh);
654 bi->bi_bh = NULL;
655 }
656
657 spin_unlock(&sdp->sd_rindex_spin);
658}
659
660void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
661{
662 struct gfs2_sbd *sdp = rgd->rd_sbd;
663 unsigned int length = rgd->rd_ri.ri_length;
664 unsigned int x;
665
666 for (x = 0; x < length; x++) {
667 struct gfs2_bitmap *bi = rgd->rd_bits + x;
668 if (!bi->bi_clone)
669 continue;
670 memcpy(bi->bi_clone + bi->bi_offset,
671 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
672 }
673
674 spin_lock(&sdp->sd_rindex_spin);
675 rgd->rd_free_clone = rgd->rd_rg.rg_free;
676 spin_unlock(&sdp->sd_rindex_spin);
677}
678
679/**
680 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
681 * @ip: the incore GFS2 inode structure
682 *
683 * Returns: the struct gfs2_alloc
684 */
685
686struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
687{
688 struct gfs2_alloc *al = &ip->i_alloc;
689
690 /* FIXME: Should assert that the correct locks are held here... */
691 memset(al, 0, sizeof(*al));
692 return al;
693}
694
695/**
696 * try_rgrp_fit - See if a given reservation will fit in a given RG
697 * @rgd: the RG data
698 * @al: the struct gfs2_alloc structure describing the reservation
699 *
700 * If there's room for the requested blocks to be allocated from the RG:
701 * Sets the @al_rgd field in @al to point at the chosen RG.
702 * (The check is simply that the RG's cloned free-block count covers
703 * @al_requested; no other fields are reserved here.)
704 *
705 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
706 */
707
708static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
709{
710 struct gfs2_sbd *sdp = rgd->rd_sbd;
711 int ret = 0;
712
713 spin_lock(&sdp->sd_rindex_spin);
714 if (rgd->rd_free_clone >= al->al_requested) {
715 al->al_rgd = rgd;
716 ret = 1;
717 }
718 spin_unlock(&sdp->sd_rindex_spin);
719
720 return ret;
721}
722
723/**
724 * recent_rgrp_first - get first RG from "recent" list
725 * @sdp: The GFS2 superblock
726 * @rglast: address of the rgrp used last
727 *
728 * Returns: The first rgrp in the recent list
729 */
730
731static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
732 u64 rglast)
733{
734 struct gfs2_rgrpd *rgd = NULL;
735
736 spin_lock(&sdp->sd_rindex_spin);
737
738 if (list_empty(&sdp->sd_rindex_recent_list))
739 goto out;
740
741 if (!rglast)
742 goto first;
743
744 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
745 if (rgd->rd_ri.ri_addr == rglast)
746 goto out;
747 }
748
749first:
750 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
751 rd_recent);
752out:
753 spin_unlock(&sdp->sd_rindex_spin);
754 return rgd;
755}
756
757/**
758 * recent_rgrp_next - get next RG from "recent" list
759 * @cur_rgd: current rgrp
760 * @remove: if set, take @cur_rgd off the "recent" list
761 *
762 * Returns: The next rgrp in the recent list
763 */
764
765static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
766 int remove)
767{
768 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
769 struct list_head *head;
770 struct gfs2_rgrpd *rgd;
771
772 spin_lock(&sdp->sd_rindex_spin);
773
774 head = &sdp->sd_rindex_recent_list;
775
776 list_for_each_entry(rgd, head, rd_recent) {
777 if (rgd == cur_rgd) {
778 if (cur_rgd->rd_recent.next != head)
779 rgd = list_entry(cur_rgd->rd_recent.next,
780 struct gfs2_rgrpd, rd_recent);
781 else
782 rgd = NULL;
783
784 if (remove)
785 list_del(&cur_rgd->rd_recent);
786
787 goto out;
788 }
789 }
790
791 rgd = NULL;
792 if (!list_empty(head))
793 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
794
795out:
796 spin_unlock(&sdp->sd_rindex_spin);
797 return rgd;
798}
799
800/**
801 * recent_rgrp_add - add an RG to tail of "recent" list
802 * @new_rgd: The rgrp to add
803 *
804 */
805
806static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
807{
808 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
809 struct gfs2_rgrpd *rgd;
810 unsigned int count = 0;
811 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
812
813 spin_lock(&sdp->sd_rindex_spin);
814
815 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
816 if (rgd == new_rgd)
817 goto out;
818
819 if (++count >= max)
820 goto out;
821 }
822 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
823
824out:
825 spin_unlock(&sdp->sd_rindex_spin);
826}
827
828/**
829 * forward_rgrp_get - get an rgrp to try next from full list
830 * @sdp: The GFS2 superblock
831 *
832 * Returns: The rgrp to try next
833 */
834
835static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
836{
837 struct gfs2_rgrpd *rgd;
838 unsigned int journals = gfs2_jindex_size(sdp);
839 unsigned int rg = 0, x;
840
841 spin_lock(&sdp->sd_rindex_spin);
842
843 rgd = sdp->sd_rindex_forward;
844 if (!rgd) {
845 if (sdp->sd_rgrps >= journals)
846 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
847
848 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
849 x++, rgd = gfs2_rgrpd_get_next(rgd))
850 /* Do Nothing */;
851
852 sdp->sd_rindex_forward = rgd;
853 }
854
855 spin_unlock(&sdp->sd_rindex_spin);
856
857 return rgd;
858}
859
860/**
861 * forward_rgrp_set - set the forward rgrp pointer
862 * @sdp: the filesystem
863 * @rgd: The new forward rgrp
864 *
865 */
866
867static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
868{
869 spin_lock(&sdp->sd_rindex_spin);
870 sdp->sd_rindex_forward = rgd;
871 spin_unlock(&sdp->sd_rindex_spin);
872}
873
874/**
875 * get_local_rgrp - Choose and lock a rgrp for allocation
876 * @ip: the inode to reserve space for
877 *
878 * Try to acquire an rgrp in a way which avoids contending with others.
879 * The chosen rgrp is recorded, locked, in @ip's gfs2_alloc structure.
880 *
881 * Returns: errno
882 */
883
884static int get_local_rgrp(struct gfs2_inode *ip)
885{
886 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
887 struct gfs2_rgrpd *rgd, *begin = NULL;
888 struct gfs2_alloc *al = &ip->i_alloc;
889 int flags = LM_FLAG_TRY;
890 int skipped = 0;
891 int loops = 0;
892 int error;
893
894 /* Try recently successful rgrps */
895
896 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
897
898 while (rgd) {
899 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
900 LM_FLAG_TRY, &al->al_rgd_gh);
901 switch (error) {
902 case 0:
903 if (try_rgrp_fit(rgd, al))
904 goto out;
905 gfs2_glock_dq_uninit(&al->al_rgd_gh);
906 rgd = recent_rgrp_next(rgd, 1);
907 break;
908
909 case GLR_TRYFAILED:
910 rgd = recent_rgrp_next(rgd, 0);
911 break;
912
913 default:
914 return error;
915 }
916 }
917
918 /* Go through full list of rgrps */
919
920 begin = rgd = forward_rgrp_get(sdp);
921
922 for (;;) {
923 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
924 &al->al_rgd_gh);
925 switch (error) {
926 case 0:
927 if (try_rgrp_fit(rgd, al))
928 goto out;
929 gfs2_glock_dq_uninit(&al->al_rgd_gh);
930 break;
931
932 case GLR_TRYFAILED:
933 skipped++;
934 break;
935
936 default:
937 return error;
938 }
939
940 rgd = gfs2_rgrpd_get_next(rgd);
941 if (!rgd)
942 rgd = gfs2_rgrpd_get_first(sdp);
943
944 if (rgd == begin) {
945 if (++loops >= 2 || !skipped)
946 return -ENOSPC;
947 flags = 0;
948 }
949 }
950
951out:
952 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
953
954 if (begin) {
955 recent_rgrp_add(rgd);
956 rgd = gfs2_rgrpd_get_next(rgd);
957 if (!rgd)
958 rgd = gfs2_rgrpd_get_first(sdp);
959 forward_rgrp_set(sdp, rgd);
960 }
961
962 return 0;
963}
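
get_local_rgrp() is a two-pass search: the first sweep uses try-locks
(LM_FLAG_TRY) so this node skips rgrps other nodes are busy with, and only if
a full loop found groups that were merely busy rather than unsuitable does a
later sweep block for them. A rough userspace analogue of that strategy
(pthreads; fits() is a hypothetical stand-in for try_rgrp_fit()):

#include <errno.h>
#include <pthread.h>

static int pick_group(pthread_mutex_t *locks, int n,
                      int (*fits)(int), int *out)
{
        int pass, i, skipped = 0;

        for (pass = 0; pass < 2; pass++) {
                for (i = 0; i < n; i++) {
                        int err = pass ? pthread_mutex_lock(&locks[i]) :
                                         pthread_mutex_trylock(&locks[i]);
                        if (err == EBUSY) {     /* like GLR_TRYFAILED */
                                skipped++;
                                continue;
                        }
                        if (err)
                                return -err;
                        if (fits(i)) {
                                *out = i;       /* return holding the lock */
                                return 0;
                        }
                        pthread_mutex_unlock(&locks[i]);
                }
                if (!skipped)           /* nothing was busy: really full */
                        break;
        }
        return -ENOSPC;
}
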
964
965/**
966 * gfs2_inplace_reserve_i - Reserve space in the filesystem
967 * @ip: the inode to reserve space for
968 * @file: the caller's source file and @line: its line number (debugging aid)
969 * Returns: errno
970 */
971
972int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
973{
974 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
975 struct gfs2_alloc *al = &ip->i_alloc;
976 int error;
977
978 if (gfs2_assert_warn(sdp, al->al_requested))
979 return -EINVAL;
980
981 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
982 if (error)
983 return error;
984
985 error = get_local_rgrp(ip);
986 if (error) {
987 gfs2_glock_dq_uninit(&al->al_ri_gh);
988 return error;
989 }
990
991 al->al_file = file;
992 al->al_line = line;
993
994 return 0;
995}
996
997/**
998 * gfs2_inplace_release - release an inplace reservation
999 * @ip: the inode the reservation was taken out on
1000 *
1001 * Release a reservation made by gfs2_inplace_reserve().
1002 */
1003
1004void gfs2_inplace_release(struct gfs2_inode *ip)
1005{
1006 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1007 struct gfs2_alloc *al = &ip->i_alloc;
1008
1009 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1010 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1011 "al_file = %s, al_line = %u\n",
1012 al->al_alloced, al->al_requested, al->al_file,
1013 al->al_line);
1014
1015 al->al_rgd = NULL;
1016 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1017 gfs2_glock_dq_uninit(&al->al_ri_gh);
1018}
1019
1020/**
1021 * gfs2_get_block_type - Determine the allocation state of a block in an RG
1022 * @rgd: the resource group holding the block
1023 * @block: the block number
1024 *
1025 * Returns: The block type (GFS2_BLKST_*)
1026 */
1027
1028unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1029{
1030 struct gfs2_bitmap *bi = NULL;
1031 u32 length, rgrp_block, buf_block;
1032 unsigned int buf;
1033 unsigned char type;
1034
1035 length = rgd->rd_ri.ri_length;
1036 rgrp_block = block - rgd->rd_ri.ri_data0;
1037
1038 for (buf = 0; buf < length; buf++) {
1039 bi = rgd->rd_bits + buf;
1040 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1041 break;
1042 }
1043
1044 gfs2_assert(rgd->rd_sbd, buf < length);
1045 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1046
1047 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1048 bi->bi_len, buf_block);
1049
1050 return type;
1051}
1052
1053/**
1054 * rgblk_search - find a block in @old_state, change allocation
1055 * state to @new_state
1056 * @rgd: the resource group descriptor
1057 * @goal: the goal block within the RG (start here to search for avail block)
1058 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1059 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1060 *
1061 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1062 * Add the found bitmap buffer to the transaction.
1063 * Set the found bits to @new_state to change block's allocation state.
1064 *
1065 * This function never fails, because we wouldn't call it unless we
1066 * know (from reservation results, etc.) that a block is available.
1067 *
1068 * Scope of @goal and returned block is just within rgrp, not the whole
1069 * filesystem.
1070 *
1071 * Returns: the block number allocated
1072 */
1073
1074static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1075 unsigned char old_state, unsigned char new_state)
1076{
1077 struct gfs2_bitmap *bi = NULL;
1078 u32 length = rgd->rd_ri.ri_length;
1079 u32 blk = 0;
1080 unsigned int buf, x;
1081
1082 /* Find bitmap block that contains bits for goal block */
1083 for (buf = 0; buf < length; buf++) {
1084 bi = rgd->rd_bits + buf;
1085 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1086 break;
1087 }
1088
1089 gfs2_assert(rgd->rd_sbd, buf < length);
1090
1091 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1092 goal -= bi->bi_start * GFS2_NBBY;
1093
1094	/* Search (up to the entire) bitmap in this rgrp for an allocatable block.
1095	   "x <= length", instead of "x < length", because we typically start
1096	   the search in the middle of a bit block, but if we can't find an
1097	   allocatable block anywhere else, we want to be able to wrap around and
1098	   search in the first part of our first-searched bit block. */
1099 for (x = 0; x <= length; x++) {
1100 if (bi->bi_clone)
1101 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1102 bi->bi_len, goal, old_state);
1103 else
1104 blk = gfs2_bitfit(rgd,
1105 bi->bi_bh->b_data + bi->bi_offset,
1106 bi->bi_len, goal, old_state);
1107 if (blk != BFITNOENT)
1108 break;
1109
1110 /* Try next bitmap block (wrap back to rgrp header if at end) */
1111 buf = (buf + 1) % length;
1112 bi = rgd->rd_bits + buf;
1113 goal = 0;
1114 }
1115
1116 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1117 blk = 0;
1118
1119 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1120 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1121 bi->bi_len, blk, new_state);
1122 if (bi->bi_clone)
1123 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1124 bi->bi_len, blk, new_state);
1125
1126 return bi->bi_start * GFS2_NBBY + blk;
1127}
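
rgblk_search() and its helpers operate on a packed bitmap with two bits per
block: each byte covers GFS2_NBBY (4) blocks, and the 2-bit values are the
GFS2_BLKST_* states. A standalone sketch of that encoding; it mirrors the
divide/modulo arithmetic, but the authoritative bit layout is in the kernel's
gfs2_testbit()/gfs2_setbit() themselves:

#define BLKS_PER_BYTE 4                 /* like GFS2_NBBY */

static unsigned char bmap_get(const unsigned char *bitmap, unsigned int blk)
{
        unsigned int bit = (blk % BLKS_PER_BYTE) * 2;

        return (bitmap[blk / BLKS_PER_BYTE] >> bit) & 0x3;
}

static void bmap_set(unsigned char *bitmap, unsigned int blk,
                     unsigned char state)
{
        unsigned int bit = (blk % BLKS_PER_BYTE) * 2;
        unsigned char *byte = &bitmap[blk / BLKS_PER_BYTE];

        *byte = (*byte & ~(0x3 << bit)) | ((state & 0x3) << bit);
}
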
1128
1129/**
1130 * rgblk_free - Change alloc state of given block(s)
1131 * @sdp: the filesystem
1132 * @bstart: the start of a run of blocks to free
1133 * @blen: the length of the block run (all must lie within ONE RG!)
1134 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1135 *
1136 * Returns: Resource group containing the block(s)
1137 */
1138
1139static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1140 u32 blen, unsigned char new_state)
1141{
1142 struct gfs2_rgrpd *rgd;
1143 struct gfs2_bitmap *bi = NULL;
1144 u32 length, rgrp_blk, buf_blk;
1145 unsigned int buf;
1146
1147 rgd = gfs2_blk2rgrpd(sdp, bstart);
1148 if (!rgd) {
1149 if (gfs2_consist(sdp))
1150 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1151 return NULL;
1152 }
1153
1154 length = rgd->rd_ri.ri_length;
1155
1156 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1157
1158 while (blen--) {
1159 for (buf = 0; buf < length; buf++) {
1160 bi = rgd->rd_bits + buf;
1161 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1162 break;
1163 }
1164
1165 gfs2_assert(rgd->rd_sbd, buf < length);
1166
1167 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1168 rgrp_blk++;
1169
1170 if (!bi->bi_clone) {
1171 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1172 GFP_NOFS | __GFP_NOFAIL);
1173 memcpy(bi->bi_clone + bi->bi_offset,
1174 bi->bi_bh->b_data + bi->bi_offset,
1175 bi->bi_len);
1176 }
1177 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1178 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1179 bi->bi_len, buf_blk, new_state);
1180 }
1181
1182 return rgd;
1183}
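
The bi_clone buffer created here is a lazy copy-on-write snapshot: since
rgblk_search() prefers bi_clone when it exists, blocks freed in the current
transaction still look allocated to the allocator until it is safe to reuse
them and gfs2_rgrp_repolish_clones() resyncs the copy. A userspace sketch of
the clone-before-first-write step (names hypothetical):

#include <stdlib.h>
#include <string.h>

struct bmapbuf {
        unsigned char *live;            /* data about to be modified */
        unsigned char *clone;           /* pre-modification snapshot */
        size_t len;
};

static int clone_before_write(struct bmapbuf *b)
{
        if (b->clone)                   /* snapshot already taken */
                return 0;
        b->clone = malloc(b->len);
        if (!b->clone)
                return -1;
        memcpy(b->clone, b->live, b->len);
        return 0;
}
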
1184
1185/**
1186 * gfs2_alloc_data - Allocate a data block
1187 * @ip: the inode to allocate the data block for
1188 *
1189 * Returns: the allocated block
1190 */
1191
1192u64 gfs2_alloc_data(struct gfs2_inode *ip)
1193{
1194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1195 struct gfs2_alloc *al = &ip->i_alloc;
1196 struct gfs2_rgrpd *rgd = al->al_rgd;
1197 u32 goal, blk;
1198 u64 block;
1199
1200 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1201 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1202 else
1203 goal = rgd->rd_last_alloc_data;
1204
1205 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1206 rgd->rd_last_alloc_data = blk;
1207
1208 block = rgd->rd_ri.ri_data0 + blk;
1209 ip->i_di.di_goal_data = block;
1210
1211 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1212 rgd->rd_rg.rg_free--;
1213
1214 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1215 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1216
1217 al->al_alloced++;
1218
1219 gfs2_statfs_change(sdp, 0, -1, 0);
1220 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1221
1222 spin_lock(&sdp->sd_rindex_spin);
1223 rgd->rd_free_clone--;
1224 spin_unlock(&sdp->sd_rindex_spin);
1225
1226 return block;
1227}
1228
1229/**
1230 * gfs2_alloc_meta - Allocate a metadata block
1231 * @ip: the inode to allocate the metadata block for
1232 *
1233 * Returns: the allocated block
1234 */
1235
1236u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1237{
1238 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1239 struct gfs2_alloc *al = &ip->i_alloc;
1240 struct gfs2_rgrpd *rgd = al->al_rgd;
1241 u32 goal, blk;
1242 u64 block;
1243
1244 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1245 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1246 else
1247 goal = rgd->rd_last_alloc_meta;
1248
1249 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1250 rgd->rd_last_alloc_meta = blk;
1251
1252 block = rgd->rd_ri.ri_data0 + blk;
1253 ip->i_di.di_goal_meta = block;
1254
1255 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1256 rgd->rd_rg.rg_free--;
1257
1258 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1259 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1260
1261 al->al_alloced++;
1262
1263 gfs2_statfs_change(sdp, 0, -1, 0);
1264 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1265 gfs2_trans_add_unrevoke(sdp, block);
1266
1267 spin_lock(&sdp->sd_rindex_spin);
1268 rgd->rd_free_clone--;
1269 spin_unlock(&sdp->sd_rindex_spin);
1270
1271 return block;
1272}
1273
1274/**
1275 * gfs2_alloc_di - Allocate a dinode
1276 * @dip: the directory that the inode is going in
1277 * @generation: set to the new inode's generation number
1278 * Returns: the block allocated
1279 */
1280
1281u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1282{
1283 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1284 struct gfs2_alloc *al = &dip->i_alloc;
1285 struct gfs2_rgrpd *rgd = al->al_rgd;
1286 u32 blk;
1287 u64 block;
1288
1289 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1290 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1291
1292 rgd->rd_last_alloc_meta = blk;
1293
1294 block = rgd->rd_ri.ri_data0 + blk;
1295
1296 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1297 rgd->rd_rg.rg_free--;
1298 rgd->rd_rg.rg_dinodes++;
1299 *generation = rgd->rd_rg.rg_igeneration++;
1300 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1301 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1302
1303 al->al_alloced++;
1304
1305 gfs2_statfs_change(sdp, 0, -1, +1);
1306 gfs2_trans_add_unrevoke(sdp, block);
1307
1308 spin_lock(&sdp->sd_rindex_spin);
1309 rgd->rd_free_clone--;
1310 spin_unlock(&sdp->sd_rindex_spin);
1311
1312 return block;
1313}
1314
1315/**
1316 * gfs2_free_data - free a contiguous run of data block(s)
1317 * @ip: the inode these blocks are being freed from
1318 * @bstart: first block of a run of contiguous blocks
1319 * @blen: the length of the block run
1320 *
1321 */
1322
1323void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1324{
1325 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1326 struct gfs2_rgrpd *rgd;
1327
1328 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1329 if (!rgd)
1330 return;
1331
1332 rgd->rd_rg.rg_free += blen;
1333
1334 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1335 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1336
1337 gfs2_trans_add_rg(rgd);
1338
1339 gfs2_statfs_change(sdp, 0, +blen, 0);
1340 gfs2_quota_change(ip, -(s64)blen,
1341 ip->i_di.di_uid, ip->i_di.di_gid);
1342}
1343
1344/**
1345 * gfs2_free_meta - free a contiguous run of metadata block(s)
1346 * @ip: the inode these blocks are being freed from
1347 * @bstart: first block of a run of contiguous blocks
1348 * @blen: the length of the block run
1349 *
1350 */
1351
1352void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1353{
1354 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1355 struct gfs2_rgrpd *rgd;
1356
1357 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1358 if (!rgd)
1359 return;
1360
1361 rgd->rd_rg.rg_free += blen;
1362
1363 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1364 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1365
1366 gfs2_trans_add_rg(rgd);
1367
1368 gfs2_statfs_change(sdp, 0, +blen, 0);
1369 gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1370 gfs2_meta_wipe(ip, bstart, blen);
1371}
1372
1373void gfs2_unlink_di(struct inode *inode)
1374{
1375 struct gfs2_inode *ip = GFS2_I(inode);
1376 struct gfs2_sbd *sdp = GFS2_SB(inode);
1377 struct gfs2_rgrpd *rgd;
1378 u64 blkno = ip->i_num.no_addr;
1379
1380 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1381 if (!rgd)
1382 return;
1383 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1384 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1385 gfs2_trans_add_rg(rgd);
1386}
1387
1388static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1389{
1390 struct gfs2_sbd *sdp = rgd->rd_sbd;
1391 struct gfs2_rgrpd *tmp_rgd;
1392
1393 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1394 if (!tmp_rgd)
1395 return;
1396 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1397
1398 if (!rgd->rd_rg.rg_dinodes)
1399 gfs2_consist_rgrpd(rgd);
1400 rgd->rd_rg.rg_dinodes--;
1401 rgd->rd_rg.rg_free++;
1402
1403 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1404 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1405
1406 gfs2_statfs_change(sdp, 0, +1, -1);
1407 gfs2_trans_add_rg(rgd);
1408}
1409
1410
1411void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1412{
1413 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1414 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1415 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1416}
1417
1418/**
1419 * gfs2_rlist_add - add a RG to a list of RGs
1420 * @sdp: the filesystem
1421 * @rlist: the list of resource groups
1422 * @block: the block
1423 *
1424 * Figure out what RG a block belongs to and add that RG to the list
1425 *
1426 * FIXME: Don't use NOFAIL
1427 *
1428 */
1429
1430void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1431 u64 block)
1432{
1433 struct gfs2_rgrpd *rgd;
1434 struct gfs2_rgrpd **tmp;
1435 unsigned int new_space;
1436 unsigned int x;
1437
1438 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1439 return;
1440
1441 rgd = gfs2_blk2rgrpd(sdp, block);
1442 if (!rgd) {
1443 if (gfs2_consist(sdp))
1444 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1445 return;
1446 }
1447
1448 for (x = 0; x < rlist->rl_rgrps; x++)
1449 if (rlist->rl_rgd[x] == rgd)
1450 return;
1451
1452 if (rlist->rl_rgrps == rlist->rl_space) {
1453 new_space = rlist->rl_space + 10;
1454
1455 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1456 GFP_NOFS | __GFP_NOFAIL);
1457
1458 if (rlist->rl_rgd) {
1459 memcpy(tmp, rlist->rl_rgd,
1460 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1461 kfree(rlist->rl_rgd);
1462 }
1463
1464 rlist->rl_space = new_space;
1465 rlist->rl_rgd = tmp;
1466 }
1467
1468 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1469}
1470
1471/**
1472 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1473 * and initialize an array of glock holders for them
1474 * @rlist: the list of resource groups
1475 * @state: the lock state to acquire the RG lock in
1476 * @flags: the modifier flags for the holder structures
1477 *
1478 * FIXME: Don't use NOFAIL
1479 *
1480 */
1481
1482void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1483 int flags)
1484{
1485 unsigned int x;
1486
1487 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1488 GFP_NOFS | __GFP_NOFAIL);
1489 for (x = 0; x < rlist->rl_rgrps; x++)
1490 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1491 state, flags,
1492 &rlist->rl_ghs[x]);
1493}
1494
1495/**
1496 * gfs2_rlist_free - free a resource group list
1497 * @rlist: the list of resource groups
1498 *
1499 */
1500
1501void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1502{
1503 unsigned int x;
1504
1505 kfree(rlist->rl_rgd);
1506
1507 if (rlist->rl_ghs) {
1508 for (x = 0; x < rlist->rl_rgrps; x++)
1509 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1510 kfree(rlist->rl_ghs);
1511 }
1512}
1513
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..b01e0cfc99b5
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13struct gfs2_rgrpd;
14struct gfs2_sbd;
15struct gfs2_holder;
16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{
35 return; /* So we can see where ip->i_alloc is used */
36}
37
38int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
39 char *file, unsigned int line);
40#define gfs2_inplace_reserve(ip) \
41gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
42
43void gfs2_inplace_release(struct gfs2_inode *ip);
44
45unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
46
47u64 gfs2_alloc_data(struct gfs2_inode *ip);
48u64 gfs2_alloc_meta(struct gfs2_inode *ip);
49u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
50
51void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
52void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
54void gfs2_unlink_di(struct inode *inode);
55
56struct gfs2_rgrp_list {
57 unsigned int rl_rgrps;
58 unsigned int rl_space;
59 struct gfs2_rgrpd **rl_rgd;
60 struct gfs2_holder *rl_ghs;
61};
62
63void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
64 u64 block);
65void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
66 int flags);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68
69#endif /* __RGRP_DOT_H__ */
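
A hypothetical caller, to show the intended pairing of the API above. Error
handling is trimmed, and real callers also hold quota locks and open a
transaction before calling gfs2_alloc_data():

/* Sketch only: example_alloc_one() is not part of GFS2. */
static int example_alloc_one(struct gfs2_inode *ip, u64 *blkno)
{
        struct gfs2_alloc *al = gfs2_alloc_get(ip);
        int error;

        al->al_requested = 1;                   /* worst-case block count */
        error = gfs2_inplace_reserve(ip);       /* picks and locks an rgrp */
        if (error)
                goto out;

        *blkno = gfs2_alloc_data(ip);           /* consume the reservation */

        gfs2_inplace_release(ip);               /* drop rgrp + rindex locks */
out:
        gfs2_alloc_put(ip);
        return error;
}
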
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "trans.h"
34#include "util.h"
35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_ilimit = 100;
55 gt->gt_ilimit_tries = 3;
56 gt->gt_ilimit_min = 1;
57 gt->gt_demote_secs = 300;
58 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_scand_secs = 15;
62 gt->gt_recoverd_secs = 60;
63 gt->gt_logd_secs = 1;
64 gt->gt_quotad_secs = 5;
65 gt->gt_quota_simul_sync = 64;
66 gt->gt_quota_warn_period = 10;
67 gt->gt_quota_scale_num = 1;
68 gt->gt_quota_scale_den = 1;
69 gt->gt_quota_cache_secs = 300;
70 gt->gt_quota_quantum = 60;
71 gt->gt_atime_quantum = 3600;
72 gt->gt_new_files_jdata = 0;
73 gt->gt_new_files_directio = 0;
74 gt->gt_max_atomic_write = 4 << 20;
75 gt->gt_max_readahead = 1 << 18;
76 gt->gt_lockdump_size = 131072;
77 gt->gt_stall_secs = 600;
78 gt->gt_complain_secs = 10;
79 gt->gt_reclaim_limit = 5000;
80 gt->gt_entries_per_readdir = 32;
81 gt->gt_prefetch_secs = 10;
82 gt->gt_greedy_default = HZ / 10;
83 gt->gt_greedy_quantum = HZ / 40;
84 gt->gt_greedy_max = HZ / 4;
85 gt->gt_statfs_quantum = 30;
86 gt->gt_statfs_slow = 0;
87}
88
89/**
90 * gfs2_check_sb - Check superblock
91 * @sdp: the filesystem
92 * @sb: The superblock
93 * @silent: Don't print a message if the check fails
94 *
95 * Checks the version code of the FS is one that we understand how to
96 * read and that the sizes of the various on-disk structures have not
97 * changed.
98 */
99
100int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
101{
102 unsigned int x;
103
104 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
105 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
106 if (!silent)
107 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
108 return -EINVAL;
109 }
110
111 /* If format numbers match exactly, we're done. */
112
113 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
114 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
115 return 0;
116
117 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
118 for (x = 0; gfs2_old_fs_formats[x]; x++)
119 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
120 break;
121
122 if (!gfs2_old_fs_formats[x]) {
123 printk(KERN_WARNING
124 "GFS2: code version (%u, %u) is incompatible "
125 "with ondisk format (%u, %u)\n",
126 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
127 sb->sb_fs_format, sb->sb_multihost_format);
128 printk(KERN_WARNING
129 "GFS2: I don't know how to upgrade this FS\n");
130 return -EINVAL;
131 }
132 }
133
134 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
135 for (x = 0; gfs2_old_multihost_formats[x]; x++)
136 if (gfs2_old_multihost_formats[x] ==
137 sb->sb_multihost_format)
138 break;
139
140 if (!gfs2_old_multihost_formats[x]) {
141 printk(KERN_WARNING
142 "GFS2: code version (%u, %u) is incompatible "
143 "with ondisk format (%u, %u)\n",
144 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
145 sb->sb_fs_format, sb->sb_multihost_format);
146 printk(KERN_WARNING
147 "GFS2: I don't know how to upgrade this FS\n");
148 return -EINVAL;
149 }
150 }
151
152 if (!sdp->sd_args.ar_upgrade) {
153 printk(KERN_WARNING
154 "GFS2: code version (%u, %u) is incompatible "
155 "with ondisk format (%u, %u)\n",
156 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
157 sb->sb_fs_format, sb->sb_multihost_format);
158 printk(KERN_INFO
159 "GFS2: Use the \"upgrade\" mount option to upgrade "
160 "the FS\n");
161 printk(KERN_INFO "GFS2: See the manual for more details\n");
162 return -EINVAL;
163 }
164
165 return 0;
166}
167
168
169static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
170{
171 struct page *page = bio->bi_private;
172 if (bio->bi_size)
173 return 1;
174
175 if (!error)
176 SetPageUptodate(page);
177 else
178 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
179 unlock_page(page);
180 return 0;
181}
182
183struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
184{
185 struct page *page;
186 struct bio *bio;
187
188 page = alloc_page(GFP_KERNEL);
189 if (unlikely(!page))
190 return NULL;
191
192 ClearPageUptodate(page);
193 ClearPageDirty(page);
194 lock_page(page);
195
196 bio = bio_alloc(GFP_KERNEL, 1);
197 if (unlikely(!bio)) {
198 __free_page(page);
199 return NULL;
200 }
201
202 bio->bi_sector = sector;
203 bio->bi_bdev = sb->s_bdev;
204 bio_add_page(bio, page, PAGE_SIZE, 0);
205
206 bio->bi_end_io = end_bio_io_page;
207 bio->bi_private = page;
208 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
209 wait_on_page_locked(page);
210 bio_put(bio);
211 if (!PageUptodate(page)) {
212 __free_page(page);
213 return NULL;
214 }
215 return page;
216}
217
218/**
219 * gfs2_read_sb - Read super block
220 * @sdp: The GFS2 superblock
221 * @gl: the glock for the superblock (assumed to be held)
222 * @silent: Don't print message if mount fails
223 *
224 */
225
226int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
227{
228 u32 hash_blocks, ind_blocks, leaf_blocks;
229 u32 tmp_blocks;
230 unsigned int x;
231 int error;
232 struct page *page;
233 char *sb;
234
235 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
236 if (!page) {
237 if (!silent)
238 fs_err(sdp, "can't read superblock\n");
239 return -EIO;
240 }
241 sb = kmap(page);
242 gfs2_sb_in(&sdp->sd_sb, sb);
243 kunmap(page);
244 __free_page(page);
245
246 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
247 if (error)
248 return error;
249
250 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
251 GFS2_BASIC_BLOCK_SHIFT;
252 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
253 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
254 sizeof(struct gfs2_dinode)) / sizeof(u64);
255 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
256 sizeof(struct gfs2_meta_header)) / sizeof(u64);
257 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
258 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
259 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
260 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
261 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
262 sizeof(struct gfs2_meta_header)) /
263 sizeof(struct gfs2_quota_change);
264
265	/* Compute maximum reservation required to add an entry to a directory */
266
267 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
268 sdp->sd_jbsize);
269
270 ind_blocks = 0;
271 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
272 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
273 ind_blocks += tmp_blocks;
274 }
275
276 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
277
278 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
279
280 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
281 sizeof(struct gfs2_dinode);
282 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
283 for (x = 2;; x++) {
284 u64 space, d;
285 u32 m;
286
287 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
288 d = space;
289 m = do_div(d, sdp->sd_inptrs);
290
291 if (d != sdp->sd_heightsize[x - 1] || m)
292 break;
293 sdp->sd_heightsize[x] = space;
294 }
295 sdp->sd_max_height = x;
296 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
297
298 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
299 sizeof(struct gfs2_dinode);
300 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
301 for (x = 2;; x++) {
302 u64 space, d;
303 u32 m;
304
305 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
306 d = space;
307 m = do_div(d, sdp->sd_inptrs);
308
309 if (d != sdp->sd_jheightsize[x - 1] || m)
310 break;
311 sdp->sd_jheightsize[x] = space;
312 }
313 sdp->sd_max_jheight = x;
314 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
315
316 return 0;
317}
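
The two loops above size the metadata tree by repeated multiplication until a
u64 would overflow. A standalone sketch of the same calculation; the on-disk
struct sizes are illustrative assumptions for a 4096-byte block, not values
taken from this code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bsize = 4096;
        uint64_t dinode_sz = 232;       /* assumed dinode header size */
        uint64_t mh_sz = 24;            /* assumed meta header size */
        uint64_t diptrs = (bsize - dinode_sz) / 8;  /* ptrs in the dinode */
        uint64_t inptrs = (bsize - mh_sz) / 8;      /* ptrs per indirect */
        uint64_t size = bsize * diptrs;             /* bytes at height 1 */
        unsigned int height = 1;

        printf("height %u: %llu bytes\n", height, (unsigned long long)size);
        while (size <= UINT64_MAX / inptrs) {   /* same overflow-test idea */
                size *= inptrs;
                printf("height %u: %llu bytes\n", ++height,
                       (unsigned long long)size);
        }
        return 0;
}
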
318
319/**
320 * gfs2_jindex_hold - Grab a lock on the jindex
321 * @sdp: The GFS2 superblock
322 * @ji_gh: the holder for the jindex glock
323 *
324 * This is very similar to the gfs2_rindex_hold() function, except that
325 * in general we hold the jindex lock for longer periods of time and
326 * grab it far less frequently than the rgrp lock.
327 *
328 * Returns: errno
329 */
330
331int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
332{
333 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
334 struct qstr name;
335 char buf[20];
336 struct gfs2_jdesc *jd;
337 int error;
338
339 name.name = buf;
340
341 mutex_lock(&sdp->sd_jindex_mutex);
342
343 for (;;) {
344 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
345 GL_LOCAL_EXCL, ji_gh);
346 if (error)
347 break;
348
349 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
350 name.hash = gfs2_disk_hash(name.name, name.len);
351
352 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
353 if (error == -ENOENT) {
354 error = 0;
355 break;
356 }
357
358 gfs2_glock_dq_uninit(ji_gh);
359
360 if (error)
361 break;
362
363 error = -ENOMEM;
364 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
365 if (!jd)
366 break;
367
368 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
369 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
370 if (!jd->jd_inode)
371 error = -ENOENT;
372 else
373 error = PTR_ERR(jd->jd_inode);
374 kfree(jd);
375 break;
376 }
377
378 spin_lock(&sdp->sd_jindex_spin);
379 jd->jd_jid = sdp->sd_journals++;
380 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
381 spin_unlock(&sdp->sd_jindex_spin);
382 }
383
384 mutex_unlock(&sdp->sd_jindex_mutex);
385
386 return error;
387}
388
389/**
390 * gfs2_jindex_free - Clear all the journal index information
391 * @sdp: The GFS2 superblock
392 *
393 */
394
395void gfs2_jindex_free(struct gfs2_sbd *sdp)
396{
397 struct list_head list;
398 struct gfs2_jdesc *jd;
399
400 spin_lock(&sdp->sd_jindex_spin);
401 list_add(&list, &sdp->sd_jindex_list);
402 list_del_init(&sdp->sd_jindex_list);
403 sdp->sd_journals = 0;
404 spin_unlock(&sdp->sd_jindex_spin);
405
406 while (!list_empty(&list)) {
407 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
408 list_del(&jd->jd_list);
409 iput(jd->jd_inode);
410 kfree(jd);
411 }
412}
413
414static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
415{
416 struct gfs2_jdesc *jd;
417 int found = 0;
418
419 list_for_each_entry(jd, head, jd_list) {
420 if (jd->jd_jid == jid) {
421 found = 1;
422 break;
423 }
424 }
425
426 if (!found)
427 jd = NULL;
428
429 return jd;
430}
431
432struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
433{
434 struct gfs2_jdesc *jd;
435
436 spin_lock(&sdp->sd_jindex_spin);
437 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
438 spin_unlock(&sdp->sd_jindex_spin);
439
440 return jd;
441}
442
443void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
444{
445 struct gfs2_jdesc *jd;
446
447 spin_lock(&sdp->sd_jindex_spin);
448 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
449 if (jd)
450 jd->jd_dirty = 1;
451 spin_unlock(&sdp->sd_jindex_spin);
452}
453
454struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
455{
456 struct gfs2_jdesc *jd;
457 int found = 0;
458
459 spin_lock(&sdp->sd_jindex_spin);
460
461 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
462 if (jd->jd_dirty) {
463 jd->jd_dirty = 0;
464 found = 1;
465 break;
466 }
467 }
468 spin_unlock(&sdp->sd_jindex_spin);
469
470 if (!found)
471 jd = NULL;
472
473 return jd;
474}
475
476int gfs2_jdesc_check(struct gfs2_jdesc *jd)
477{
478 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
479 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
480 int ar;
481 int error;
482
483 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
484 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
485 gfs2_consist_inode(ip);
486 return -EIO;
487 }
488 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
489
490 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
491 if (!error && ar) {
492 gfs2_consist_inode(ip);
493 error = -EIO;
494 }
495
496 return error;
497}
498
499/**
500 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
501 * @sdp: the filesystem
502 *
503 * Returns: errno
504 */
505
506int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
507{
508 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
509 struct gfs2_glock *j_gl = ip->i_gl;
510 struct gfs2_holder t_gh;
511 struct gfs2_log_header head;
512 int error;
513
514 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
515 GL_LOCAL_EXCL, &t_gh);
516 if (error)
517 return error;
518
519 gfs2_meta_cache_flush(ip);
520 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
521
522 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
523 if (error)
524 goto fail;
525
526 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
527 gfs2_consist(sdp);
528 error = -EIO;
529 goto fail;
530 }
531
532	/* Initialize the log head from what was found in the journal */
533 sdp->sd_log_sequence = head.lh_sequence + 1;
534 gfs2_log_pointers_init(sdp, head.lh_blkno);
535
536 error = gfs2_quota_init(sdp);
537 if (error)
538 goto fail;
539
540 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
541
542 gfs2_glock_dq_uninit(&t_gh);
543
544 return 0;
545
546fail:
547 t_gh.gh_flags |= GL_NOCACHE;
548 gfs2_glock_dq_uninit(&t_gh);
549
550 return error;
551}
552
553/**
554 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
555 * @sdp: the filesystem
556 *
557 * Returns: errno
558 */
559
560int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
561{
562 struct gfs2_holder t_gh;
563 int error;
564
565 gfs2_quota_sync(sdp);
566 gfs2_statfs_sync(sdp);
567
568 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
569 GL_LOCAL_EXCL | GL_NOCACHE,
570 &t_gh);
571 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
572 return error;
573
574 gfs2_meta_syncfs(sdp);
575 gfs2_log_shutdown(sdp);
576
577 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
578
579 if (t_gh.gh_gl)
580 gfs2_glock_dq_uninit(&t_gh);
581
582 gfs2_quota_cleanup(sdp);
583
584 return error;
585}
586
587int gfs2_statfs_init(struct gfs2_sbd *sdp)
588{
589 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
590 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
591 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
592 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
593 struct buffer_head *m_bh, *l_bh;
594 struct gfs2_holder gh;
595 int error;
596
597 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
598 &gh);
599 if (error)
600 return error;
601
602 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
603 if (error)
604 goto out;
605
606 if (sdp->sd_args.ar_spectator) {
607 spin_lock(&sdp->sd_statfs_spin);
608 gfs2_statfs_change_in(m_sc, m_bh->b_data +
609 sizeof(struct gfs2_dinode));
610 spin_unlock(&sdp->sd_statfs_spin);
611 } else {
612 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
613 if (error)
614 goto out_m_bh;
615
616 spin_lock(&sdp->sd_statfs_spin);
617 gfs2_statfs_change_in(m_sc, m_bh->b_data +
618 sizeof(struct gfs2_dinode));
619 gfs2_statfs_change_in(l_sc, l_bh->b_data +
620 sizeof(struct gfs2_dinode));
621 spin_unlock(&sdp->sd_statfs_spin);
622
623 brelse(l_bh);
624 }
625
626out_m_bh:
627 brelse(m_bh);
628out:
629 gfs2_glock_dq_uninit(&gh);
630	return error;
631}
632
633void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
634 s64 dinodes)
635{
636 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
637 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
638 struct buffer_head *l_bh;
639 int error;
640
641 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
642 if (error)
643 return;
644
645 mutex_lock(&sdp->sd_statfs_mutex);
646 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
647 mutex_unlock(&sdp->sd_statfs_mutex);
648
649 spin_lock(&sdp->sd_statfs_spin);
650 l_sc->sc_total += total;
651 l_sc->sc_free += free;
652 l_sc->sc_dinodes += dinodes;
653 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
654 spin_unlock(&sdp->sd_statfs_spin);
655
656 brelse(l_bh);
657}
658
659int gfs2_statfs_sync(struct gfs2_sbd *sdp)
660{
661 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
662 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
663 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
664 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
665 struct gfs2_holder gh;
666 struct buffer_head *m_bh, *l_bh;
667 int error;
668
669 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
670 &gh);
671 if (error)
672 return error;
673
674 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
675 if (error)
676 goto out;
677
678 spin_lock(&sdp->sd_statfs_spin);
679 gfs2_statfs_change_in(m_sc, m_bh->b_data +
680 sizeof(struct gfs2_dinode));
681 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
682 spin_unlock(&sdp->sd_statfs_spin);
683 goto out_bh;
684 }
685 spin_unlock(&sdp->sd_statfs_spin);
686
687 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
688 if (error)
689 goto out_bh;
690
691 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
692 if (error)
693 goto out_bh2;
694
695 mutex_lock(&sdp->sd_statfs_mutex);
696 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
697 mutex_unlock(&sdp->sd_statfs_mutex);
698
699 spin_lock(&sdp->sd_statfs_spin);
700 m_sc->sc_total += l_sc->sc_total;
701 m_sc->sc_free += l_sc->sc_free;
702 m_sc->sc_dinodes += l_sc->sc_dinodes;
703 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
704 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
705 0, sizeof(struct gfs2_statfs_change));
706 spin_unlock(&sdp->sd_statfs_spin);
707
708 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
709 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
710
711 gfs2_trans_end(sdp);
712
713out_bh2:
714 brelse(l_bh);
715out_bh:
716 brelse(m_bh);
717out:
718 gfs2_glock_dq_uninit(&gh);
719 return error;
720}
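
gfs2_statfs_change() and gfs2_statfs_sync() split statfs into a per-node
delta file and a shared master: changes land in the local record under cheap
local locking, and the deltas are folded into the master only when the
expensive exclusive (cluster-wide) lock is taken anyway. A userspace sketch
of that split (pthreads; names hypothetical):

#include <pthread.h>

struct counts {
        long total, free, dinodes;
};

static struct counts master;            /* shared, cluster-wide lock */
static struct counts local_delta;       /* this node only */
static pthread_mutex_t master_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t local_lock = PTHREAD_MUTEX_INITIALIZER;

static void statfs_change(long total, long free, long dinodes)
{
        pthread_mutex_lock(&local_lock);        /* cheap: local only */
        local_delta.total += total;
        local_delta.free += free;
        local_delta.dinodes += dinodes;
        pthread_mutex_unlock(&local_lock);
}

static void statfs_sync(void)
{
        struct counts zero = { 0, 0, 0 };

        pthread_mutex_lock(&master_lock);       /* expensive: shared */
        pthread_mutex_lock(&local_lock);
        master.total += local_delta.total;
        master.free += local_delta.free;
        master.dinodes += local_delta.dinodes;
        local_delta = zero;                     /* deltas now in master */
        pthread_mutex_unlock(&local_lock);
        pthread_mutex_unlock(&master_lock);
}
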
721
722/**
723 * gfs2_statfs_i - Do a statfs
724 * @sdp: the filesystem
725 * @sc: the statfs structure to fill in
726 *
727 * Returns: errno
728 */
729
730int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
731{
732 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
733 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
734
735 spin_lock(&sdp->sd_statfs_spin);
736
737 *sc = *m_sc;
738 sc->sc_total += l_sc->sc_total;
739 sc->sc_free += l_sc->sc_free;
740 sc->sc_dinodes += l_sc->sc_dinodes;
741
742 spin_unlock(&sdp->sd_statfs_spin);
743
744 if (sc->sc_free < 0)
745 sc->sc_free = 0;
746 if (sc->sc_free > sc->sc_total)
747 sc->sc_free = sc->sc_total;
748 if (sc->sc_dinodes < 0)
749 sc->sc_dinodes = 0;
750
751 return 0;
752}
753
754/**
755 * statfs_slow_fill - fill in the sc for a given RG
756 * @rgd: the RG
757 * @sc: the sc structure
758 *
759 * Returns: 0 on success
760 */
761
762static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
763 struct gfs2_statfs_change *sc)
764{
765 gfs2_rgrp_verify(rgd);
766 sc->sc_total += rgd->rd_ri.ri_data;
767 sc->sc_free += rgd->rd_rg.rg_free;
768 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
769 return 0;
770}
771
772/**
773 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
774 * @sdp: the filesystem
775 * @sc: the sc info that will be returned
776 *
777 * Any error (other than a signal) will cause this routine to fall back
778 * to the synchronous version.
779 *
780 * FIXME: This really shouldn't busy wait like this.
781 *
782 * Returns: errno
783 */
784
785int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
786{
787 struct gfs2_holder ri_gh;
788 struct gfs2_rgrpd *rgd_next;
789 struct gfs2_holder *gha, *gh;
790 unsigned int slots = 64;
791 unsigned int x;
792 int done;
793 int error = 0, err;
794
795 memset(sc, 0, sizeof(struct gfs2_statfs_change));
796 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
797 if (!gha)
798 return -ENOMEM;
799
800 error = gfs2_rindex_hold(sdp, &ri_gh);
801 if (error)
802 goto out;
803
804 rgd_next = gfs2_rgrpd_get_first(sdp);
805
806 for (;;) {
807 done = 1;
808
809 for (x = 0; x < slots; x++) {
810 gh = gha + x;
811
812 if (gh->gh_gl && gfs2_glock_poll(gh)) {
813 err = gfs2_glock_wait(gh);
814 if (err) {
815 gfs2_holder_uninit(gh);
816 error = err;
817 } else {
818 if (!error)
819 error = statfs_slow_fill(
820 gh->gh_gl->gl_object, sc);
821 gfs2_glock_dq_uninit(gh);
822 }
823 }
824
825 if (gh->gh_gl)
826 done = 0;
827 else if (rgd_next && !error) {
828 error = gfs2_glock_nq_init(rgd_next->rd_gl,
829 LM_ST_SHARED,
830 GL_ASYNC,
831 gh);
832 rgd_next = gfs2_rgrpd_get_next(rgd_next);
833 done = 0;
834 }
835
836 if (signal_pending(current))
837 error = -ERESTARTSYS;
838 }
839
840 if (done)
841 break;
842
843 yield();
844 }
845
846 gfs2_glock_dq_uninit(&ri_gh);
847
848out:
849 kfree(gha);
850 return error;
851}
852
853struct lfcc {
854 struct list_head list;
855 struct gfs2_holder gh;
856};
857
858/**
859 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
860 * journals are clean
861 * @sdp: the file system
862 * @t_gh: the hold on the transaction lock
863 *
864 * The transaction lock is taken in the deferred state.
865 * Returns: errno
866 */
867
868static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
869 struct gfs2_holder *t_gh)
870{
871 struct gfs2_inode *ip;
872 struct gfs2_holder ji_gh;
873 struct gfs2_jdesc *jd;
874 struct lfcc *lfcc;
875 LIST_HEAD(list);
876 struct gfs2_log_header lh;
877 int error;
878
879 error = gfs2_jindex_hold(sdp, &ji_gh);
880 if (error)
881 return error;
882
883 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
884 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
885 if (!lfcc) {
886 error = -ENOMEM;
887 goto out;
888 }
889 ip = GFS2_I(jd->jd_inode);
890 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
891 if (error) {
892 kfree(lfcc);
893 goto out;
894 }
895 list_add(&lfcc->list, &list);
896 }
897
898 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
899 LM_FLAG_PRIORITY | GL_NOCACHE,
900 t_gh);
901
902 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
903 error = gfs2_jdesc_check(jd);
904 if (error)
905 break;
906 error = gfs2_find_jhead(jd, &lh);
907 if (error)
908 break;
909 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
910 error = -EBUSY;
911 break;
912 }
913 }
914
915 if (error)
916 gfs2_glock_dq_uninit(t_gh);
917
918out:
919 while (!list_empty(&list)) {
920 lfcc = list_entry(list.next, struct lfcc, list);
921 list_del(&lfcc->list);
922 gfs2_glock_dq_uninit(&lfcc->gh);
923 kfree(lfcc);
924 }
925 gfs2_glock_dq_uninit(&ji_gh);
926 return error;
927}
928
929/**
930 * gfs2_freeze_fs - freezes the file system
931 * @sdp: the file system
932 *
933 * This function flushes data and metadata for all machines by
934 * acquiring the transaction log exclusively. All journals are
935 * ensured to be in a clean state as well.
936 *
937 * Returns: errno
938 */
939
940int gfs2_freeze_fs(struct gfs2_sbd *sdp)
941{
942 int error = 0;
943
944 mutex_lock(&sdp->sd_freeze_lock);
945
946 if (!sdp->sd_freeze_count++) {
947 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
948 if (error)
949 sdp->sd_freeze_count--;
950 }
951
952 mutex_unlock(&sdp->sd_freeze_lock);
953
954 return error;
955}
956
957/**
958 * gfs2_unfreeze_fs - unfreezes the file system
959 * @sdp: the file system
960 *
961 * This function allows the file system to proceed by unlocking
962 * the exclusively held transaction lock. Other GFS2 nodes are
963 * now free to acquire the lock shared and go on with their lives.
964 *
965 */
966
967void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
968{
969 mutex_lock(&sdp->sd_freeze_lock);
970
971 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
972 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
973
974 mutex_unlock(&sdp->sd_freeze_lock);
975}
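
The freeze count above in a standalone sketch: nested freeze requests stack,
and only the 0 -> 1 transition (or the matching final unfreeze) does the real
work. The do_freeze()/do_unfreeze() callbacks stand in for
gfs2_lock_fs_check_clean() and the glock release:

#include <pthread.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int freeze_count;

static int freeze(int (*do_freeze)(void))
{
        int error = 0;

        pthread_mutex_lock(&freeze_lock);
        if (!freeze_count++) {
                error = do_freeze();
                if (error)
                        freeze_count--;         /* undo on failure */
        }
        pthread_mutex_unlock(&freeze_lock);
        return error;
}

static void unfreeze(void (*do_unfreeze)(void))
{
        pthread_mutex_lock(&freeze_lock);
        if (freeze_count && !--freeze_count)
                do_unfreeze();
        pthread_mutex_unlock(&freeze_lock);
}
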
976
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13#include "incore.h"
14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
20
21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
22{
23 unsigned int x;
24 spin_lock(&sdp->sd_jindex_spin);
25 x = sdp->sd_journals;
26 spin_unlock(&sdp->sd_jindex_spin);
27 return x;
28}
29
30int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
31void gfs2_jindex_free(struct gfs2_sbd *sdp);
32
33struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
34void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
35struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
36int gfs2_jdesc_check(struct gfs2_jdesc *jd);
37
38int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
39 struct gfs2_inode **ipp);
40
41int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
42int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
43
44int gfs2_statfs_init(struct gfs2_sbd *sdp);
45void gfs2_statfs_change(struct gfs2_sbd *sdp,
46 s64 total, s64 free, s64 dinodes);
47int gfs2_statfs_sync(struct gfs2_sbd *sdp);
48int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
49int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
50
51int gfs2_freeze_fs(struct gfs2_sbd *sdp);
52void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
53
54#endif /* __SUPER_DOT_H__ */
55
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/uaccess.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return snprintf(buf, PAGE_SIZE, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75	fs_warn(sdp, "freeze %d error %d\n", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return snprintf(buf, PAGE_SIZE, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 u32 id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 u32 id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2"},
227 .ktype = &gfs2_ktype,
228};
229
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL,
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return snprintf(buf, PAGE_SIZE, "%d\n",
294 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
295}
296static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
297
298static struct attribute *args_attrs[] = {
299 &args_attr_lockproto.attr,
300 &args_attr_locktable.attr,
301 &args_attr_hostdata.attr,
302 &args_attr_spectator.attr,
303 &args_attr_ignore_local_fs.attr,
304 &args_attr_localcaching.attr,
305 &args_attr_localflocks.attr,
306 &args_attr_debug.attr,
307 &args_attr_upgrade.attr,
308 &args_attr_num_glockd.attr,
309 &args_attr_posix_acl.attr,
310 &args_attr_quota.attr,
311 &args_attr_suiddir.attr,
312 &args_attr_data.attr,
313 &args_attr_noatime.attr,
314 NULL,
315};
316
317/*
318 * display counters from superblock
319 */
320
321struct counters_attr {
322 struct attribute attr;
323 ssize_t (*show)(struct gfs2_sbd *, char *);
324};
325
326#define COUNTERS_ATTR(name, fmt) \
327static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
328{ \
329 return snprintf(buf, PAGE_SIZE, fmt, \
330 (unsigned int)atomic_read(&sdp->sd_##name)); \
331} \
332static struct counters_attr counters_attr_##name = __ATTR_RO(name)
333
334COUNTERS_ATTR(glock_count, "%u\n");
335COUNTERS_ATTR(glock_held_count, "%u\n");
336COUNTERS_ATTR(inode_count, "%u\n");
337COUNTERS_ATTR(reclaimed, "%u\n");
338
339static struct attribute *counters_attrs[] = {
340 &counters_attr_glock_count.attr,
341 &counters_attr_glock_held_count.attr,
342 &counters_attr_inode_count.attr,
343 &counters_attr_reclaimed.attr,
344 NULL,
345};
346
347/*
348 * get and set struct gfs2_tune fields
349 */
350
351static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
352{
353 return snprintf(buf, PAGE_SIZE, "%u %u\n",
354 sdp->sd_tune.gt_quota_scale_num,
355 sdp->sd_tune.gt_quota_scale_den);
356}
357
358static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
359 size_t len)
360{
361 struct gfs2_tune *gt = &sdp->sd_tune;
362 unsigned int x, y;
363
364 if (!capable(CAP_SYS_ADMIN))
365 return -EACCES;
366
367 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
368 return -EINVAL;
369
370 spin_lock(&gt->gt_spin);
371 gt->gt_quota_scale_num = x;
372 gt->gt_quota_scale_den = y;
373 spin_unlock(&gt->gt_spin);
374 return len;
375}
376
377static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
378 int check_zero, const char *buf, size_t len)
379{
380 struct gfs2_tune *gt = &sdp->sd_tune;
381 unsigned int x;
382
383 if (!capable(CAP_SYS_ADMIN))
384 return -EACCES;
385
386 x = simple_strtoul(buf, NULL, 0);
387
388 if (check_zero && !x)
389 return -EINVAL;
390
391 spin_lock(&gt->gt_spin);
392 *field = x;
393 spin_unlock(&gt->gt_spin);
394 return len;
395}
396
397struct tune_attr {
398 struct attribute attr;
399 ssize_t (*show)(struct gfs2_sbd *, char *);
400 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
401};
402
403#define TUNE_ATTR_3(name, show, store) \
404static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
405
406#define TUNE_ATTR_2(name, store) \
407static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
408{ \
409 return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \
410} \
411TUNE_ATTR_3(name, name##_show, store)
412
413#define TUNE_ATTR(name, check_zero) \
414static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
415{ \
416 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
417} \
418TUNE_ATTR_2(name, name##_store)
419
420#define TUNE_ATTR_DAEMON(name, process) \
421static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
422{ \
423 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
424 wake_up_process(sdp->sd_##process); \
425 return r; \
426} \
427TUNE_ATTR_2(name, name##_store)
428
429TUNE_ATTR(ilimit, 0);
430TUNE_ATTR(ilimit_tries, 0);
431TUNE_ATTR(ilimit_min, 0);
432TUNE_ATTR(demote_secs, 0);
433TUNE_ATTR(incore_log_blocks, 0);
434TUNE_ATTR(log_flush_secs, 0);
435TUNE_ATTR(jindex_refresh_secs, 0);
436TUNE_ATTR(quota_warn_period, 0);
437TUNE_ATTR(quota_quantum, 0);
438TUNE_ATTR(atime_quantum, 0);
439TUNE_ATTR(max_readahead, 0);
440TUNE_ATTR(complain_secs, 0);
441TUNE_ATTR(reclaim_limit, 0);
442TUNE_ATTR(prefetch_secs, 0);
443TUNE_ATTR(statfs_slow, 0);
444TUNE_ATTR(new_files_jdata, 0);
445TUNE_ATTR(new_files_directio, 0);
446TUNE_ATTR(quota_simul_sync, 1);
447TUNE_ATTR(quota_cache_secs, 1);
448TUNE_ATTR(max_atomic_write, 1);
449TUNE_ATTR(stall_secs, 1);
450TUNE_ATTR(entries_per_readdir, 1);
451TUNE_ATTR(greedy_default, 1);
452TUNE_ATTR(greedy_quantum, 1);
453TUNE_ATTR(greedy_max, 1);
454TUNE_ATTR(statfs_quantum, 1);
455TUNE_ATTR_DAEMON(scand_secs, scand_process);
456TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
457TUNE_ATTR_DAEMON(logd_secs, logd_process);
458TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
459TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
460
461static struct attribute *tune_attrs[] = {
462 &tune_attr_ilimit.attr,
463 &tune_attr_ilimit_tries.attr,
464 &tune_attr_ilimit_min.attr,
465 &tune_attr_demote_secs.attr,
466 &tune_attr_incore_log_blocks.attr,
467 &tune_attr_log_flush_secs.attr,
468 &tune_attr_jindex_refresh_secs.attr,
469 &tune_attr_quota_warn_period.attr,
470 &tune_attr_quota_quantum.attr,
471 &tune_attr_atime_quantum.attr,
472 &tune_attr_max_readahead.attr,
473 &tune_attr_complain_secs.attr,
474 &tune_attr_reclaim_limit.attr,
475 &tune_attr_prefetch_secs.attr,
476 &tune_attr_statfs_slow.attr,
477 &tune_attr_quota_simul_sync.attr,
478 &tune_attr_quota_cache_secs.attr,
479 &tune_attr_max_atomic_write.attr,
480 &tune_attr_stall_secs.attr,
481 &tune_attr_entries_per_readdir.attr,
482 &tune_attr_greedy_default.attr,
483 &tune_attr_greedy_quantum.attr,
484 &tune_attr_greedy_max.attr,
485 &tune_attr_statfs_quantum.attr,
486 &tune_attr_scand_secs.attr,
487 &tune_attr_recoverd_secs.attr,
488 &tune_attr_logd_secs.attr,
489 &tune_attr_quotad_secs.attr,
490 &tune_attr_quota_scale.attr,
491 &tune_attr_new_files_jdata.attr,
492 &tune_attr_new_files_directio.attr,
493 NULL,
494};
495
496static struct attribute_group lockstruct_group = {
497 .name = "lockstruct",
498 .attrs = lockstruct_attrs,
499};
500
501static struct attribute_group counters_group = {
502 .name = "counters",
503 .attrs = counters_attrs,
504};
505
506static struct attribute_group args_group = {
507 .name = "args",
508 .attrs = args_attrs,
509};
510
511static struct attribute_group tune_group = {
512 .name = "tune",
513 .attrs = tune_attrs,
514};
515
516int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
517{
518 int error;
519
520 sdp->sd_kobj.kset = &gfs2_kset;
521 sdp->sd_kobj.ktype = &gfs2_ktype;
522
523 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
524 if (error)
525 goto fail;
526
527 error = kobject_register(&sdp->sd_kobj);
528 if (error)
529 goto fail;
530
531 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
532 if (error)
533 goto fail_reg;
534
535 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
536 if (error)
537 goto fail_lockstruct;
538
539 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
540 if (error)
541 goto fail_counters;
542
543 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
544 if (error)
545 goto fail_args;
546
547 return 0;
548
549fail_args:
550 sysfs_remove_group(&sdp->sd_kobj, &args_group);
551fail_counters:
552 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
553fail_lockstruct:
554 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
555fail_reg:
556 kobject_unregister(&sdp->sd_kobj);
557fail:
558 fs_err(sdp, "error %d adding sysfs files\n", error);
559 return error;
560}
561
562void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
563{
564 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
565 sysfs_remove_group(&sdp->sd_kobj, &args_group);
566 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
567 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
568 kobject_unregister(&sdp->sd_kobj);
569}
570
571int gfs2_sys_init(void)
572{
573 gfs2_sys_margs = NULL;
574 spin_lock_init(&gfs2_sys_margs_lock);
575 return kset_register(&gfs2_kset);
576}
577
578void gfs2_sys_uninit(void)
579{
580 kfree(gfs2_sys_margs);
581 kset_unregister(&gfs2_kset);
582}
583
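[Editorial sketch, not part of the merge.] The sysfs plumbing above is the stock kobject attribute-dispatch pattern: each gfs2_attr embeds a struct attribute, the superblock embeds its kobject (sd_kobj), and the gfs2_attr_show()/gfs2_attr_store() trampolines use container_of() to climb back to the enclosing objects. With the kset registered under fs_subsys as "gfs2" and the kobject named after sd_table_name, the files surface as /sys/fs/gfs2/<table_name>/{id,fsname,freeze,...}. A minimal stand-alone illustration of that dispatch, using hypothetical demo_* names and plain userspace C:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_attr {
	const char *name;
};

struct demo_sbd {
	unsigned int freeze_count;
	struct demo_attr kobj_attr;	/* stands in for sd_kobj + gfs2_attr */
};

/* Trampoline in the style of gfs2_attr_show(): climb from the embedded
 * member back to the enclosing object, then format the answer. */
static int demo_show(struct demo_attr *attr, char *buf, size_t len)
{
	struct demo_sbd *sdp = container_of(attr, struct demo_sbd, kobj_attr);
	return snprintf(buf, len, "%u\n", sdp->freeze_count);
}

int main(void)
{
	struct demo_sbd sdp = { .freeze_count = 1, .kobj_attr = { "freeze" } };
	char buf[16];

	demo_show(&sdp.kobj_attr, buf, sizeof(buf));
	fputs(buf, stdout);	/* prints "1" */
	return 0;
}

From the shell the real interface behaves accordingly: reading freeze returns sd_freeze_count, and writing 1 to it (root only, per the capable(CAP_SYS_ADMIN) check) calls gfs2_freeze_fs().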
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13#include <linux/spinlock.h>
14struct gfs2_sbd;
15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22
23int gfs2_sys_init(void);
24void gfs2_sys_uninit(void);
25
26#endif /* __SYS_DOT_H__ */
27
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(u64));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u\n",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u\n",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
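[Editorial sketch, not part of the merge.] Taken together, the API above gives the usual transaction bracket: reserve log space with gfs2_trans_begin() (sized via the RES_* constants declared in trans.h below), pin each buffer with gfs2_trans_add_bh() before touching it, then commit with gfs2_trans_end(). A schematic kernel-side caller, assuming a hypothetical update (demo_update) that dirties one dinode block and one resource-group bitmap block:

/* Hypothetical caller; assumes the declarations above and a buffer_head
 * for the dinode already held by the caller. */
static int demo_update(struct gfs2_inode *ip, struct buffer_head *dibh)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	/* Reserve journal space: RES_DINODE + RES_RG_BIT blocks, no revokes. */
	error = gfs2_trans_begin(sdp, RES_DINODE + RES_RG_BIT, 0);
	if (error)
		return error;

	/* Mark the buffer as journaled metadata before modifying it. */
	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	/* ... modify dibh->b_data here ... */

	gfs2_trans_end(sdp);
	return 0;
}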
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#include <linux/buffer_head.h>
14struct gfs2_sbd;
15struct gfs2_rgrpd;
16struct gfs2_glock;
17
18#define RES_DINODE 1
19#define RES_INDIRECT 1
20#define RES_JDATA 1
21#define RES_DATA 1
22#define RES_LEAF 1
23#define RES_RG_BIT 2
24#define RES_EATTR 1
25#define RES_STATFS 1
26#define RES_QUOTA 2
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes);
30
31void gfs2_trans_end(struct gfs2_sbd *sdp);
32
33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
38
39#endif /* __TRANS_DOT_H__ */
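[Editorial note.] gfs2_trans_begin() above turns these constants into a log reservation of 1 + (6 + blocks) when blocks is nonzero, plus whatever gfs2_struct2blk() computes for the revoke records. For example, with blocks = RES_DINODE + RES_RG_BIT = 3 and no revokes, the reservation is 1 + 6 + 3 = 10 journal blocks.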
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 u16 type, u16 t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
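[Editorial sketch, not part of the merge.] gfs2_icbit_munge() addresses one bit inside an array of page-sized bitmaps by peeling the index apart in three steps: page (c), byte within the page (o), then bit within the byte (b). A worked example of that arithmetic, assuming a 4096-byte page (plain userspace C):

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL	/* assumption for the example */

int main(void)
{
	unsigned long bit = 70000, b = bit;
	unsigned long c, o;

	c = b / (8 * DEMO_PAGE_SIZE);	/* which page: 70000 / 32768 = 2 */
	b %= 8 * DEMO_PAGE_SIZE;	/* bit within the page: 4464     */
	o = b / 8;			/* byte within the page: 558     */
	b %= 8;				/* bit within the byte: 0        */

	/* prints: bit 70000 -> bitmap[2][558], mask 1<<0 */
	printf("bit %lu -> bitmap[%lu][%lu], mask 1<<%lu\n", bit, c, o, b);
	return 0;
}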
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13#include "incore.h"
14
15#define fs_printk(level, fs, fmt, arg...) \
16 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
17
18#define fs_info(fs, fmt, arg...) \
19 fs_printk(KERN_INFO , fs , fmt , ## arg)
20
21#define fs_warn(fs, fmt, arg...) \
22 fs_printk(KERN_WARNING , fs , fmt , ## arg)
23
24#define fs_err(fs, fmt, arg...) \
25 fs_printk(KERN_ERR, fs , fmt , ## arg)
26
27
28void gfs2_assert_i(struct gfs2_sbd *sdp);
29
30#define gfs2_assert(sdp, assertion) \
31do { \
32 if (unlikely(!(assertion))) { \
33 gfs2_assert_i(sdp); \
34 BUG(); \
35 } \
36} while (0)
37
38
39int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
40 const char *function, char *file, unsigned int line);
41
42#define gfs2_assert_withdraw(sdp, assertion) \
43((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
44 __FUNCTION__, __FILE__, __LINE__))
45
46
47int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
48 const char *function, char *file, unsigned int line);
49
50#define gfs2_assert_warn(sdp, assertion) \
51((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
52 __FUNCTION__, __FILE__, __LINE__))
53
54
55int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
56 const char *function, char *file, unsigned int line);
57
58#define gfs2_consist(sdp) \
59gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
60
61
62int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
63 const char *function, char *file, unsigned int line);
64
65#define gfs2_consist_inode(ip) \
66gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
67
68
69int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
70 const char *function, char *file, unsigned int line);
71
72#define gfs2_consist_rgrpd(rgd) \
73gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
74
75
76int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
77 const char *type, const char *function,
78 char *file, unsigned int line);
79
80static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
81 struct buffer_head *bh,
82 const char *function,
83 char *file, unsigned int line)
84{
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = mh->mh_magic;
87 magic = be32_to_cpu(magic);
88 if (unlikely(magic != GFS2_MAGIC))
89 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
90 file, line);
91 return 0;
92}
93
94#define gfs2_meta_check(sdp, bh) \
95gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
96
97
98int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
99 u16 type, u16 t,
100 const char *function,
101 char *file, unsigned int line);
102
103static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
104 struct buffer_head *bh,
105 u16 type,
106 const char *function,
107 char *file, unsigned int line)
108{
109 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
110 u32 magic = mh->mh_magic;
111 u16 t = be32_to_cpu(mh->mh_type);
112 magic = be32_to_cpu(magic);
113 if (unlikely(magic != GFS2_MAGIC))
114 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
115 file, line);
116 if (unlikely(t != type))
117 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
118 file, line);
119 return 0;
120}
121
122#define gfs2_metatype_check(sdp, bh, type) \
123gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
124
125static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
126 u16 format)
127{
128 struct gfs2_meta_header *mh;
129 mh = (struct gfs2_meta_header *)bh->b_data;
130 mh->mh_type = cpu_to_be32(type);
131 mh->mh_format = cpu_to_be32(format);
132}
133
134
135int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
136 char *file, unsigned int line);
137
138#define gfs2_io_error(sdp) \
139gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
140
141
142int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
143 const char *function, char *file, unsigned int line);
144
145#define gfs2_io_error_bh(sdp, bh) \
146gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
147
148
149extern kmem_cache_t *gfs2_glock_cachep;
150extern kmem_cache_t *gfs2_inode_cachep;
151extern kmem_cache_t *gfs2_bufdata_cachep;
152
153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
154 unsigned int *p)
155{
156 unsigned int x;
157 spin_lock(&gt->gt_spin);
158 x = *p;
159 spin_unlock(&gt->gt_spin);
160 return x;
161}
162
163#define gfs2_tune_get(sdp, field) \
164gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
165
166void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
167 unsigned int bit, int new_value);
168
169#endif /* __UTIL_DOT_H__ */
170
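[Editorial sketch, not part of the merge.] One convention worth noting in these macros: gfs2_assert_warn() and gfs2_assert_withdraw() evaluate to 0 when the assertion holds and to a nonzero code once the *_i() helper has fired, so callers can hang extra diagnostics off the failure path, as gfs2_trans_end() does above. A small kernel-side sketch of that idiom (hypothetical condition and function):

/* Hypothetical consumer of gfs2_assert_withdraw(); mirrors the pattern
 * used in gfs2_trans_end() above. */
static void demo_check(struct gfs2_sbd *sdp, unsigned int used,
		       unsigned int reserved)
{
	/* 0 when the assertion holds; -1/-2 after a withdraw. */
	if (gfs2_assert_withdraw(sdp, used <= reserved))
		fs_err(sdp, "used = %u, reserved = %u\n", used, reserved);
}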
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index bcf6ee36e065..7faef8544f32 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -60,14 +60,14 @@ void hpfs_read_inode(struct inode *i)
 	if (hpfs_sb(i->i_sb)->sb_eas) {
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_uid = le16_to_cpu(*(u16*)ea);
+				i->i_uid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_uid = 1;
 			}
 			kfree(ea);
 		}
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_gid = le16_to_cpu(*(u16*)ea);
+				i->i_gid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_gid = 1;
 			}
 			kfree(ea);
@@ -87,7 +87,7 @@ void hpfs_read_inode(struct inode *i)
 			int rdev = 0;
 			umode_t mode = hpfs_sb(sb)->sb_mode;
 			if (ea_size == 2) {
-				mode = le16_to_cpu(*(u16*)ea);
+				mode = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_mode = 1;
 			}
 			kfree(ea);
@@ -95,7 +95,7 @@ void hpfs_read_inode(struct inode *i)
 			if (S_ISBLK(mode) || S_ISCHR(mode)) {
 				if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) {
 					if (ea_size == 4)
-						rdev = le32_to_cpu(*(u32*)ea);
+						rdev = le32_to_cpu(*(__le32*)ea);
 					kfree(ea);
 				}
 			}
@@ -148,7 +148,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 	   we'd better not overwrite them
 	hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
 	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
-		u32 ea;
+		__le32 ea;
 		if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
 			ea = cpu_to_le32(i->i_uid);
 			hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
@@ -165,6 +165,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 	    && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333))
 	    | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) {
 		ea = cpu_to_le32(i->i_mode);
+		/* sick, but legal */
 		hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2);
 		hpfs_inode->i_ea_mode = 1;
 	}
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index dcb6d2e988b8..642675fc394a 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -572,7 +572,7 @@ struct hppfs_dirent {
 };
 
 static int hppfs_filldir(void *d, const char *name, int size,
-			 loff_t offset, ino_t inode, unsigned int type)
+			 loff_t offset, u64 inode, unsigned int type)
 {
 	struct hppfs_dirent *dirent = d;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5e03b2f67b93..4ee3f006b861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		unmap_hugepage_range(vma,
+		__unmap_hugepage_range(vma,
 				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
diff --git a/fs/inode.c b/fs/inode.c
index bf6bec4e54ff..d9a21d122926 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 			mapping->backing_dev_info = bdi;
 		}
-		inode->i_private = 0;
+		inode->i_private = NULL;
 		inode->i_mapping = mapping;
 	}
 	return inode;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 6dc6721d9e82..89e8da112a75 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -150,11 +150,6 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
 	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
 	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
 
-	if (!ioprio_valid(aprio))
-		return bprio;
-	if (!ioprio_valid(bprio))
-		return aprio;
-
 	if (aclass == IOPRIO_CLASS_NONE)
 		aclass = IOPRIO_CLASS_BE;
 	if (bclass == IOPRIO_CLASS_NONE)
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 81a90e170ac3..fb8fe7a9ddc6 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -14,9 +14,9 @@
  * Convert Unicode 16 to UTF-8 or ASCII.
  */
 static int
-uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
+uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
 {
-	wchar_t *ip, ch;
+	__be16 *ip, ch;
 	unsigned char *op;
 
 	ip = uni;
@@ -24,8 +24,8 @@ uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
 
 	while ((ch = get_unaligned(ip)) && len) {
 		int llen;
-		ch = be16_to_cpu(ch);
-		if ((llen = nls->uni2char(ch, op, NLS_MAX_CHARSET_SIZE)) > 0)
+		llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE);
+		if (llen > 0)
 			op += llen;
 		else
 			*op++ = '?';
@@ -82,7 +82,7 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
 		len = wcsntombs_be(outname, de->name,
 				   de->name_len[0] >> 1, PAGE_SIZE);
 	} else {
-		len = uni16_to_x8(outname, (u16 *) de->name,
+		len = uni16_to_x8(outname, (__be16 *) de->name,
 				  de->name_len[0] >> 1, nls);
 	}
 	if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) {
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index e7ba0c30e071..c04b3a14a3e9 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
  * (C) 1991 Linus Torvalds - minix filesystem
  */
 
-#include <linux/config.h> /* Joliet? */
 #include <linux/smp_lock.h>
 #include "isofs.h"
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index c518dd8fe60a..b85c686b60db 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -725,6 +725,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 			__FUNCTION__);
 		kfree(journal);
 		journal = NULL;
+		goto out;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -735,7 +736,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	J_ASSERT(bh != NULL);
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
+out:
 	return journal;
 }
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e1b3c8af4d17..d5c63047a8b3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1314,13 +1314,14 @@ int journal_stop(handle_t *handle)
 	int old_handle_count, err;
 	pid_t pid;
 
-	J_ASSERT(transaction->t_updates > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else
+	else {
+		J_ASSERT(transaction->t_updates > 0);
 		err = 0;
+	}
 
 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux journaling routines.
3#
4
5obj-$(CONFIG_JBD2) += jbd2.o
6
7jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
1/*
2 * linux/fs/checkpoint.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system.
14 *
15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be
17 * reused.
18 */
19
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25
26/*
27 * Unlink a buffer from a transaction checkpoint list.
28 *
29 * Called with j_list_lock held.
30 */
31static inline void __buffer_unlink_first(struct journal_head *jh)
32{
33 transaction_t *transaction = jh->b_cp_transaction;
34
35 jh->b_cpnext->b_cpprev = jh->b_cpprev;
36 jh->b_cpprev->b_cpnext = jh->b_cpnext;
37 if (transaction->t_checkpoint_list == jh) {
38 transaction->t_checkpoint_list = jh->b_cpnext;
39 if (transaction->t_checkpoint_list == jh)
40 transaction->t_checkpoint_list = NULL;
41 }
42}
43
44/*
45 * Unlink a buffer from a transaction checkpoint(io) list.
46 *
47 * Called with j_list_lock held.
48 */
49static inline void __buffer_unlink(struct journal_head *jh)
50{
51 transaction_t *transaction = jh->b_cp_transaction;
52
53 __buffer_unlink_first(jh);
54 if (transaction->t_checkpoint_io_list == jh) {
55 transaction->t_checkpoint_io_list = jh->b_cpnext;
56 if (transaction->t_checkpoint_io_list == jh)
57 transaction->t_checkpoint_io_list = NULL;
58 }
59}
60
61/*
62 * Move a buffer from the checkpoint list to the checkpoint io list
63 *
64 * Called with j_list_lock held
65 */
66static inline void __buffer_relink_io(struct journal_head *jh)
67{
68 transaction_t *transaction = jh->b_cp_transaction;
69
70 __buffer_unlink_first(jh);
71
72 if (!transaction->t_checkpoint_io_list) {
73 jh->b_cpnext = jh->b_cpprev = jh;
74 } else {
75 jh->b_cpnext = transaction->t_checkpoint_io_list;
76 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
77 jh->b_cpprev->b_cpnext = jh;
78 jh->b_cpnext->b_cpprev = jh;
79 }
80 transaction->t_checkpoint_io_list = jh;
81}
82
83/*
84 * Try to release a checkpointed buffer from its transaction.
85 * Returns 1 if we released it and 2 if we also released the
86 * whole transaction.
87 *
88 * Requires j_list_lock
89 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
90 */
91static int __try_to_free_cp_buf(struct journal_head *jh)
92{
93 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh);
95
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh);
100 jbd2_journal_remove_journal_head(bh);
101 BUFFER_TRACE(bh, "release");
102 __brelse(bh);
103 } else {
104 jbd_unlock_bh_state(bh);
105 }
106 return ret;
107}
108
109/*
110 * __jbd2_log_wait_for_space: wait until there is space in the journal.
111 *
112 * Called under j_state_lock *only*. It will be unlocked if we have to wait
113 * for a checkpoint to free up some space in the log.
114 */
115void __jbd2_log_wait_for_space(journal_t *journal)
116{
117 int nblocks;
118 assert_spin_locked(&journal->j_state_lock);
119
120 nblocks = jbd_space_needed(journal);
121 while (__jbd2_log_space_left(journal) < nblocks) {
122 if (journal->j_flags & JBD2_ABORT)
123 return;
124 spin_unlock(&journal->j_state_lock);
125 mutex_lock(&journal->j_checkpoint_mutex);
126
127 /*
128 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock
130 */
131 spin_lock(&journal->j_state_lock);
132 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) {
134 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal);
136 spin_lock(&journal->j_state_lock);
137 }
138 mutex_unlock(&journal->j_checkpoint_mutex);
139 }
140}
141
142/*
143 * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
144 * The caller must restart a list walk. Wait for someone else to run
145 * jbd_unlock_bh_state().
146 */
147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock)
149{
150 get_bh(bh);
151 spin_unlock(&journal->j_list_lock);
152 jbd_lock_bh_state(bh);
153 jbd_unlock_bh_state(bh);
154 put_bh(bh);
155}
156
157/*
158 * Clean up transaction's list of buffers submitted for io.
159 * We wait for any pending IO to complete and remove any clean
160 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO.
162 *
163 * Called with j_list_lock held.
164 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{
167 struct journal_head *jh;
168 struct buffer_head *bh;
169 tid_t this_tid;
170 int released = 0;
171
172 this_tid = transaction->t_tid;
173restart:
174 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid)
177 return;
178 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh);
181 if (!jbd_trylock_bh_state(bh)) {
182 jbd_sync_bh(journal, bh);
183 spin_lock(&journal->j_list_lock);
184 goto restart;
185 }
186 if (buffer_locked(bh)) {
187 atomic_inc(&bh->b_count);
188 spin_unlock(&journal->j_list_lock);
189 jbd_unlock_bh_state(bh);
190 wait_on_buffer(bh);
191 /* the journal_head may have gone by now */
192 BUFFER_TRACE(bh, "brelse");
193 __brelse(bh);
194 spin_lock(&journal->j_list_lock);
195 goto restart;
196 }
197 /*
198 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list
200 */
201 released = __jbd2_journal_remove_checkpoint(jh);
202 jbd_unlock_bh_state(bh);
203 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh);
205 }
206}
207
208#define NR_BATCH 64
209
210static void
211__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
212{
213 int i;
214
215 ll_rw_block(SWRITE, *batch_count, bhs);
216 for (i = 0; i < *batch_count; i++) {
217 struct buffer_head *bh = bhs[i];
218 clear_buffer_jwrite(bh);
219 BUFFER_TRACE(bh, "brelse");
220 __brelse(bh);
221 }
222 *batch_count = 0;
223}
224
225/*
226 * Try to flush one buffer from the checkpoint list to disk.
227 *
228 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list.
230 *
231 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
233 */
234static int __process_buffer(journal_t *journal, struct journal_head *jh,
235 struct buffer_head **bhs, int *batch_count)
236{
237 struct buffer_head *bh = jh2bh(jh);
238 int ret = 0;
239
240 if (buffer_locked(bh)) {
241 atomic_inc(&bh->b_count);
242 spin_unlock(&journal->j_list_lock);
243 jbd_unlock_bh_state(bh);
244 wait_on_buffer(bh);
245 /* the journal_head may have gone by now */
246 BUFFER_TRACE(bh, "brelse");
247 __brelse(bh);
248 ret = 1;
249 } else if (jh->b_transaction != NULL) {
250 transaction_t *t = jh->b_transaction;
251 tid_t tid = t->t_tid;
252
253 spin_unlock(&journal->j_list_lock);
254 jbd_unlock_bh_state(bh);
255 jbd2_log_start_commit(journal, tid);
256 jbd2_log_wait_commit(journal, tid);
257 ret = 1;
258 } else if (!buffer_dirty(bh)) {
259 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
260 BUFFER_TRACE(bh, "remove from checkpoint");
261 __jbd2_journal_remove_checkpoint(jh);
262 spin_unlock(&journal->j_list_lock);
263 jbd_unlock_bh_state(bh);
264 jbd2_journal_remove_journal_head(bh);
265 __brelse(bh);
266 ret = 1;
267 } else {
268 /*
269 * Important: we are about to write the buffer, and
270 * possibly block, while still holding the journal lock.
271 * We cannot afford to let the transaction logic start
272 * messing around with this buffer before we write it to
273 * disk, as that would break recoverability.
274 */
275 BUFFER_TRACE(bh, "queue");
276 get_bh(bh);
277 J_ASSERT_BH(bh, !buffer_jwrite(bh));
278 set_buffer_jwrite(bh);
279 bhs[*batch_count] = bh;
280 __buffer_relink_io(jh);
281 jbd_unlock_bh_state(bh);
282 (*batch_count)++;
283 if (*batch_count == NR_BATCH) {
284 spin_unlock(&journal->j_list_lock);
285 __flush_batch(journal, bhs, batch_count);
286 ret = 1;
287 }
288 }
289 return ret;
290}
291
292/*
293 * Perform an actual checkpoint. We take the first transaction on the
294 * list of transactions to be checkpointed and send all its buffers
295 * to disk. We submit larger chunks of data at once.
296 *
297 * The journal should be locked before calling this function.
298 */
299int jbd2_log_do_checkpoint(journal_t *journal)
300{
301 transaction_t *transaction;
302 tid_t this_tid;
303 int result;
304
305 jbd_debug(1, "Start checkpoint\n");
306
307 /*
308 * First thing: if there are any transactions in the log which
309 * don't need checkpointing, just eliminate them from the
310 * journal straight away.
311 */
312 result = jbd2_cleanup_journal_tail(journal);
313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
314 if (result <= 0)
315 return result;
316
317 /*
318 * OK, we need to start writing disk blocks. Take one transaction
319 * and write it.
320 */
321 spin_lock(&journal->j_list_lock);
322 if (!journal->j_checkpoint_transactions)
323 goto out;
324 transaction = journal->j_checkpoint_transactions;
325 this_tid = transaction->t_tid;
326restart:
327 /*
328 * If someone cleaned up this transaction while we slept, we're
329 * done (maybe it's a new transaction, but it fell at the same
330 * address).
331 */
332 if (journal->j_checkpoint_transactions == transaction &&
333 transaction->t_tid == this_tid) {
334 int batch_count = 0;
335 struct buffer_head *bhs[NR_BATCH];
336 struct journal_head *jh;
337 int retry = 0;
338
339 while (!retry && transaction->t_checkpoint_list) {
340 struct buffer_head *bh;
341
342 jh = transaction->t_checkpoint_list;
343 bh = jh2bh(jh);
344 if (!jbd_trylock_bh_state(bh)) {
345 jbd_sync_bh(journal, bh);
346 retry = 1;
347 break;
348 }
349 retry = __process_buffer(journal, jh, bhs,&batch_count);
350 if (!retry && lock_need_resched(&journal->j_list_lock)){
351 spin_unlock(&journal->j_list_lock);
352 retry = 1;
353 break;
354 }
355 }
356
357 if (batch_count) {
358 if (!retry) {
359 spin_unlock(&journal->j_list_lock);
360 retry = 1;
361 }
362 __flush_batch(journal, bhs, &batch_count);
363 }
364
365 if (retry) {
366 spin_lock(&journal->j_list_lock);
367 goto restart;
368 }
369 /*
370 * Now we have cleaned up the first transaction's checkpoint
371 * list. Let's clean up the second one
372 */
373 __wait_cp_io(journal, transaction);
374 }
375out:
376 spin_unlock(&journal->j_list_lock);
377 result = jbd2_cleanup_journal_tail(journal);
378 if (result < 0)
379 return result;
380 return 0;
381}
382
383/*
384 * Check the list of checkpoint transactions for the journal to see if
385 * we have already got rid of any since the last update of the log tail
386 * in the journal superblock. If so, we can instantly roll the
387 * superblock forward to remove those transactions from the log.
388 *
389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
390 *
391 * Called with the journal lock held.
392 *
393 * This is the only part of the journaling code which really needs to be
394 * aware of transaction aborts. Checkpointing involves writing to the
395 * main filesystem area rather than to the journal, so it can proceed
396 * even in abort state, but we must not update the journal superblock if
397 * we have an abort error outstanding.
398 */
399
400int jbd2_cleanup_journal_tail(journal_t *journal)
401{
402 transaction_t * transaction;
403 tid_t first_tid;
404 unsigned long blocknr, freed;
405
406 /* OK, work out the oldest transaction remaining in the log, and
407 * the log block it starts at.
408 *
409 * If the log is now empty, we need to work out which is the
410 * next transaction ID we will write, and where it will
411 * start. */
412
413 spin_lock(&journal->j_state_lock);
414 spin_lock(&journal->j_list_lock);
415 transaction = journal->j_checkpoint_transactions;
416 if (transaction) {
417 first_tid = transaction->t_tid;
418 blocknr = transaction->t_log_start;
419 } else if ((transaction = journal->j_committing_transaction) != NULL) {
420 first_tid = transaction->t_tid;
421 blocknr = transaction->t_log_start;
422 } else if ((transaction = journal->j_running_transaction) != NULL) {
423 first_tid = transaction->t_tid;
424 blocknr = journal->j_head;
425 } else {
426 first_tid = journal->j_transaction_sequence;
427 blocknr = journal->j_head;
428 }
429 spin_unlock(&journal->j_list_lock);
430 J_ASSERT(blocknr != 0);
431
432 /* If the oldest pinned transaction is at the tail of the log
433 already then there's not much we can do right now. */
434 if (journal->j_tail_sequence == first_tid) {
435 spin_unlock(&journal->j_state_lock);
436 return 1;
437 }
438
439 /* OK, update the superblock to recover the freed space.
440 * Physical blocks come first: have we wrapped beyond the end of
441 * the log? */
442 freed = blocknr - journal->j_tail;
443 if (blocknr < journal->j_tail)
444 freed = freed + journal->j_last - journal->j_first;
445
446 jbd_debug(1,
447 "Cleaning journal tail from %d to %d (offset %lu), "
448 "freeing %lu\n",
449 journal->j_tail_sequence, first_tid, blocknr, freed);
450
451 journal->j_free += freed;
452 journal->j_tail_sequence = first_tid;
453 journal->j_tail = blocknr;
454 spin_unlock(&journal->j_state_lock);
455 if (!(journal->j_flags & JBD2_ABORT))
456 jbd2_journal_update_superblock(journal, 1);
457 return 0;
458}
459
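/*
 * Worked example (editorial, not in the original file) of the wrap-around
 * arithmetic above: with j_first = 1, j_last = 1024, j_tail = 1000 and a
 * new tail at blocknr = 8, "freed = blocknr - j_tail" underflows, and
 * adding j_last - j_first = 1023 corrects it to 31 blocks: 24 from block
 * 1000 up to j_last, plus 7 from j_first up to block 8.
 */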
460
461/* Checkpoint list management */
462
463/*
464 * journal_clean_one_cp_list
465 *
466 * Find all the written-back checkpoint buffers in the given list and release them.
467 *
468 * Called with the journal locked.
469 * Called with j_list_lock held.
470 * Returns the number of buffers reaped (for debug)
471 */
472
473static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
474{
475 struct journal_head *last_jh;
476 struct journal_head *next_jh = jh;
477 int ret, freed = 0;
478
479 *released = 0;
480 if (!jh)
481 return 0;
482
483 last_jh = jh->b_cpprev;
484 do {
485 jh = next_jh;
486 next_jh = jh->b_cpnext;
487 /* Use trylock because of the ranking */
488 if (jbd_trylock_bh_state(jh2bh(jh))) {
489 ret = __try_to_free_cp_buf(jh);
490 if (ret) {
491 freed++;
492 if (ret == 2) {
493 *released = 1;
494 return freed;
495 }
496 }
497 }
498 /*
499 * This function only frees up some memory
500 * if possible so we don't have an obligation
501 * to finish processing. Bail out if preemption
502 * requested:
503 */
504 if (need_resched())
505 return freed;
506 } while (jh != last_jh);
507
508 return freed;
509}
510
511/*
512 * journal_clean_checkpoint_list
513 *
514 * Find all the written-back checkpoint buffers in the journal and release them.
515 *
516 * Called with the journal locked.
517 * Called with j_list_lock held.
518 * Returns number of buffers reaped (for debug)
519 */
520
521int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
522{
523 transaction_t *transaction, *last_transaction, *next_transaction;
524 int ret = 0;
525 int released;
526
527 transaction = journal->j_checkpoint_transactions;
528 if (!transaction)
529 goto out;
530
531 last_transaction = transaction->t_cpprev;
532 next_transaction = transaction;
533 do {
534 transaction = next_transaction;
535 next_transaction = transaction->t_cpnext;
536 ret += journal_clean_one_cp_list(transaction->
537 t_checkpoint_list, &released);
538 /*
539 * This function only frees up some memory if possible, so we
540 * don't have an obligation to finish processing. Bail out if
541 * preemption is requested:
542 */
543 if (need_resched())
544 goto out;
545 if (released)
546 continue;
547 /*
548 * It is essential that we are as careful removing the buffer
549 * from this list as in the t_checkpoint_list case, since we may
550 * see buffers on io_list that have not yet been submitted
551 */
552 ret += journal_clean_one_cp_list(transaction->
553 t_checkpoint_io_list, &released);
554 if (need_resched())
555 goto out;
556 } while (transaction != last_transaction);
557out:
558 return ret;
559}
560
561/*
562 * journal_remove_checkpoint: called after a buffer has been committed
563 * to disk (either by being write-back flushed to disk, or being
564 * committed to the log).
565 *
566 * We cannot safely clean a transaction out of the log until all of the
567 * buffer updates committed in that transaction have safely been stored
568 * elsewhere on disk. To achieve this, all of the buffers in a
569 * transaction need to be maintained on the transaction's checkpoint
570 * lists until they have been rewritten, at which point this function is
571 * called to remove the buffer from the existing transaction's
572 * checkpoint lists.
573 *
574 * The function returns 1 if it frees the transaction, 0 otherwise.
575 *
576 * This function is called with the journal locked.
577 * This function is called with j_list_lock held.
578 * This function is called with jbd_lock_bh_state(jh2bh(jh)) held.
579 */
580
581int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
582{
583 transaction_t *transaction;
584 journal_t *journal;
585 int ret = 0;
586
587 JBUFFER_TRACE(jh, "entry");
588
589 if ((transaction = jh->b_cp_transaction) == NULL) {
590 JBUFFER_TRACE(jh, "not on transaction");
591 goto out;
592 }
593 journal = transaction->t_journal;
594
595 __buffer_unlink(jh);
596 jh->b_cp_transaction = NULL;
597
598 if (transaction->t_checkpoint_list != NULL ||
599 transaction->t_checkpoint_io_list != NULL)
600 goto out;
601 JBUFFER_TRACE(jh, "transaction has no more buffers");
602
603 /*
604 * There is one special case to worry about: if we have just pulled the
605 * buffer off a committing transaction's forget list, then even if the
606 * checkpoint list is empty, the transaction obviously cannot be
607 * dropped!
608 *
609 * The locking here around j_committing_transaction is a bit sleazy.
610 * See the comment at the end of jbd2_journal_commit_transaction().
611 */
612 if (transaction == journal->j_committing_transaction) {
613 JBUFFER_TRACE(jh, "belongs to committing transaction");
614 goto out;
615 }
616
617 /* OK, that was the last buffer for the transaction: we can now
618 safely remove this transaction from the log */
619
620 __jbd2_journal_drop_transaction(journal, transaction);
621
622 /* Just in case anybody was waiting for more transactions to be
623 checkpointed... */
624 wake_up(&journal->j_wait_logspace);
625 ret = 1;
626out:
627 JBUFFER_TRACE(jh, "exit");
628 return ret;
629}
630
631/*
632 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
633 * list so that we know when it is safe to clean the transaction out of
634 * the log.
635 *
636 * Called with the journal locked.
637 * Called with j_list_lock held.
638 */
639void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
640 transaction_t *transaction)
641{
642 JBUFFER_TRACE(jh, "entry");
643 J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
644 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
645
646 jh->b_cp_transaction = transaction;
647
648 if (!transaction->t_checkpoint_list) {
649 jh->b_cpnext = jh->b_cpprev = jh;
650 } else {
651 jh->b_cpnext = transaction->t_checkpoint_list;
652 jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
653 jh->b_cpprev->b_cpnext = jh;
654 jh->b_cpnext->b_cpprev = jh;
655 }
656 transaction->t_checkpoint_list = jh;
657}
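
The insertion above links the new journal_head into a circular doubly-linked
list in front of the current head, then makes it the head. A self-contained
userspace sketch of the same pattern follows; the node type, field names and
the main() test values are invented for illustration.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next, *prev;
};

static void insert_head(struct node **head, struct node *n)
{
        if (!*head) {
                n->next = n->prev = n;          /* singleton circle */
        } else {
                n->next = *head;
                n->prev = (*head)->prev;        /* old tail of the circle */
                n->prev->next = n;
                n->next->prev = n;
        }
        *head = n;                              /* new node becomes the head */
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };
        struct node *head = NULL;

        insert_head(&head, &a);
        insert_head(&head, &b);
        assert(head == &b && head->next == &a && head->prev == &a);
        assert(a.next == &b && a.prev == &b);   /* two-element circle */
        printf("circular insert ok\n");
        return 0;
}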
658
659/*
660 * We've finished with this transaction structure: adios...
661 *
662 * The transaction must have no links except for the checkpoint by this
663 * point.
664 *
665 * Called with the journal locked.
666 * Called with j_list_lock held.
667 */
668
669void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
670{
671 assert_spin_locked(&journal->j_list_lock);
672 if (transaction->t_cpnext) {
673 transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
674 transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
675 if (journal->j_checkpoint_transactions == transaction)
676 journal->j_checkpoint_transactions =
677 transaction->t_cpnext;
678 if (journal->j_checkpoint_transactions == transaction)
679 journal->j_checkpoint_transactions = NULL;
680 }
681
682 J_ASSERT(transaction->t_state == T_FINISHED);
683 J_ASSERT(transaction->t_buffers == NULL);
684 J_ASSERT(transaction->t_sync_datalist == NULL);
685 J_ASSERT(transaction->t_forget == NULL);
686 J_ASSERT(transaction->t_iobuf_list == NULL);
687 J_ASSERT(transaction->t_shadow_list == NULL);
688 J_ASSERT(transaction->t_log_list == NULL);
689 J_ASSERT(transaction->t_checkpoint_list == NULL);
690 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
691 J_ASSERT(transaction->t_updates == 0);
692 J_ASSERT(journal->j_committing_transaction != transaction);
693 J_ASSERT(journal->j_running_transaction != transaction);
694
695 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
696 kfree(transaction);
697}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
1/*
2 * linux/fs/jbd2/commit.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd2.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
25/*
26 * Default IO end handler for temporary BJ_IO buffer_heads.
27 */
28static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
29{
30 BUFFER_TRACE(bh, "");
31 if (uptodate)
32 set_buffer_uptodate(bh);
33 else
34 clear_buffer_uptodate(bh);
35 unlock_buffer(bh);
36}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not successfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
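
inverted_lock() acquires the bh_state lock against the documented lock ranking,
so it must trylock and, on failure, drop j_list_lock and let the other side
run. A hedged pthread sketch of the same back-off pattern; the mutexes and
names here are invented userspace stand-ins, not the kernel locking API.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t list_lock  = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 with both locks held; returns 0 after dropping list_lock. */
static int inverted_lock_sketch(void)
{
        if (pthread_mutex_trylock(&state_lock) != 0) {
                pthread_mutex_unlock(&list_lock);  /* back off, no deadlock */
                sched_yield();                     /* let the holder finish */
                return 0;
        }
        return 1;
}

int main(void)
{
        pthread_mutex_lock(&list_lock);
        if (inverted_lock_sketch()) {
                printf("acquired out-of-order lock safely\n");
                pthread_mutex_unlock(&state_lock);
                pthread_mutex_unlock(&list_lock);
        }
        return 0;
}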
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
114 descriptor = jbd2_journal_get_descriptor_buffer(journal);
115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* Write a commit header at each 512-byte boundary of the block */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
130 if (journal->j_flags & JBD2_BARRIER) {
131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
148 journal->j_flags &= ~JBD2_BARRIER;
149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 jbd2_journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Clean up any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __jbd2_journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
254 __jbd2_journal_unfile_buffer(jh);
255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
258 jbd2_journal_remove_journal_head(bh);
259 /* Once for our safety reference, once for
260 * jbd2_journal_remove_journal_head() */
261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
274static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
275 unsigned long long block)
276{
277 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
278 if (tag_bytes > JBD_TAG_SIZE32)
279 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
280}
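
write_tag_block() above stores a 64-bit journal block number as two 32-bit
on-disk words; the `(block >> 31) >> 1` idiom keeps the shift well-defined
even where the operand type is only 32 bits wide. A standalone sketch of the
split and reassembly, with byte-order conversion omitted and a hypothetical
block number:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned long long block = 0x123456789AULL;     /* hypothetical */
        uint32_t lo = (uint32_t)(block & 0xFFFFFFFFu);
        uint32_t hi = (uint32_t)((block >> 31) >> 1);   /* == block >> 32 */

        /* the two halves round-trip back to the original number */
        assert((((unsigned long long)hi << 32) | lo) == block);
        printf("lo=0x%08x hi=0x%08x\n", (unsigned)lo, (unsigned)hi);
        return 0;
}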
281
282/*
283 * jbd2_journal_commit_transaction
284 *
285 * The primary function for committing a transaction to the log. This
286 * function is called by the journal thread to begin a complete commit.
287 */
288void jbd2_journal_commit_transaction(journal_t *journal)
289{
290 transaction_t *commit_transaction;
291 struct journal_head *jh, *new_jh, *descriptor;
292 struct buffer_head **wbuf = journal->j_wbuf;
293 int bufs;
294 int flags;
295 int err;
296 unsigned long long blocknr;
297 char *tagp = NULL;
298 journal_header_t *header;
299 journal_block_tag_t *tag = NULL;
300 int space_left = 0;
301 int first_tag = 0;
302 int tag_flag;
303 int i;
304 int tag_bytes = journal_tag_bytes(journal);
305
306 /*
307 * First job: lock down the current transaction and wait for
308 * all outstanding updates to complete.
309 */
310
311#ifdef COMMIT_STATS
312 spin_lock(&journal->j_list_lock);
313 summarise_journal_usage(journal);
314 spin_unlock(&journal->j_list_lock);
315#endif
316
317 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
318 if (journal->j_flags & JBD2_FLUSHED) {
319 jbd_debug(3, "super block updated\n");
320 jbd2_journal_update_superblock(journal, 1);
321 } else {
322 jbd_debug(3, "superblock not updated\n");
323 }
324
325 J_ASSERT(journal->j_running_transaction != NULL);
326 J_ASSERT(journal->j_committing_transaction == NULL);
327
328 commit_transaction = journal->j_running_transaction;
329 J_ASSERT(commit_transaction->t_state == T_RUNNING);
330
331 jbd_debug(1, "JBD: starting commit of transaction %d\n",
332 commit_transaction->t_tid);
333
334 spin_lock(&journal->j_state_lock);
335 commit_transaction->t_state = T_LOCKED;
336
337 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait);
340
341 prepare_to_wait(&journal->j_wait_updates, &wait,
342 TASK_UNINTERRUPTIBLE);
343 if (commit_transaction->t_updates) {
344 spin_unlock(&commit_transaction->t_handle_lock);
345 spin_unlock(&journal->j_state_lock);
346 schedule();
347 spin_lock(&journal->j_state_lock);
348 spin_lock(&commit_transaction->t_handle_lock);
349 }
350 finish_wait(&journal->j_wait_updates, &wait);
351 }
352 spin_unlock(&commit_transaction->t_handle_lock);
353
354 J_ASSERT (commit_transaction->t_outstanding_credits <=
355 journal->j_max_transaction_buffers);
356
357 /*
358 * First thing we are allowed to do is to discard any remaining
359 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
360 * that there are no such buffers: if a large filesystem
361 * operation like a truncate needs to split itself over multiple
362 * transactions, then it may try to do a jbd2_journal_restart() while
363 * there are still BJ_Reserved buffers outstanding. These must
364 * be released cleanly from the current transaction.
365 *
366 * In this case, the filesystem must still reserve write access
367 * again before modifying the buffer in the new transaction, but
368 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour
370 * that multiple jbd2_journal_get_write_access() calls to the same
371 * buffer are perfectly permissible.
372 */
373 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list;
375 JBUFFER_TRACE(jh, "reserved, unused: refile");
376 /*
377 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
378 * leave undo-committed data.
379 */
380 if (jh->b_committed_data) {
381 struct buffer_head *bh = jh2bh(jh);
382
383 jbd_lock_bh_state(bh);
384 jbd2_slab_free(jh->b_committed_data, bh->b_size);
385 jh->b_committed_data = NULL;
386 jbd_unlock_bh_state(bh);
387 }
388 jbd2_journal_refile_buffer(journal, jh);
389 }
390
391 /*
392 * Now try to drop any written-back buffers from the journal's
393 * checkpoint lists. We do this *before* commit because it potentially
394 * frees some memory
395 */
396 spin_lock(&journal->j_list_lock);
397 __jbd2_journal_clean_checkpoint_list(journal);
398 spin_unlock(&journal->j_list_lock);
399
400 jbd_debug (3, "JBD: commit phase 1\n");
401
402 /*
403 * Switch to a new revoke table.
404 */
405 jbd2_journal_switch_revoke_table(journal);
406
407 commit_transaction->t_state = T_FLUSH;
408 journal->j_committing_transaction = commit_transaction;
409 journal->j_running_transaction = NULL;
410 commit_transaction->t_log_start = journal->j_head;
411 wake_up(&journal->j_wait_transaction_locked);
412 spin_unlock(&journal->j_state_lock);
413
414 jbd_debug (3, "JBD: commit phase 2\n");
415
416 /*
417 * First, drop the modified flag: all accesses to the buffers
418 * will be tracked for a new transaction only -bzzz
419 */
420 spin_lock(&journal->j_list_lock);
421 if (commit_transaction->t_buffers) {
422 new_jh = jh = commit_transaction->t_buffers->b_tnext;
423 do {
424 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
425 new_jh->b_modified == 0);
426 new_jh->b_modified = 0;
427 new_jh = new_jh->b_tnext;
428 } while (new_jh != jh);
429 }
430 spin_unlock(&journal->j_list_lock);
431
432 /*
433 * Now start flushing things to disk, in the order they appear
434 * on the transaction lists. Data blocks go first.
435 */
436 err = 0;
437 journal_submit_data_buffers(journal, commit_transaction);
438
439 /*
440 * Wait for all previously submitted IO to complete.
441 */
442 spin_lock(&journal->j_list_lock);
443 while (commit_transaction->t_locked_list) {
444 struct buffer_head *bh;
445
446 jh = commit_transaction->t_locked_list->b_tprev;
447 bh = jh2bh(jh);
448 get_bh(bh);
449 if (buffer_locked(bh)) {
450 spin_unlock(&journal->j_list_lock);
451 wait_on_buffer(bh);
452 if (unlikely(!buffer_uptodate(bh)))
453 err = -EIO;
454 spin_lock(&journal->j_list_lock);
455 }
456 if (!inverted_lock(journal, bh)) {
457 put_bh(bh);
458 spin_lock(&journal->j_list_lock);
459 continue;
460 }
461 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
462 __jbd2_journal_unfile_buffer(jh);
463 jbd_unlock_bh_state(bh);
464 jbd2_journal_remove_journal_head(bh);
465 put_bh(bh);
466 } else {
467 jbd_unlock_bh_state(bh);
468 }
469 put_bh(bh);
470 cond_resched_lock(&journal->j_list_lock);
471 }
472 spin_unlock(&journal->j_list_lock);
473
474 if (err)
475 __jbd2_journal_abort_hard(journal);
476
477 jbd2_journal_write_revoke_records(journal, commit_transaction);
478
479 jbd_debug(3, "JBD: commit phase 2\n");
480
481 /*
482 * If we found any dirty or locked buffers, then we should have
483 * looped back up to the write_out_data label. If there weren't
484 * any then journal_clean_data_list should have wiped the list
485 * clean by now, so check that it is in fact empty.
486 */
487 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
488
489 jbd_debug (3, "JBD: commit phase 3\n");
490
491 /*
492 * Way to go: we have now written out all of the data for a
493 * transaction! Now comes the tricky part: we need to write out
494 * metadata. Loop over the transaction's entire buffer list:
495 */
496 commit_transaction->t_state = T_COMMIT;
497
498 descriptor = NULL;
499 bufs = 0;
500 while (commit_transaction->t_buffers) {
501
502 /* Find the next buffer to be journaled... */
503
504 jh = commit_transaction->t_buffers;
505
506 /* If we're in abort mode, we just un-journal the buffer and
507 release it for background writing. */
508
509 if (is_journal_aborted(journal)) {
510 JBUFFER_TRACE(jh, "journal is aborting: refile");
511 jbd2_journal_refile_buffer(journal, jh);
512 /* If that was the last one, we need to clean up
513 * any descriptor buffers which may have been
514 * already allocated, even if we are now
515 * aborting. */
516 if (!commit_transaction->t_buffers)
517 goto start_journal_io;
518 continue;
519 }
520
521 /* Make sure we have a descriptor block in which to
522 record the metadata buffer. */
523
524 if (!descriptor) {
525 struct buffer_head *bh;
526
527 J_ASSERT (bufs == 0);
528
529 jbd_debug(4, "JBD: get descriptor\n");
530
531 descriptor = jbd2_journal_get_descriptor_buffer(journal);
532 if (!descriptor) {
533 __jbd2_journal_abort_hard(journal);
534 continue;
535 }
536
537 bh = jh2bh(descriptor);
538 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
539 (unsigned long long)bh->b_blocknr, bh->b_data);
540 header = (journal_header_t *)&bh->b_data[0];
541 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
542 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
543 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
544
545 tagp = &bh->b_data[sizeof(journal_header_t)];
546 space_left = bh->b_size - sizeof(journal_header_t);
547 first_tag = 1;
548 set_buffer_jwrite(bh);
549 set_buffer_dirty(bh);
550 wbuf[bufs++] = bh;
551
552 /* Record it so that we can wait for IO
553 completion later */
554 BUFFER_TRACE(bh, "ph3: file as descriptor");
555 jbd2_journal_file_buffer(descriptor, commit_transaction,
556 BJ_LogCtl);
557 }
558
559 /* Where is the buffer to be written? */
560
561 err = jbd2_journal_next_log_block(journal, &blocknr);
562 /* If the block mapping failed, just abandon the buffer
563 and repeat this loop: we'll fall into the
564 refile-on-abort condition above. */
565 if (err) {
566 __jbd2_journal_abort_hard(journal);
567 continue;
568 }
569
570 /*
571 * start_this_handle() uses t_outstanding_credits to determine
572 * the free space in the log, but this counter is also
573 * changed by jbd2_journal_next_log_block().
574 */
575 commit_transaction->t_outstanding_credits--;
576
577 /* Bump b_count to prevent truncate from stumbling over
578 the shadowed buffer! @@@ This can go if we ever get
579 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
580 atomic_inc(&jh2bh(jh)->b_count);
581
582 /* Make a temporary IO buffer with which to write it out
583 (this will requeue both the metadata buffer and the
584 temporary IO buffer). new_bh goes on BJ_IO */
585
586 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
587 /*
588 * akpm: jbd2_journal_write_metadata_buffer() sets
589 * new_bh->b_transaction to commit_transaction.
590 * We need to clean this up before we release new_bh
591 * (which is of type BJ_IO)
592 */
593 JBUFFER_TRACE(jh, "ph3: write metadata");
594 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
595 jh, &new_jh, blocknr);
596 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
597 wbuf[bufs++] = jh2bh(new_jh);
598
599 /* Record the new block's tag in the current descriptor
600 buffer */
601
602 tag_flag = 0;
603 if (flags & 1)
604 tag_flag |= JBD2_FLAG_ESCAPE;
605 if (!first_tag)
606 tag_flag |= JBD2_FLAG_SAME_UUID;
607
608 tag = (journal_block_tag_t *) tagp;
609 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
610 tag->t_flags = cpu_to_be32(tag_flag);
611 tagp += tag_bytes;
612 space_left -= tag_bytes;
613
614 if (first_tag) {
615 memcpy (tagp, journal->j_uuid, 16);
616 tagp += 16;
617 space_left -= 16;
618 first_tag = 0;
619 }
620
621 /* If there's no more to do, or if the descriptor is full,
622 let the IO rip! */
623
624 if (bufs == journal->j_wbufsize ||
625 commit_transaction->t_buffers == NULL ||
626 space_left < tag_bytes + 16) {
627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
629
630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to
632 the last tag we set up. */
633
634 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
635
636start_journal_io:
637 for (i = 0; i < bufs; i++) {
638 struct buffer_head *bh = wbuf[i];
639 lock_buffer(bh);
640 clear_buffer_dirty(bh);
641 set_buffer_uptodate(bh);
642 bh->b_end_io = journal_end_buffer_io_sync;
643 submit_bh(WRITE, bh);
644 }
645 cond_resched();
646
647 /* Force a new descriptor to be generated next
648 time round the loop. */
649 descriptor = NULL;
650 bufs = 0;
651 }
652 }
653
654 /* Lo and behold: we have just managed to send a transaction to
655 the log. Before we can commit it, wait for the IO so far to
656 complete. Control buffers being written are on the
657 transaction's t_log_list queue, and metadata buffers are on
658 the t_iobuf_list queue.
659
660 Wait for the buffers in reverse order. That way we are
661 less likely to be woken up until all IOs have completed, and
662 so we incur less scheduling load.
663 */
664
665 jbd_debug(3, "JBD: commit phase 4\n");
666
667 /*
668 * akpm: these are BJ_IO, and j_list_lock is not needed.
669 * See __journal_try_to_free_buffer.
670 */
671wait_for_iobuf:
672 while (commit_transaction->t_iobuf_list != NULL) {
673 struct buffer_head *bh;
674
675 jh = commit_transaction->t_iobuf_list->b_tprev;
676 bh = jh2bh(jh);
677 if (buffer_locked(bh)) {
678 wait_on_buffer(bh);
679 goto wait_for_iobuf;
680 }
681 if (cond_resched())
682 goto wait_for_iobuf;
683
684 if (unlikely(!buffer_uptodate(bh)))
685 err = -EIO;
686
687 clear_buffer_jwrite(bh);
688
689 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
690 jbd2_journal_unfile_buffer(journal, jh);
691
692 /*
693 * ->t_iobuf_list should contain only dummy buffer_heads
694 * which were created by jbd2_journal_write_metadata_buffer().
695 */
696 BUFFER_TRACE(bh, "dumping temporary bh");
697 jbd2_journal_put_journal_head(jh);
698 __brelse(bh);
699 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
700 free_buffer_head(bh);
701
702 /* We also have to unlock and free the corresponding
703 shadowed buffer */
704 jh = commit_transaction->t_shadow_list->b_tprev;
705 bh = jh2bh(jh);
706 clear_bit(BH_JWrite, &bh->b_state);
707 J_ASSERT_BH(bh, buffer_jbddirty(bh));
708
709 /* The metadata is now released for reuse, but we need
710 to remember it against this transaction so that when
711 we finally commit, we can do any checkpointing
712 required. */
713 JBUFFER_TRACE(jh, "file as BJ_Forget");
714 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
715 /* Wake up any transactions which were waiting for this
716 IO to complete */
717 wake_up_bit(&bh->b_state, BH_Unshadow);
718 JBUFFER_TRACE(jh, "brelse shadowed buffer");
719 __brelse(bh);
720 }
721
722 J_ASSERT (commit_transaction->t_shadow_list == NULL);
723
724 jbd_debug(3, "JBD: commit phase 5\n");
725
726 /* Here we wait for the revoke record and descriptor record buffers */
727 wait_for_ctlbuf:
728 while (commit_transaction->t_log_list != NULL) {
729 struct buffer_head *bh;
730
731 jh = commit_transaction->t_log_list->b_tprev;
732 bh = jh2bh(jh);
733 if (buffer_locked(bh)) {
734 wait_on_buffer(bh);
735 goto wait_for_ctlbuf;
736 }
737 if (cond_resched())
738 goto wait_for_ctlbuf;
739
740 if (unlikely(!buffer_uptodate(bh)))
741 err = -EIO;
742
743 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
744 clear_buffer_jwrite(bh);
745 jbd2_journal_unfile_buffer(journal, jh);
746 jbd2_journal_put_journal_head(jh);
747 __brelse(bh); /* One for getblk */
748 /* AKPM: bforget here */
749 }
750
751 jbd_debug(3, "JBD: commit phase 6\n");
752
753 if (journal_write_commit_record(journal, commit_transaction))
754 err = -EIO;
755
756 if (err)
757 __jbd2_journal_abort_hard(journal);
758
759 /* End of a transaction! Finally, we can do checkpoint
760 processing: any buffers committed as a result of this
761 transaction can be removed from any checkpoint list it was on
762 before. */
763
764 jbd_debug(3, "JBD: commit phase 7\n");
765
766 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
767 J_ASSERT(commit_transaction->t_buffers == NULL);
768 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
769 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
770 J_ASSERT(commit_transaction->t_shadow_list == NULL);
771 J_ASSERT(commit_transaction->t_log_list == NULL);
772
773restart_loop:
774 /*
775 * As there are other places (journal_unmap_buffer()) adding buffers
776 * to this list we have to be careful and hold the j_list_lock.
777 */
778 spin_lock(&journal->j_list_lock);
779 while (commit_transaction->t_forget) {
780 transaction_t *cp_transaction;
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_forget;
784 spin_unlock(&journal->j_list_lock);
785 bh = jh2bh(jh);
786 jbd_lock_bh_state(bh);
787 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
788 jh->b_transaction == journal->j_running_transaction);
789
790 /*
791 * If there is undo-protected committed data against
792 * this buffer, then we can remove it now. If it is a
793 * buffer needing such protection, the old frozen_data
794 * field now points to a committed version of the
795 * buffer, so rotate that field to the new committed
796 * data.
797 *
798 * Otherwise, we can just throw away the frozen data now.
799 */
800 if (jh->b_committed_data) {
801 jbd2_slab_free(jh->b_committed_data, bh->b_size);
802 jh->b_committed_data = NULL;
803 if (jh->b_frozen_data) {
804 jh->b_committed_data = jh->b_frozen_data;
805 jh->b_frozen_data = NULL;
806 }
807 } else if (jh->b_frozen_data) {
808 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
809 jh->b_frozen_data = NULL;
810 }
811
812 spin_lock(&journal->j_list_lock);
813 cp_transaction = jh->b_cp_transaction;
814 if (cp_transaction) {
815 JBUFFER_TRACE(jh, "remove from old cp transaction");
816 __jbd2_journal_remove_checkpoint(jh);
817 }
818
819 /* Only re-checkpoint the buffer_head if it is marked
820 * dirty. If the buffer was added to the BJ_Forget list
821 * by jbd2_journal_forget, it may no longer be dirty and
822 * there's no point in keeping a checkpoint record for
823 * it. */
824
825 /* A buffer which has been freed while still being
826 * journaled by a previous transaction may end up still
827 * being dirty here, but we want to avoid writing back
828 * that buffer in the future now that the last use has
829 * been committed. That's not only a performance gain,
830 * it also stops aliasing problems if the buffer is left
831 * behind for writeback and gets reallocated for another
832 * use in a different page. */
833 if (buffer_freed(bh)) {
834 clear_buffer_freed(bh);
835 clear_buffer_jbddirty(bh);
836 }
837
838 if (buffer_jbddirty(bh)) {
839 JBUFFER_TRACE(jh, "add to new checkpointing trans");
840 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
841 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
842 __jbd2_journal_refile_buffer(jh);
843 jbd_unlock_bh_state(bh);
844 } else {
845 J_ASSERT_BH(bh, !buffer_dirty(bh));
846 /* The buffer on BJ_Forget list and not jbddirty means
847 * it has been freed by this transaction and hence it
848 * could not have been reallocated until this
849 * transaction has committed. *BUT* it could be
850 * reallocated once we have written all the data to
851 * disk and before we process the buffer on BJ_Forget
852 * list. */
853 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
854 __jbd2_journal_refile_buffer(jh);
855 if (!jh->b_transaction) {
856 jbd_unlock_bh_state(bh);
857 /* needs a brelse */
858 jbd2_journal_remove_journal_head(bh);
859 release_buffer_page(bh);
860 } else
861 jbd_unlock_bh_state(bh);
862 }
863 cond_resched_lock(&journal->j_list_lock);
864 }
865 spin_unlock(&journal->j_list_lock);
866 /*
867 * This is a bit sleazy. We borrow j_list_lock to protect
868 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
869 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock, but
870 * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint
871 */
872 spin_lock(&journal->j_state_lock);
873 spin_lock(&journal->j_list_lock);
874 /*
875 * Now recheck if some buffers did not get attached to the transaction
876 * while the lock was dropped...
877 */
878 if (commit_transaction->t_forget) {
879 spin_unlock(&journal->j_list_lock);
880 spin_unlock(&journal->j_state_lock);
881 goto restart_loop;
882 }
883
884 /* Done with this transaction! */
885
886 jbd_debug(3, "JBD: commit phase 8\n");
887
888 J_ASSERT(commit_transaction->t_state == T_COMMIT);
889
890 commit_transaction->t_state = T_FINISHED;
891 J_ASSERT(commit_transaction == journal->j_committing_transaction);
892 journal->j_commit_sequence = commit_transaction->t_tid;
893 journal->j_committing_transaction = NULL;
894 spin_unlock(&journal->j_state_lock);
895
896 if (commit_transaction->t_checkpoint_list == NULL) {
897 __jbd2_journal_drop_transaction(journal, commit_transaction);
898 } else {
899 if (journal->j_checkpoint_transactions == NULL) {
900 journal->j_checkpoint_transactions = commit_transaction;
901 commit_transaction->t_cpnext = commit_transaction;
902 commit_transaction->t_cpprev = commit_transaction;
903 } else {
904 commit_transaction->t_cpnext =
905 journal->j_checkpoint_transactions;
906 commit_transaction->t_cpprev =
907 commit_transaction->t_cpnext->t_cpprev;
908 commit_transaction->t_cpnext->t_cpprev =
909 commit_transaction;
910 commit_transaction->t_cpprev->t_cpnext =
911 commit_transaction;
912 }
913 }
914 spin_unlock(&journal->j_list_lock);
915
916 jbd_debug(1, "JBD: commit %d complete, head %d\n",
917 journal->j_commit_sequence, journal->j_tail_sequence);
918
919 wake_up(&journal->j_wait_done_commit);
920}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c60f378b0f76
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2084 @@
1/*
2 * linux/fs/jbd2/journal.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem journal-writing code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages journals: areas of disk reserved for logging
16 * transactional updates. This includes the kernel journaling thread
17 * which is responsible for scheduling updates to the log.
18 *
19 * We do not actually manage the physical storage of the journal in this
20 * file: that is left to a per-journal policy function, which allows us
21 * to store the journal within a filesystem-specified area for ext2
22 * journaling (ext2 can use a reserved inode for storing the log).
23 */
24
25#include <linux/module.h>
26#include <linux/time.h>
27#include <linux/fs.h>
28#include <linux/jbd2.h>
29#include <linux/errno.h>
30#include <linux/slab.h>
31#include <linux/smp_lock.h>
32#include <linux/init.h>
33#include <linux/mm.h>
34#include <linux/suspend.h>
35#include <linux/pagemap.h>
36#include <linux/kthread.h>
37#include <linux/poison.h>
38#include <linux/proc_fs.h>
39
40#include <asm/uaccess.h>
41#include <asm/page.h>
42
43EXPORT_SYMBOL(jbd2_journal_start);
44EXPORT_SYMBOL(jbd2_journal_restart);
45EXPORT_SYMBOL(jbd2_journal_extend);
46EXPORT_SYMBOL(jbd2_journal_stop);
47EXPORT_SYMBOL(jbd2_journal_lock_updates);
48EXPORT_SYMBOL(jbd2_journal_unlock_updates);
49EXPORT_SYMBOL(jbd2_journal_get_write_access);
50EXPORT_SYMBOL(jbd2_journal_get_create_access);
51EXPORT_SYMBOL(jbd2_journal_get_undo_access);
52EXPORT_SYMBOL(jbd2_journal_dirty_data);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget);
56#if 0
57EXPORT_SYMBOL(journal_sync_buffer);
58#endif
59EXPORT_SYMBOL(jbd2_journal_flush);
60EXPORT_SYMBOL(jbd2_journal_revoke);
61
62EXPORT_SYMBOL(jbd2_journal_init_dev);
63EXPORT_SYMBOL(jbd2_journal_init_inode);
64EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_update_superblock);
72EXPORT_SYMBOL(jbd2_journal_abort);
73EXPORT_SYMBOL(jbd2_journal_errno);
74EXPORT_SYMBOL(jbd2_journal_ack_err);
75EXPORT_SYMBOL(jbd2_journal_clear_err);
76EXPORT_SYMBOL(jbd2_log_wait_commit);
77EXPORT_SYMBOL(jbd2_journal_start_commit);
78EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
79EXPORT_SYMBOL(jbd2_journal_wipe);
80EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
81EXPORT_SYMBOL(jbd2_journal_invalidatepage);
82EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
83EXPORT_SYMBOL(jbd2_journal_force_commit);
84
85static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
86static void __journal_abort_soft (journal_t *journal, int errno);
87static int jbd2_journal_create_jbd_slab(size_t slab_size);
88
89/*
90 * Helper function used to manage commit timeouts
91 */
92
93static void commit_timeout(unsigned long __data)
94{
95 struct task_struct * p = (struct task_struct *) __data;
96
97 wake_up_process(p);
98}
99
100/*
101 * kjournald2: The main thread function used to manage a logging device
102 * journal.
103 *
104 * This kernel thread is responsible for two things:
105 *
106 * 1) COMMIT: Every so often we need to commit the current state of the
107 * filesystem to disk. The journal thread is responsible for writing
108 * all of the metadata buffers to disk.
109 *
110 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
111 * of the data in that part of the log has been rewritten elsewhere on
112 * the disk. Flushing these old buffers to reclaim space in the log is
113 * known as checkpointing, and this thread is responsible for that job.
114 */
115
116static int kjournald2(void *arg)
117{
118 journal_t *journal = arg;
119 transaction_t *transaction;
120
121 /*
122 * Set up an interval timer which can be used to trigger a commit wakeup
123 * after the commit interval expires
124 */
125 setup_timer(&journal->j_commit_timer, commit_timeout,
126 (unsigned long)current);
127
128 /* Record that the journal thread is running */
129 journal->j_task = current;
130 wake_up(&journal->j_wait_done_commit);
131
132 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
133 journal->j_commit_interval / HZ);
134
135 /*
136 * And now, wait forever for commit wakeup events.
137 */
138 spin_lock(&journal->j_state_lock);
139
140loop:
141 if (journal->j_flags & JBD2_UNMOUNT)
142 goto end_loop;
143
144 jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
145 journal->j_commit_sequence, journal->j_commit_request);
146
147 if (journal->j_commit_sequence != journal->j_commit_request) {
148 jbd_debug(1, "OK, requests differ\n");
149 spin_unlock(&journal->j_state_lock);
150 del_timer_sync(&journal->j_commit_timer);
151 jbd2_journal_commit_transaction(journal);
152 spin_lock(&journal->j_state_lock);
153 goto loop;
154 }
155
156 wake_up(&journal->j_wait_done_commit);
157 if (freezing(current)) {
158 /*
159 * The simpler the better. Flushing the journal isn't a
160 * good idea, because it depends on threads that may
161 * already be stopped.
162 */
163 jbd_debug(1, "Now suspending kjournald2\n");
164 spin_unlock(&journal->j_state_lock);
165 refrigerator();
166 spin_lock(&journal->j_state_lock);
167 } else {
168 /*
169 * We assume on resume that commits are already there,
170 * so we don't sleep
171 */
172 DEFINE_WAIT(wait);
173 int should_sleep = 1;
174
175 prepare_to_wait(&journal->j_wait_commit, &wait,
176 TASK_INTERRUPTIBLE);
177 if (journal->j_commit_sequence != journal->j_commit_request)
178 should_sleep = 0;
179 transaction = journal->j_running_transaction;
180 if (transaction && time_after_eq(jiffies,
181 transaction->t_expires))
182 should_sleep = 0;
183 if (journal->j_flags & JBD2_UNMOUNT)
184 should_sleep = 0;
185 if (should_sleep) {
186 spin_unlock(&journal->j_state_lock);
187 schedule();
188 spin_lock(&journal->j_state_lock);
189 }
190 finish_wait(&journal->j_wait_commit, &wait);
191 }
192
193 jbd_debug(1, "kjournald2 wakes\n");
194
195 /*
196 * Were we woken up by a commit wakeup event?
197 */
198 transaction = journal->j_running_transaction;
199 if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
200 journal->j_commit_request = transaction->t_tid;
201 jbd_debug(1, "woke because of timeout\n");
202 }
203 goto loop;
204
205end_loop:
206 spin_unlock(&journal->j_state_lock);
207 del_timer_sync(&journal->j_commit_timer);
208 journal->j_task = NULL;
209 wake_up(&journal->j_wait_done_commit);
210 jbd_debug(1, "Journal thread exiting.\n");
211 return 0;
212}
213
214static void jbd2_journal_start_thread(journal_t *journal)
215{
216 kthread_run(kjournald2, journal, "kjournald2");
217 wait_event(journal->j_wait_done_commit, journal->j_task != 0);
218}
219
220static void journal_kill_thread(journal_t *journal)
221{
222 spin_lock(&journal->j_state_lock);
223 journal->j_flags |= JBD2_UNMOUNT;
224
225 while (journal->j_task) {
226 wake_up(&journal->j_wait_commit);
227 spin_unlock(&journal->j_state_lock);
228 wait_event(journal->j_wait_done_commit, journal->j_task == 0);
229 spin_lock(&journal->j_state_lock);
230 }
231 spin_unlock(&journal->j_state_lock);
232}
233
234/*
235 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
236 *
237 * Writes a metadata buffer to a given disk block. The actual IO is not
238 * performed but a new buffer_head is constructed which labels the data
239 * to be written with the correct destination disk block.
240 *
241 * Any magic-number escaping which needs to be done will cause a
242 * copy-out here. If the buffer happens to start with the
243 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
244 * magic number is only written to the log for descriptor blocks. In
245 * this case, we copy the data and replace the first word with 0, and we
246 * return a result code which indicates that this buffer needs to be
247 * marked as an escaped buffer in the corresponding log descriptor
248 * block. The missing word can then be restored when the block is read
249 * during recovery.
250 *
251 * If the source buffer has already been modified by a new transaction
252 * since we took the last commit snapshot, we use the frozen copy of
253 * that data for IO. If we end up using the existing buffer_head's data
254 * for the write, then we *have* to lock the buffer to prevent anyone
255 * else from using and possibly modifying it while the IO is in
256 * progress.
257 *
258 * The new buffer_head to be used for IO is returned via *jh_out.
259 *
260 * We assume that the journal has already been locked in this function.
261 *
262 * Return value:
263 * <0: Error
264 * >=0: Finished OK
265 *
266 * On success:
267 * Bit 0 set == escape performed on the data
268 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
269 */
270
271int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
272 struct journal_head *jh_in,
273 struct journal_head **jh_out,
274 unsigned long long blocknr)
275{
276 int need_copy_out = 0;
277 int done_copy_out = 0;
278 int do_escape = 0;
279 char *mapped_data;
280 struct buffer_head *new_bh;
281 struct journal_head *new_jh;
282 struct page *new_page;
283 unsigned int new_offset;
284 struct buffer_head *bh_in = jh2bh(jh_in);
285
286 /*
287 * The buffer really shouldn't be locked: only the current committing
288 * transaction is allowed to write it, so nobody else is allowed
289 * to do any IO.
290 *
291 * akpm: except if we're journalling data, and write() output is
292 * also part of a shared mapping, and another thread has
293 * decided to launch a writepage() against this buffer.
294 */
295 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
296
297 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
298
299 /*
300 * If a new transaction has already done a buffer copy-out, then
301 * we use that version of the data for the commit.
302 */
303 jbd_lock_bh_state(bh_in);
304repeat:
305 if (jh_in->b_frozen_data) {
306 done_copy_out = 1;
307 new_page = virt_to_page(jh_in->b_frozen_data);
308 new_offset = offset_in_page(jh_in->b_frozen_data);
309 } else {
310 new_page = jh2bh(jh_in)->b_page;
311 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
312 }
313
314 mapped_data = kmap_atomic(new_page, KM_USER0);
315 /*
316 * Check for escaping
317 */
318 if (*((__be32 *)(mapped_data + new_offset)) ==
319 cpu_to_be32(JBD2_MAGIC_NUMBER)) {
320 need_copy_out = 1;
321 do_escape = 1;
322 }
323 kunmap_atomic(mapped_data, KM_USER0);
324
325 /*
326 * Do we need to do a data copy?
327 */
328 if (need_copy_out && !done_copy_out) {
329 char *tmp;
330
331 jbd_unlock_bh_state(bh_in);
332 tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
333 jbd_lock_bh_state(bh_in);
334 if (jh_in->b_frozen_data) {
335 jbd2_slab_free(tmp, bh_in->b_size);
336 goto repeat;
337 }
338
339 jh_in->b_frozen_data = tmp;
340 mapped_data = kmap_atomic(new_page, KM_USER0);
341 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
342 kunmap_atomic(mapped_data, KM_USER0);
343
344 new_page = virt_to_page(tmp);
345 new_offset = offset_in_page(tmp);
346 done_copy_out = 1;
347 }
348
349 /*
350 * Did we need to do an escaping? Now we've done all the
351 * copying, we can finally do so.
352 */
353 if (do_escape) {
354 mapped_data = kmap_atomic(new_page, KM_USER0);
355 *((unsigned int *)(mapped_data + new_offset)) = 0;
356 kunmap_atomic(mapped_data, KM_USER0);
357 }
358
359 /* keep subsequent assertions sane */
360 new_bh->b_state = 0;
361 init_buffer(new_bh, NULL, NULL);
362 atomic_set(&new_bh->b_count, 1);
363 jbd_unlock_bh_state(bh_in);
364
365 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
366
367 set_bh_page(new_bh, new_page, new_offset);
368 new_jh->b_transaction = NULL;
369 new_bh->b_size = jh2bh(jh_in)->b_size;
370 new_bh->b_bdev = transaction->t_journal->j_dev;
371 new_bh->b_blocknr = blocknr;
372 set_buffer_mapped(new_bh);
373 set_buffer_dirty(new_bh);
374
375 *jh_out = new_jh;
376
377 /*
378 * The to-be-written buffer needs to get moved to the io queue,
379 * and the original buffer whose contents we are shadowing or
380 * copying is moved to the transaction's shadow queue.
381 */
382 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
383 jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
384 JBUFFER_TRACE(new_jh, "file as BJ_IO");
385 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
386
387 return do_escape | (done_copy_out << 1);
388}
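
A minimal userspace sketch of the escaping rule described in the comment
above: a data block that happens to begin with the journal magic number has
its first word zeroed before being logged, and an escape flag is recorded so
recovery can restore it. The copy-out machinery and on-disk byte-order
handling are omitted; the buffer contents are hypothetical.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MAGIC 0xC03B3998u        /* value of JBD2_MAGIC_NUMBER */

/* Returns 1 (caller would set JBD2_FLAG_ESCAPE in the tag) if the
 * block had to be escaped, 0 otherwise. */
static int escape_block(uint32_t *block)
{
        if (block[0] == SKETCH_MAGIC) {
                block[0] = 0;           /* restored from the tag on replay */
                return 1;
        }
        return 0;
}

int main(void)
{
        uint32_t block[4] = { SKETCH_MAGIC, 1, 2, 3 };

        assert(escape_block(block) == 1 && block[0] == 0);
        assert(escape_block(block) == 0);       /* already escaped */
        printf("escape handling ok\n");
        return 0;
}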
389
390/*
391 * Allocation code for the journal file. Manage the space left in the
392 * journal, so that we can begin checkpointing when appropriate.
393 */
394
395/*
396 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
397 *
398 * Called with the journal already locked.
399 *
400 * Called under j_state_lock
401 */
402
403int __jbd2_log_space_left(journal_t *journal)
404{
405 int left = journal->j_free;
406
407 assert_spin_locked(&journal->j_state_lock);
408
409 /*
410 * Be pessimistic here about the number of those free blocks which
411 * might be required for log descriptor control blocks.
412 */
413
414#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
415
416 left -= MIN_LOG_RESERVED_BLOCKS;
417
418 if (left <= 0)
419 return 0;
420 left -= (left >> 3);
421 return left;
422}
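
A worked standalone example of the pessimistic estimate above: subtract the
fixed descriptor reserve, then report only 7/8 of whatever remains. The
free-block counts below are hypothetical.

#include <assert.h>
#include <stdio.h>

static int log_space_left(int free)
{
        int left = free - 32;           /* MIN_LOG_RESERVED_BLOCKS */

        if (left <= 0)
                return 0;
        left -= left >> 3;              /* keep 7/8 of the remainder */
        return left;
}

int main(void)
{
        assert(log_space_left(1056) == 896);    /* 1024 - 1024/8 */
        assert(log_space_left(20) == 0);        /* below the reserve */
        printf("space estimate ok\n");
        return 0;
}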
423
424/*
425 * Called under j_state_lock. Returns true if a transaction was started.
426 */
427int __jbd2_log_start_commit(journal_t *journal, tid_t target)
428{
429 /*
430 * Are we already doing a recent enough commit?
431 */
432 if (!tid_geq(journal->j_commit_request, target)) {
433 /*
434 * We want a new commit: OK, mark the request and wake up the
435 * commit thread. We do _not_ do the commit ourselves.
436 */
437
438 journal->j_commit_request = target;
439 jbd_debug(1, "JBD: requesting commit %d/%d\n",
440 journal->j_commit_request,
441 journal->j_commit_sequence);
442 wake_up(&journal->j_wait_commit);
443 return 1;
444 }
445 return 0;
446}
447
448int jbd2_log_start_commit(journal_t *journal, tid_t tid)
449{
450 int ret;
451
452 spin_lock(&journal->j_state_lock);
453 ret = __jbd2_log_start_commit(journal, tid);
454 spin_unlock(&journal->j_state_lock);
455 return ret;
456}
457
458/*
459 * Force and wait upon a commit if the calling process is not within
460 * transaction. This is used for forcing out undo-protected data which contains
461 * bitmaps, when the fs is running out of space.
462 *
463 * We can only force the running transaction if we don't have an active handle;
464 * otherwise, we will deadlock.
465 *
466 * Returns true if a transaction was started.
467 */
468int jbd2_journal_force_commit_nested(journal_t *journal)
469{
470 transaction_t *transaction = NULL;
471 tid_t tid;
472
473 spin_lock(&journal->j_state_lock);
474 if (journal->j_running_transaction && !current->journal_info) {
475 transaction = journal->j_running_transaction;
476 __jbd2_log_start_commit(journal, transaction->t_tid);
477 } else if (journal->j_committing_transaction)
478 transaction = journal->j_committing_transaction;
479
480 if (!transaction) {
481 spin_unlock(&journal->j_state_lock);
482 return 0; /* Nothing to retry */
483 }
484
485 tid = transaction->t_tid;
486 spin_unlock(&journal->j_state_lock);
487 jbd2_log_wait_commit(journal, tid);
488 return 1;
489}
490
491/*
492 * Start a commit of the current running transaction (if any). Returns true
493 * if a transaction was started, and fills its tid in at *ptid
494 */
495int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
496{
497 int ret = 0;
498
499 spin_lock(&journal->j_state_lock);
500 if (journal->j_running_transaction) {
501 tid_t tid = journal->j_running_transaction->t_tid;
502
503 ret = __jbd2_log_start_commit(journal, tid);
504 if (ret && ptid)
505 *ptid = tid;
506 } else if (journal->j_committing_transaction && ptid) {
507 /*
508 * If ext3_write_super() recently started a commit, then we
509 * have to wait for completion of that transaction
510 */
511 *ptid = journal->j_committing_transaction->t_tid;
512 ret = 1;
513 }
514 spin_unlock(&journal->j_state_lock);
515 return ret;
516}
517
518/*
519 * Wait for a specified commit to complete.
520 * The caller may not hold the journal lock.
521 */
522int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
523{
524 int err = 0;
525
526#ifdef CONFIG_JBD_DEBUG
527 spin_lock(&journal->j_state_lock);
528 if (!tid_geq(journal->j_commit_request, tid)) {
529 printk(KERN_EMERG
530 "%s: error: j_commit_request=%d, tid=%d\n",
531 __FUNCTION__, journal->j_commit_request, tid);
532 }
533 spin_unlock(&journal->j_state_lock);
534#endif
535 spin_lock(&journal->j_state_lock);
536 while (tid_gt(tid, journal->j_commit_sequence)) {
537 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
538 tid, journal->j_commit_sequence);
539 wake_up(&journal->j_wait_commit);
540 spin_unlock(&journal->j_state_lock);
541 wait_event(journal->j_wait_done_commit,
542 !tid_gt(tid, journal->j_commit_sequence));
543 spin_lock(&journal->j_state_lock);
544 }
545 spin_unlock(&journal->j_state_lock);
546
547 if (unlikely(is_journal_aborted(journal))) {
548 printk(KERN_EMERG "journal commit I/O error\n");
549 err = -EIO;
550 }
551 return err;
552}
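
The tid_gt()/tid_geq() comparisons used above treat transaction ids as 32-bit
sequence numbers that may wrap, comparing them via the sign of the difference.
A hedged standalone sketch of that comparison; the helper and typedef names
are invented here, the real helpers live in the jbd2 headers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t sketch_tid_t;

static int sketch_tid_gt(sketch_tid_t x, sketch_tid_t y)
{
        int32_t diff = (int32_t)(x - y);        /* wrap-safe signed distance */
        return diff > 0;
}

int main(void)
{
        assert(sketch_tid_gt(5, 3));
        assert(!sketch_tid_gt(3, 5));
        /* 2 counts as "after" 0xFFFFFFFF once the counter wraps */
        assert(sketch_tid_gt(2, 0xFFFFFFFFu));
        printf("wrap-safe tid compare ok\n");
        return 0;
}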
553
554/*
555 * Log buffer allocation routines:
556 */
557
558int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
559{
560 unsigned long blocknr;
561
562 spin_lock(&journal->j_state_lock);
563 J_ASSERT(journal->j_free > 1);
564
565 blocknr = journal->j_head;
566 journal->j_head++;
567 journal->j_free--;
568 if (journal->j_head == journal->j_last)
569 journal->j_head = journal->j_first;
570 spin_unlock(&journal->j_state_lock);
571 return jbd2_journal_bmap(journal, blocknr, retp);
572}
573
574/*
575 * Conversion of logical to physical block numbers for the journal
576 *
577 * On external journals the journal blocks are identity-mapped, so
578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready.
580 */
581int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long long *retp)
583{
584 int err = 0;
585 unsigned long long ret;
586
587 if (journal->j_inode) {
588 ret = bmap(journal->j_inode, blocknr);
589 if (ret)
590 *retp = ret;
591 else {
592 char b[BDEVNAME_SIZE];
593
594 printk(KERN_ALERT "%s: journal block not found "
595 "at offset %lu on %s\n",
596 __FUNCTION__,
597 blocknr,
598 bdevname(journal->j_dev, b));
599 err = -EIO;
600 __journal_abort_soft(journal, err);
601 }
602 } else {
603 *retp = blocknr; /* +journal->j_blk_offset */
604 }
605 return err;
606}
607
608/*
609 * We play buffer_head aliasing tricks to write data/metadata blocks to
610 * the journal without copying their contents, but for journal
611 * descriptor blocks we do need to generate bona fide buffers.
612 *
613 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
614 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
615 * But we don't bother doing that, so there will be coherency problems with
616 * mmaps of blockdevs which hold live JBD-controlled filesystems.
617 */
618struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
619{
620 struct buffer_head *bh;
621 unsigned long long blocknr;
622 int err;
623
624 err = jbd2_journal_next_log_block(journal, &blocknr);
625
626 if (err)
627 return NULL;
628
629 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
630 lock_buffer(bh);
631 memset(bh->b_data, 0, journal->j_blocksize);
632 set_buffer_uptodate(bh);
633 unlock_buffer(bh);
634 BUFFER_TRACE(bh, "return this buffer");
635 return jbd2_journal_add_journal_head(bh);
636}
637
638/*
639 * Management for journal control blocks: functions to create and
640 * destroy journal_t structures, and to initialise and read existing
641 * journal blocks from disk. */
642
643/* First: create and set up a journal_t object in memory. We initialise
644 * very few fields yet: that has to wait until we have created the
645 * journal structures from scratch, or loaded them from disk. */
646
647static journal_t * journal_init_common (void)
648{
649 journal_t *journal;
650 int err;
651
652 journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
653 if (!journal)
654 goto fail;
655 memset(journal, 0, sizeof(*journal));
656
657 init_waitqueue_head(&journal->j_wait_transaction_locked);
658 init_waitqueue_head(&journal->j_wait_logspace);
659 init_waitqueue_head(&journal->j_wait_done_commit);
660 init_waitqueue_head(&journal->j_wait_checkpoint);
661 init_waitqueue_head(&journal->j_wait_commit);
662 init_waitqueue_head(&journal->j_wait_updates);
663 mutex_init(&journal->j_barrier);
664 mutex_init(&journal->j_checkpoint_mutex);
665 spin_lock_init(&journal->j_revoke_lock);
666 spin_lock_init(&journal->j_list_lock);
667 spin_lock_init(&journal->j_state_lock);
668
669 journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
670
671 /* The journal is marked for error until we succeed with recovery! */
672 journal->j_flags = JBD2_ABORT;
673
674 /* Set up a default-sized revoke table for the new mount. */
675 err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
676 if (err) {
677 kfree(journal);
678 goto fail;
679 }
680 return journal;
681fail:
682 return NULL;
683}
684
685/* jbd2_journal_init_dev and jbd2_journal_init_inode:
686 *
687 * Create a journal structure and assign it some fixed set of disk
688 * blocks. We don't actually touch those disk blocks yet, but we
689 * need to set up all of the mapping information to tell the journaling
690 * system where the journal blocks are.
691 *
692 */
693
694/**
695 * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which holds the journalled filesystem for this journal.
698 * @start: Block number of the start of the journal.
699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t *
702 *
703 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device.
705 *
706 */
707journal_t * jbd2_journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev,
709 unsigned long long start, int len, int blocksize)
710{
711 journal_t *journal = journal_init_common();
712 struct buffer_head *bh;
713 int n;
714
715 if (!journal)
716 return NULL;
717
718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
721 journal->j_wbufsize = n;
722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
723 if (!journal->j_wbuf) {
724 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
725 __FUNCTION__);
726 kfree(journal);
727 journal = NULL;
728 goto out;
729 }
730 journal->j_dev = bdev;
731 journal->j_fs_dev = fs_dev;
732 journal->j_blk_offset = start;
733 journal->j_maxlen = len;
734
735 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
736 J_ASSERT(bh != NULL);
737 journal->j_sb_buffer = bh;
738 journal->j_superblock = (journal_superblock_t *)bh->b_data;
739out:
740 return journal;
741}
742
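/*
 * Usage sketch (not compiled). The device pointers and geometry below
 * are illustrative assumptions, not requirements of the API: 8192
 * journal blocks of 4K each, starting at block 1 of a dedicated
 * journal device jdev, for a filesystem living on fs_bdev.
 */
#if 0
 journal_t *j;

 j = jbd2_journal_init_dev(jdev, fs_bdev, 1, 8192, 4096);
 if (!j)
 return -ENOMEM;
#endif
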
743/**
744 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
745 * @inode: An inode to create the journal in
746 *
747 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
748 * the journal. The inode must exist already, must support bmap() and
749 * must have all data blocks preallocated.
750 */
751journal_t * jbd2_journal_init_inode (struct inode *inode)
752{
753 struct buffer_head *bh;
754 journal_t *journal = journal_init_common();
755 int err;
756 int n;
757 unsigned long long blocknr;
758
759 if (!journal)
760 return NULL;
761
762 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
763 journal->j_inode = inode;
764 jbd_debug(1,
765 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
766 journal, inode->i_sb->s_id, inode->i_ino,
767 (long long) inode->i_size,
768 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
769
770 journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
771 journal->j_blocksize = inode->i_sb->s_blocksize;
772
773 /* journal descriptor can store up to n blocks -bzzz */
774 n = journal->j_blocksize / sizeof(journal_block_tag_t);
775 journal->j_wbufsize = n;
776 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
777 if (!journal->j_wbuf) {
778 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
779 __FUNCTION__);
780 kfree(journal);
781 return NULL;
782 }
783
784 err = jbd2_journal_bmap(journal, 0, &blocknr);
785 /* If that failed, give up */
786 if (err) {
787 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
788 __FUNCTION__);
789 kfree(journal);
790 return NULL;
791 }
792
793 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
794 J_ASSERT(bh != NULL);
795 journal->j_sb_buffer = bh;
796 journal->j_superblock = (journal_superblock_t *)bh->b_data;
797
798 return journal;
799}
800
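/*
 * Usage sketch (not compiled): a filesystem such as ext4 passes in its
 * reserved journal inode. The journal_inode variable is assumed to be
 * a fully read, preallocated inode that supports bmap().
 */
#if 0
 journal_t *j = jbd2_journal_init_inode(journal_inode);

 if (!j)
 return -EINVAL;
#endif
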
801/*
802 * If the journal init or create aborts, we need to mark the journal
803 * superblock as being NULL to prevent the journal destroy from writing
804 * back a bogus superblock.
805 */
806static void journal_fail_superblock (journal_t *journal)
807{
808 struct buffer_head *bh = journal->j_sb_buffer;
809 brelse(bh);
810 journal->j_sb_buffer = NULL;
811}
812
813/*
814 * Given a journal_t structure, initialise the various fields for
815 * startup of a new journaling session. We use this both when creating
816 * a journal, and after recovering an old journal to reset it for
817 * subsequent use.
818 */
819
820static int journal_reset(journal_t *journal)
821{
822 journal_superblock_t *sb = journal->j_superblock;
823 unsigned long long first, last;
824
825 first = be32_to_cpu(sb->s_first);
826 last = be32_to_cpu(sb->s_maxlen);
827
828 journal->j_first = first;
829 journal->j_last = last;
830
831 journal->j_head = first;
832 journal->j_tail = first;
833 journal->j_free = last - first;
834
835 journal->j_tail_sequence = journal->j_transaction_sequence;
836 journal->j_commit_sequence = journal->j_transaction_sequence - 1;
837 journal->j_commit_request = journal->j_commit_sequence;
838
839 journal->j_max_transaction_buffers = journal->j_maxlen / 4;
840
841 /* Add the dynamic fields and write it to disk. */
842 jbd2_journal_update_superblock(journal, 1);
843 jbd2_journal_start_thread(journal);
844 return 0;
845}
846
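/*
 * Worked example of the reset arithmetic above (values assumed): with
 * s_first == 1 and s_maxlen == 8192, the log occupies blocks 1..8191,
 * j_free starts at 8191, and j_max_transaction_buffers becomes
 * 8192 / 4 == 2048 buffers per transaction.
 */
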
847/**
848 * int jbd2_journal_create() - Initialise the new journal file
849 * @journal: Journal to create. This structure must have been initialised
850 *
851 * Given a journal_t structure which tells us which disk blocks we can
852 * use, create a new journal superblock and initialise all of the
853 * journal fields from scratch.
854 **/
855int jbd2_journal_create(journal_t *journal)
856{
857 unsigned long long blocknr;
858 struct buffer_head *bh;
859 journal_superblock_t *sb;
860 int i, err;
861
862 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
863 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
864 journal->j_maxlen);
865 journal_fail_superblock(journal);
866 return -EINVAL;
867 }
868
869 if (journal->j_inode == NULL) {
870 /*
871 * We don't know what block to start at!
872 */
873 printk(KERN_EMERG
874 "%s: creation of journal on external device!\n",
875 __FUNCTION__);
876 BUG();
877 }
878
879 /* Zero out the entire journal on disk. We cannot afford to
880 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
881 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
882 for (i = 0; i < journal->j_maxlen; i++) {
883 err = jbd2_journal_bmap(journal, i, &blocknr);
884 if (err)
885 return err;
886 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
887 lock_buffer(bh);
888 memset (bh->b_data, 0, journal->j_blocksize);
889 BUFFER_TRACE(bh, "marking dirty");
890 mark_buffer_dirty(bh);
891 BUFFER_TRACE(bh, "marking uptodate");
892 set_buffer_uptodate(bh);
893 unlock_buffer(bh);
894 __brelse(bh);
895 }
896
897 sync_blockdev(journal->j_dev);
898 jbd_debug(1, "JBD: journal cleared.\n");
899
900 /* OK, fill in the initial static fields in the new superblock */
901 sb = journal->j_superblock;
902
903 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
904 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
905
906 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
907 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
908 sb->s_first = cpu_to_be32(1);
909
910 journal->j_transaction_sequence = 1;
911
912 journal->j_flags &= ~JBD2_ABORT;
913 journal->j_format_version = 2;
914
915 return journal_reset(journal);
916}
917
918/**
919 * void jbd2_journal_update_superblock() - Update journal sb on disk.
920 * @journal: The journal to update.
921 * @wait: Set to '0' if you don't want to wait for IO completion.
922 *
923 * Update a journal's dynamic superblock fields and write it to disk,
924 * optionally waiting for the IO to complete.
925 */
926void jbd2_journal_update_superblock(journal_t *journal, int wait)
927{
928 journal_superblock_t *sb = journal->j_superblock;
929 struct buffer_head *bh = journal->j_sb_buffer;
930
931 /*
932 * As a special case, if the on-disk copy is already marked as needing
933 * no recovery (s_start == 0) and there are no outstanding transactions
934 * in the filesystem, then we can safely defer the superblock update
935 * until the next commit by setting JBD2_FLUSHED. This avoids
936 * attempting a write to a potentially read-only device.
937 */
938 if (sb->s_start == 0 && journal->j_tail_sequence ==
939 journal->j_transaction_sequence) {
940 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
941 "(start %ld, seq %d, errno %d)\n",
942 journal->j_tail, journal->j_tail_sequence,
943 journal->j_errno);
944 goto out;
945 }
946
947 spin_lock(&journal->j_state_lock);
948 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
949 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
950
951 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
952 sb->s_start = cpu_to_be32(journal->j_tail);
953 sb->s_errno = cpu_to_be32(journal->j_errno);
954 spin_unlock(&journal->j_state_lock);
955
956 BUFFER_TRACE(bh, "marking dirty");
957 mark_buffer_dirty(bh);
958 if (wait)
959 sync_dirty_buffer(bh);
960 else
961 ll_rw_block(SWRITE, 1, &bh);
962
963out:
964 /* If we have just flushed the log (by marking s_start==0), then
965 * any future commit will have to be careful to update the
966 * superblock again to re-record the true start of the log. */
967
968 spin_lock(&journal->j_state_lock);
969 if (sb->s_start)
970 journal->j_flags &= ~JBD2_FLUSHED;
971 else
972 journal->j_flags |= JBD2_FLUSHED;
973 spin_unlock(&journal->j_state_lock);
974}
975
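/*
 * Usage sketch (not compiled): the wait flag simply selects between a
 * synchronous and an asynchronous superblock write.
 */
#if 0
 jbd2_journal_update_superblock(journal, 1); /* write and wait */
 jbd2_journal_update_superblock(journal, 0); /* write, don't wait */
#endif
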
976/*
977 * Read the superblock for a given journal, performing initial
978 * validation of the format.
979 */
980
981static int journal_get_superblock(journal_t *journal)
982{
983 struct buffer_head *bh;
984 journal_superblock_t *sb;
985 int err = -EIO;
986
987 bh = journal->j_sb_buffer;
988
989 J_ASSERT(bh != NULL);
990 if (!buffer_uptodate(bh)) {
991 ll_rw_block(READ, 1, &bh);
992 wait_on_buffer(bh);
993 if (!buffer_uptodate(bh)) {
994 printk (KERN_ERR
995 "JBD: IO error reading journal superblock\n");
996 goto out;
997 }
998 }
999
1000 sb = journal->j_superblock;
1001
1002 err = -EINVAL;
1003
1004 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1005 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1006 printk(KERN_WARNING "JBD: no valid journal superblock found\n");
1007 goto out;
1008 }
1009
1010 switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1011 case JBD2_SUPERBLOCK_V1:
1012 journal->j_format_version = 1;
1013 break;
1014 case JBD2_SUPERBLOCK_V2:
1015 journal->j_format_version = 2;
1016 break;
1017 default:
1018 printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
1019 goto out;
1020 }
1021
1022 if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1023 journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1024 else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1025 printk (KERN_WARNING "JBD: journal file too short\n");
1026 goto out;
1027 }
1028
1029 return 0;
1030
1031out:
1032 journal_fail_superblock(journal);
1033 return err;
1034}
1035
1036/*
1037 * Load the on-disk journal superblock and read the key fields into the
1038 * journal_t.
1039 */
1040
1041static int load_superblock(journal_t *journal)
1042{
1043 int err;
1044 journal_superblock_t *sb;
1045
1046 err = journal_get_superblock(journal);
1047 if (err)
1048 return err;
1049
1050 sb = journal->j_superblock;
1051
1052 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1053 journal->j_tail = be32_to_cpu(sb->s_start);
1054 journal->j_first = be32_to_cpu(sb->s_first);
1055 journal->j_last = be32_to_cpu(sb->s_maxlen);
1056 journal->j_errno = be32_to_cpu(sb->s_errno);
1057
1058 return 0;
1059}
1060
1061
1062/**
1063 * int jbd2_journal_load() - Read journal from disk.
1064 * @journal: Journal to act on.
1065 *
1066 * Given a journal_t structure which tells us which disk blocks contain
1067 * a journal, read the journal from disk to initialise the in-memory
1068 * structures.
1069 */
1070int jbd2_journal_load(journal_t *journal)
1071{
1072 int err;
1073 journal_superblock_t *sb;
1074
1075 err = load_superblock(journal);
1076 if (err)
1077 return err;
1078
1079 sb = journal->j_superblock;
1080 /* If this is a V2 superblock, then we have to check the
1081 * features flags on it. */
1082
1083 if (journal->j_format_version >= 2) {
1084 if ((sb->s_feature_ro_compat &
1085 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1086 (sb->s_feature_incompat &
1087 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1088 printk (KERN_WARNING
1089 "JBD: Unrecognised features on journal\n");
1090 return -EINVAL;
1091 }
1092 }
1093
1094 /*
1095 * Create a slab for this blocksize
1096 */
1097 err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
1098 if (err)
1099 return err;
1100
1101 /* Let the recovery code check whether it needs to recover any
1102 * data from the journal. */
1103 if (jbd2_journal_recover(journal))
1104 goto recovery_error;
1105
1106 /* OK, we've finished with the dynamic journal bits:
1107 * reinitialise the dynamic contents of the superblock in memory
1108 * and reset them on disk. */
1109 if (journal_reset(journal))
1110 goto recovery_error;
1111
1112 journal->j_flags &= ~JBD2_ABORT;
1113 journal->j_flags |= JBD2_LOADED;
1114 return 0;
1115
1116recovery_error:
1117 printk (KERN_WARNING "JBD: recovery failed\n");
1118 return -EIO;
1119}
1120
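/*
 * Mount-time sketch (not compiled): the usual sequence is to build the
 * journal_t and then load it, which replays the log if recovery is
 * needed. The journal_inode variable is assumed; error handling is
 * trimmed to the essentials.
 */
#if 0
 journal_t *j = jbd2_journal_init_inode(journal_inode);

 if (!j)
 return -EINVAL;
 if (jbd2_journal_load(j)) {
 jbd2_journal_destroy(j);
 return -EIO;
 }
#endif
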
1121/**
1122 * void jbd2_journal_destroy() - Release a journal_t structure.
1123 * @journal: Journal to act on.
1124 *
1125 * Release a journal_t structure once it is no longer in use by the
1126 * journaled object.
1127 */
1128void jbd2_journal_destroy(journal_t *journal)
1129{
1130 /* Wait for the commit thread to wake up and die. */
1131 journal_kill_thread(journal);
1132
1133 /* Force a final log commit */
1134 if (journal->j_running_transaction)
1135 jbd2_journal_commit_transaction(journal);
1136
1137 /* Force any old transactions to disk */
1138
1139 /* Totally anal locking here... */
1140 spin_lock(&journal->j_list_lock);
1141 while (journal->j_checkpoint_transactions != NULL) {
1142 spin_unlock(&journal->j_list_lock);
1143 jbd2_log_do_checkpoint(journal);
1144 spin_lock(&journal->j_list_lock);
1145 }
1146
1147 J_ASSERT(journal->j_running_transaction == NULL);
1148 J_ASSERT(journal->j_committing_transaction == NULL);
1149 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1150 spin_unlock(&journal->j_list_lock);
1151
1152 /* We can now mark the journal as empty. */
1153 journal->j_tail = 0;
1154 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1155 if (journal->j_sb_buffer) {
1156 jbd2_journal_update_superblock(journal, 1);
1157 brelse(journal->j_sb_buffer);
1158 }
1159
1160 if (journal->j_inode)
1161 iput(journal->j_inode);
1162 if (journal->j_revoke)
1163 jbd2_journal_destroy_revoke(journal);
1164 kfree(journal->j_wbuf);
1165 kfree(journal);
1166}
1167
1168
1169/**
1170 * int jbd2_journal_check_used_features () - Check if features specified are used.
1171 * @journal: Journal to check.
1172 * @compat: bitmask of compatible features
1173 * @ro: bitmask of features that force read-only mount
1174 * @incompat: bitmask of incompatible features
1175 *
1176 * Check whether the journal uses all of a given set of
1177 * features. Return true (non-zero) if it does.
1178 **/
1179
1180int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1181 unsigned long ro, unsigned long incompat)
1182{
1183 journal_superblock_t *sb;
1184
1185 if (!compat && !ro && !incompat)
1186 return 1;
1187 if (journal->j_format_version == 1)
1188 return 0;
1189
1190 sb = journal->j_superblock;
1191
1192 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1193 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1194 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1195 return 1;
1196
1197 return 0;
1198}
1199
1200/**
1201 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1202 * @journal: Journal to check.
1203 * @compat: bitmask of compatible features
1204 * @ro: bitmask of features that force read-only mount
1205 * @incompat: bitmask of incompatible features
1206 *
1207 * Check whether the journaling code supports the use of
1208 * all of a given set of features on this journal. Return true
1209 * (non-zero) if it can. */
1210
1211int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1212 unsigned long ro, unsigned long incompat)
1213{
1214 journal_superblock_t *sb;
1215
1216 if (!compat && !ro && !incompat)
1217 return 1;
1218
1219 sb = journal->j_superblock;
1220
1221 /* We can support any known requested features iff the
1222 * superblock is in version 2. Otherwise we fail to support any
1223 * extended sb features. */
1224
1225 if (journal->j_format_version != 2)
1226 return 0;
1227
1228 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1229 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1230 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1231 return 1;
1232
1233 return 0;
1234}
1235
1236/**
1237 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1238 * @journal: Journal to act on.
1239 * @compat: bitmask of compatible features
1240 * @ro: bitmask of features that force read-only mount
1241 * @incompat: bitmask of incompatible features
1242 *
1243 * Mark a given journal feature as present on the
1244 * superblock. Returns true if the requested features could be set.
1245 *
1246 */
1247
1248int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1249 unsigned long ro, unsigned long incompat)
1250{
1251 journal_superblock_t *sb;
1252
1253 if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1254 return 1;
1255
1256 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1257 return 0;
1258
1259 jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1260 compat, ro, incompat);
1261
1262 sb = journal->j_superblock;
1263
1264 sb->s_feature_compat |= cpu_to_be32(compat);
1265 sb->s_feature_ro_compat |= cpu_to_be32(ro);
1266 sb->s_feature_incompat |= cpu_to_be32(incompat);
1267
1268 return 1;
1269}
1270
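/*
 * Usage sketch (not compiled): e.g. switching the journal to 64-bit
 * block numbers. A zero return means the journalling layer does not
 * support the requested bits.
 */
#if 0
 if (!jbd2_journal_set_features(journal, 0, 0,
 JBD2_FEATURE_INCOMPAT_64BIT))
 printk(KERN_ERR "cannot enable 64-bit journal feature\n");
#endif
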
1271
1272/**
1273 * int jbd2_journal_update_format () - Update on-disk journal structure.
1274 * @journal: Journal to act on.
1275 *
1276 * Given an initialised but unloaded journal struct, poke about in the
1277 * on-disk structure to update it to the most recent supported version.
1278 */
1279int jbd2_journal_update_format (journal_t *journal)
1280{
1281 journal_superblock_t *sb;
1282 int err;
1283
1284 err = journal_get_superblock(journal);
1285 if (err)
1286 return err;
1287
1288 sb = journal->j_superblock;
1289
1290 switch (be32_to_cpu(sb->s_header.h_blocktype)) {
1291 case JBD2_SUPERBLOCK_V2:
1292 return 0;
1293 case JBD2_SUPERBLOCK_V1:
1294 return journal_convert_superblock_v1(journal, sb);
1295 default:
1296 break;
1297 }
1298 return -EINVAL;
1299}
1300
1301static int journal_convert_superblock_v1(journal_t *journal,
1302 journal_superblock_t *sb)
1303{
1304 int offset, blocksize;
1305 struct buffer_head *bh;
1306
1307 printk(KERN_WARNING
1308 "JBD: Converting superblock from version 1 to 2.\n");
1309
1310 /* Pre-initialise new fields to zero */
1311 offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
1312 blocksize = be32_to_cpu(sb->s_blocksize);
1313 memset(&sb->s_feature_compat, 0, blocksize-offset);
1314
1315 sb->s_nr_users = cpu_to_be32(1);
1316 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1317 journal->j_format_version = 2;
1318
1319 bh = journal->j_sb_buffer;
1320 BUFFER_TRACE(bh, "marking dirty");
1321 mark_buffer_dirty(bh);
1322 sync_dirty_buffer(bh);
1323 return 0;
1324}
1325
1326
1327/**
1328 * int jbd2_journal_flush () - Flush journal
1329 * @journal: Journal to act on.
1330 *
1331 * Flush all data for a given journal to disk and empty the journal.
1332 * Filesystems can use this when remounting readonly to ensure that
1333 * recovery does not need to happen on remount.
1334 */
1335
1336int jbd2_journal_flush(journal_t *journal)
1337{
1338 int err = 0;
1339 transaction_t *transaction = NULL;
1340 unsigned long old_tail;
1341
1342 spin_lock(&journal->j_state_lock);
1343
1344 /* Force everything buffered to the log... */
1345 if (journal->j_running_transaction) {
1346 transaction = journal->j_running_transaction;
1347 __jbd2_log_start_commit(journal, transaction->t_tid);
1348 } else if (journal->j_committing_transaction)
1349 transaction = journal->j_committing_transaction;
1350
1351 /* Wait for the log commit to complete... */
1352 if (transaction) {
1353 tid_t tid = transaction->t_tid;
1354
1355 spin_unlock(&journal->j_state_lock);
1356 jbd2_log_wait_commit(journal, tid);
1357 } else {
1358 spin_unlock(&journal->j_state_lock);
1359 }
1360
1361 /* ...and flush everything in the log out to disk. */
1362 spin_lock(&journal->j_list_lock);
1363 while (!err && journal->j_checkpoint_transactions != NULL) {
1364 spin_unlock(&journal->j_list_lock);
1365 err = jbd2_log_do_checkpoint(journal);
1366 spin_lock(&journal->j_list_lock);
1367 }
1368 spin_unlock(&journal->j_list_lock);
1369 jbd2_cleanup_journal_tail(journal);
1370
1371 /* Finally, mark the journal as really needing no recovery.
1372 * This sets s_start==0 in the underlying superblock, which is
1373 * the magic code for a fully-recovered superblock. Any future
1374 * commits of data to the journal will restore the current
1375 * s_start value. */
1376 spin_lock(&journal->j_state_lock);
1377 old_tail = journal->j_tail;
1378 journal->j_tail = 0;
1379 spin_unlock(&journal->j_state_lock);
1380 jbd2_journal_update_superblock(journal, 1);
1381 spin_lock(&journal->j_state_lock);
1382 journal->j_tail = old_tail;
1383
1384 J_ASSERT(!journal->j_running_transaction);
1385 J_ASSERT(!journal->j_committing_transaction);
1386 J_ASSERT(!journal->j_checkpoint_transactions);
1387 J_ASSERT(journal->j_head == journal->j_tail);
1388 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1389 spin_unlock(&journal->j_state_lock);
1390 return err;
1391}
1392
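/*
 * Usage sketch (not compiled): a filesystem remounting read-only can
 * empty the log so that no recovery is needed if the machine dies
 * while it is read-only.
 */
#if 0
 int err = jbd2_journal_flush(journal);

 if (err < 0)
 printk(KERN_ERR "error %d flushing journal\n", err);
#endif
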
1393/**
1394 * int jbd2_journal_wipe() - Wipe journal contents
1395 * @journal: Journal to act on.
1396 * @write: flag (see below)
1397 *
1398 * Wipe out all of the contents of a journal, safely. This will produce
1399 * a warning if the journal contains any valid recovery information.
1400 * Must be called between journal_init_*() and jbd2_journal_load().
1401 *
1402 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
1403 * we merely suppress recovery.
1404 */
1405
1406int jbd2_journal_wipe(journal_t *journal, int write)
1407{
1408 journal_superblock_t *sb;
1409 int err = 0;
1410
1411 J_ASSERT (!(journal->j_flags & JBD2_LOADED));
1412
1413 err = load_superblock(journal);
1414 if (err)
1415 return err;
1416
1417 sb = journal->j_superblock;
1418
1419 if (!journal->j_tail)
1420 goto no_recovery;
1421
1422 printk (KERN_WARNING "JBD: %s recovery information on journal\n",
1423 write ? "Clearing" : "Ignoring");
1424
1425 err = jbd2_journal_skip_recovery(journal);
1426 if (write)
1427 jbd2_journal_update_superblock(journal, 1);
1428
1429 no_recovery:
1430 return err;
1431}
1432
1433/*
1434 * journal_dev_name: format a character string to describe on what
1435 * device this journal is present.
1436 */
1437
1438static const char *journal_dev_name(journal_t *journal, char *buffer)
1439{
1440 struct block_device *bdev;
1441
1442 if (journal->j_inode)
1443 bdev = journal->j_inode->i_sb->s_bdev;
1444 else
1445 bdev = journal->j_dev;
1446
1447 return bdevname(bdev, buffer);
1448}
1449
1450/*
1451 * Journal abort has very specific semantics; the full description
1452 * lives with jbd2_journal_abort() below.
1453 *
1454 * Two internal functions, which provide abort to the jbd layer
1455 * itself, are here.
1456 */
1457
1458/*
1459 * Quick version for internal journal use (doesn't lock the journal).
1460 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
1461 * and don't attempt to make any other journal updates.
1462 */
1463void __jbd2_journal_abort_hard(journal_t *journal)
1464{
1465 transaction_t *transaction;
1466 char b[BDEVNAME_SIZE];
1467
1468 if (journal->j_flags & JBD2_ABORT)
1469 return;
1470
1471 printk(KERN_ERR "Aborting journal on device %s.\n",
1472 journal_dev_name(journal, b));
1473
1474 spin_lock(&journal->j_state_lock);
1475 journal->j_flags |= JBD2_ABORT;
1476 transaction = journal->j_running_transaction;
1477 if (transaction)
1478 __jbd2_log_start_commit(journal, transaction->t_tid);
1479 spin_unlock(&journal->j_state_lock);
1480}
1481
1482/* Soft abort: record the abort error status in the journal superblock,
1483 * but don't do any other IO. */
1484static void __journal_abort_soft (journal_t *journal, int errno)
1485{
1486 if (journal->j_flags & JBD2_ABORT)
1487 return;
1488
1489 if (!journal->j_errno)
1490 journal->j_errno = errno;
1491
1492 __jbd2_journal_abort_hard(journal);
1493
1494 if (errno)
1495 jbd2_journal_update_superblock(journal, 1);
1496}
1497
1498/**
1499 * void jbd2_journal_abort () - Shutdown the journal immediately.
1500 * @journal: the journal to shutdown.
1501 * @errno: an error number to record in the journal indicating
1502 * the reason for the shutdown.
1503 *
1504 * Perform a complete, immediate shutdown of the ENTIRE
1505 * journal (not of a single transaction). This operation cannot be
1506 * undone without closing and reopening the journal.
1507 *
1508 * The jbd2_journal_abort function is intended to support higher level error
1509 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1510 * mode.
1511 *
1512 * Journal abort has very specific semantics. Any existing dirty,
1513 * unjournaled buffers in the main filesystem will still be written to
1514 * disk by bdflush, but the journaling mechanism will be suspended
1515 * immediately and no further transaction commits will be honoured.
1516 *
1517 * Any dirty, journaled buffers will be written back to disk without
1518 * hitting the journal. Atomicity cannot be guaranteed on an aborted
1519 * filesystem, but we _do_ attempt to leave as much data as possible
1520 * behind for fsck to use for cleanup.
1521 *
1522 * Any attempt to get a new transaction handle on a journal which is in
1523 * ABORT state will just result in an -EROFS error return. A
1524 * jbd2_journal_stop on an existing handle will return -EIO if we have
1525 * entered abort state during the update.
1526 *
1527 * Recursive transactions are not disturbed by journal abort until the
1528 * final jbd2_journal_stop, which will receive the -EIO error.
1529 *
1530 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
1531 * which will be recorded (if possible) in the journal superblock. This
1532 * allows a client to record failure conditions in the middle of a
1533 * transaction without having to complete the transaction to record the
1534 * failure to disk. ext3_error, for example, now uses this
1535 * functionality.
1536 *
1537 * Errors which originate from within the journaling layer will NOT
1538 * supply an errno; a null errno implies that absolutely no further
1539 * writes are done to the journal (unless there are any already in
1540 * progress).
1541 *
1542 */
1543
1544void jbd2_journal_abort(journal_t *journal, int errno)
1545{
1546 __journal_abort_soft(journal, errno);
1547}
1548
1549/**
1550 * int jbd2_journal_errno () - returns the journal's error state.
1551 * @journal: journal to examine.
1552 *
1553 * This is the errno number set with jbd2_journal_abort(), the last
1554 * time the journal was mounted - if the journal was stopped
1555 * without calling abort this will be 0.
1556 *
1557 * If the journal has been aborted during this mount, -EROFS will
1558 * be returned.
1559 */
1560int jbd2_journal_errno(journal_t *journal)
1561{
1562 int err;
1563
1564 spin_lock(&journal->j_state_lock);
1565 if (journal->j_flags & JBD2_ABORT)
1566 err = -EROFS;
1567 else
1568 err = journal->j_errno;
1569 spin_unlock(&journal->j_state_lock);
1570 return err;
1571}
1572
1573/**
1574 * int jbd2_journal_clear_err () - clears the journal's error state
1575 * @journal: journal to act on.
1576 *
1577 * An error must be cleared or acked to take a FS out of readonly
1578 * mode.
1579 */
1580int jbd2_journal_clear_err(journal_t *journal)
1581{
1582 int err = 0;
1583
1584 spin_lock(&journal->j_state_lock);
1585 if (journal->j_flags & JBD2_ABORT)
1586 err = -EROFS;
1587 else
1588 journal->j_errno = 0;
1589 spin_unlock(&journal->j_state_lock);
1590 return err;
1591}
1592
1593/**
1594 * void jbd2_journal_ack_err() - Ack journal err.
1595 * @journal: journal to act on.
1596 *
1597 * An error must be cleared or acked to take a FS out of readonly
1598 * mode.
1599 */
1600void jbd2_journal_ack_err(journal_t *journal)
1601{
1602 spin_lock(&journal->j_state_lock);
1603 if (journal->j_errno)
1604 journal->j_flags |= JBD2_ACK_ERR;
1605 spin_unlock(&journal->j_state_lock);
1606}
1607
1608int jbd2_journal_blocks_per_page(struct inode *inode)
1609{
1610 return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1611}
1612
1613/*
1614 * helper functions to deal with 32 or 64bit block numbers.
1615 */
1616size_t journal_tag_bytes(journal_t *journal)
1617{
1618 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
1619 return JBD_TAG_SIZE64;
1620 else
1621 return JBD_TAG_SIZE32;
1622}
1623
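/*
 * Worked example for the helper above (sizes assumed): with 4096-byte
 * journal blocks and the 64-bit feature set, one descriptor block
 * carries (4096 - sizeof(journal_header_t)) / journal_tag_bytes(journal)
 * tags -- about 340 for 12-byte tags and a 12-byte header.
 */
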
1624/*
1625 * Simple support for retrying memory allocations. Introduced to help to
1626 * debug different VM deadlock avoidance strategies.
1627 */
1628void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1629{
1630 return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
1631}
1632
1633/*
1634 * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
1635 * and allocate frozen and commit buffers from these slabs.
1636 *
1637 * The reason for doing this is to avoid SLAB_DEBUG, since it could
1638 * cause a bh to cross a page boundary.
1639 */
1640
1641#define JBD_MAX_SLABS 5
1642#define JBD_SLAB_INDEX(size) (size >> 11)
1643
1644static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
1645static const char *jbd_slab_names[JBD_MAX_SLABS] = {
1646 "jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
1647};
1648
1649static void jbd2_journal_destroy_jbd_slabs(void)
1650{
1651 int i;
1652
1653 for (i = 0; i < JBD_MAX_SLABS; i++) {
1654 if (jbd_slab[i])
1655 kmem_cache_destroy(jbd_slab[i]);
1656 jbd_slab[i] = NULL;
1657 }
1658}
1659
1660static int jbd2_journal_create_jbd_slab(size_t slab_size)
1661{
1662 int i = JBD_SLAB_INDEX(slab_size);
1663
1664 BUG_ON(i >= JBD_MAX_SLABS);
1665
1666 /*
1667 * Check if we already have a slab created for this size
1668 */
1669 if (jbd_slab[i])
1670 return 0;
1671
1672 /*
1673 * Create a slab and force alignment to be same as slabsize -
1674 * this will make sure that allocations won't cross the page
1675 * boundary.
1676 */
1677 jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
1678 slab_size, slab_size, 0, NULL, NULL);
1679 if (!jbd_slab[i]) {
1680 printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
1681 return -ENOMEM;
1682 }
1683 return 0;
1684}
1685
1686void * jbd2_slab_alloc(size_t size, gfp_t flags)
1687{
1688 int idx;
1689
1690 idx = JBD_SLAB_INDEX(size);
1691 BUG_ON(jbd_slab[idx] == NULL);
1692 return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
1693}
1694
1695void jbd2_slab_free(void *ptr, size_t size)
1696{
1697 int idx;
1698
1699 idx = JBD_SLAB_INDEX(size);
1700 BUG_ON(jbd_slab[idx] == NULL);
1701 kmem_cache_free(jbd_slab[idx], ptr);
1702}
1703
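/*
 * Worked example of the JBD_SLAB_INDEX() mapping above: size >> 11
 * gives 0 for 1K, 1 for 2K, 2 for 4K and 4 for 8K blocks, which is
 * why jbd_slab_names[3] is NULL -- no blocksize maps to that slot.
 */
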
1704/*
1705 * Journal_head storage management
1706 */
1707static kmem_cache_t *jbd2_journal_head_cache;
1708#ifdef CONFIG_JBD_DEBUG
1709static atomic_t nr_journal_heads = ATOMIC_INIT(0);
1710#endif
1711
1712static int journal_init_jbd2_journal_head_cache(void)
1713{
1714 int retval;
1715
1716 J_ASSERT(jbd2_journal_head_cache == 0);
1717 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
1718 sizeof(struct journal_head),
1719 0, /* offset */
1720 0, /* flags */
1721 NULL, /* ctor */
1722 NULL); /* dtor */
1723 retval = 0;
1724 if (jbd2_journal_head_cache == 0) {
1725 retval = -ENOMEM;
1726 printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
1727 }
1728 return retval;
1729}
1730
1731static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
1732{
1733 J_ASSERT(jbd2_journal_head_cache != NULL);
1734 kmem_cache_destroy(jbd2_journal_head_cache);
1735 jbd2_journal_head_cache = NULL;
1736}
1737
1738/*
1739 * journal_head splicing and dicing
1740 */
1741static struct journal_head *journal_alloc_journal_head(void)
1742{
1743 struct journal_head *ret;
1744 static unsigned long last_warning;
1745
1746#ifdef CONFIG_JBD_DEBUG
1747 atomic_inc(&nr_journal_heads);
1748#endif
1749 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1750 if (ret == 0) {
1751 jbd_debug(1, "out of memory for journal_head\n");
1752 if (time_after(jiffies, last_warning + 5*HZ)) {
1753 printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
1754 __FUNCTION__);
1755 last_warning = jiffies;
1756 }
1757 while (ret == 0) {
1758 yield();
1759 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
1760 }
1761 }
1762 return ret;
1763}
1764
1765static void journal_free_journal_head(struct journal_head *jh)
1766{
1767#ifdef CONFIG_JBD_DEBUG
1768 atomic_dec(&nr_journal_heads);
1769 memset(jh, JBD_POISON_FREE, sizeof(*jh));
1770#endif
1771 kmem_cache_free(jbd2_journal_head_cache, jh);
1772}
1773
1774/*
1775 * A journal_head is attached to a buffer_head whenever JBD has an
1776 * interest in the buffer.
1777 *
1778 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
1779 * is set. This bit is tested in core kernel code where we need to take
1780 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
1781 * there.
1782 *
1783 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
1784 *
1785 * When a buffer has its BH_JBD bit set it is immune from being released by
1786 * core kernel code, mainly via ->b_count.
1787 *
1788 * A journal_head may be detached from its buffer_head when the journal_head's
1789 * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
1790 * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
1791 * journal_head can be dropped if needed.
1792 *
1793 * Various places in the kernel want to attach a journal_head to a buffer_head
1794 * _before_ attaching the journal_head to a transaction. To protect the
1795 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
1796 * journal_head's b_jcount refcount by one. The caller must call
1797 * jbd2_journal_put_journal_head() to undo this.
1798 *
1799 * So the typical usage would be:
1800 *
1801 * (Attach a journal_head if needed. Increments b_jcount)
1802 * struct journal_head *jh = jbd2_journal_add_journal_head(bh);
1803 * ...
1804 * jh->b_transaction = xxx;
1805 * jbd2_journal_put_journal_head(jh);
1806 *
1807 * Now, the journal_head's b_jcount is zero, but it is safe from being released
1808 * because it has a non-zero b_transaction.
1809 */
1810
1811/*
1812 * Give a buffer_head a journal_head.
1813 *
1814 * Doesn't need the journal lock.
1815 * May sleep.
1816 */
1817struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
1818{
1819 struct journal_head *jh;
1820 struct journal_head *new_jh = NULL;
1821
1822repeat:
1823 if (!buffer_jbd(bh)) {
1824 new_jh = journal_alloc_journal_head();
1825 memset(new_jh, 0, sizeof(*new_jh));
1826 }
1827
1828 jbd_lock_bh_journal_head(bh);
1829 if (buffer_jbd(bh)) {
1830 jh = bh2jh(bh);
1831 } else {
1832 J_ASSERT_BH(bh,
1833 (atomic_read(&bh->b_count) > 0) ||
1834 (bh->b_page && bh->b_page->mapping));
1835
1836 if (!new_jh) {
1837 jbd_unlock_bh_journal_head(bh);
1838 goto repeat;
1839 }
1840
1841 jh = new_jh;
1842 new_jh = NULL; /* We consumed it */
1843 set_buffer_jbd(bh);
1844 bh->b_private = jh;
1845 jh->b_bh = bh;
1846 get_bh(bh);
1847 BUFFER_TRACE(bh, "added journal_head");
1848 }
1849 jh->b_jcount++;
1850 jbd_unlock_bh_journal_head(bh);
1851 if (new_jh)
1852 journal_free_journal_head(new_jh);
1853 return bh->b_private;
1854}
1855
1856/*
1857 * Grab a ref against this buffer_head's journal_head. If it ended up not
1858 * having a journal_head, return NULL
1859 */
1860struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
1861{
1862 struct journal_head *jh = NULL;
1863
1864 jbd_lock_bh_journal_head(bh);
1865 if (buffer_jbd(bh)) {
1866 jh = bh2jh(bh);
1867 jh->b_jcount++;
1868 }
1869 jbd_unlock_bh_journal_head(bh);
1870 return jh;
1871}
1872
1873static void __journal_remove_journal_head(struct buffer_head *bh)
1874{
1875 struct journal_head *jh = bh2jh(bh);
1876
1877 J_ASSERT_JH(jh, jh->b_jcount >= 0);
1878
1879 get_bh(bh);
1880 if (jh->b_jcount == 0) {
1881 if (jh->b_transaction == NULL &&
1882 jh->b_next_transaction == NULL &&
1883 jh->b_cp_transaction == NULL) {
1884 J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
1885 J_ASSERT_BH(bh, buffer_jbd(bh));
1886 J_ASSERT_BH(bh, jh2bh(jh) == bh);
1887 BUFFER_TRACE(bh, "remove journal_head");
1888 if (jh->b_frozen_data) {
1889 printk(KERN_WARNING "%s: freeing "
1890 "b_frozen_data\n",
1891 __FUNCTION__);
1892 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
1893 }
1894 if (jh->b_committed_data) {
1895 printk(KERN_WARNING "%s: freeing "
1896 "b_committed_data\n",
1897 __FUNCTION__);
1898 jbd2_slab_free(jh->b_committed_data, bh->b_size);
1899 }
1900 bh->b_private = NULL;
1901 jh->b_bh = NULL; /* debug, really */
1902 clear_buffer_jbd(bh);
1903 __brelse(bh);
1904 journal_free_journal_head(jh);
1905 } else {
1906 BUFFER_TRACE(bh, "journal_head was locked");
1907 }
1908 }
1909}
1910
1911/*
1912 * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
1913 * and has a zero b_jcount then remove and release its journal_head. If we did
1914 * see that the buffer is not used by any transaction we also "logically"
1915 * decrement ->b_count.
1916 *
1917 * We in fact take an additional increment on ->b_count as a convenience,
1918 * because the caller usually wants to do additional things with the bh
1919 * after calling here.
1920 * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
1921 * time. Once the caller has run __brelse(), the buffer is eligible for
1922 * reaping by try_to_free_buffers().
1923 */
1924void jbd2_journal_remove_journal_head(struct buffer_head *bh)
1925{
1926 jbd_lock_bh_journal_head(bh);
1927 __journal_remove_journal_head(bh);
1928 jbd_unlock_bh_journal_head(bh);
1929}
1930
1931/*
1932 * Drop a reference on the passed journal_head. If it fell to zero then try to
1933 * release the journal_head from the buffer_head.
1934 */
1935void jbd2_journal_put_journal_head(struct journal_head *jh)
1936{
1937 struct buffer_head *bh = jh2bh(jh);
1938
1939 jbd_lock_bh_journal_head(bh);
1940 J_ASSERT_JH(jh, jh->b_jcount > 0);
1941 --jh->b_jcount;
1942 if (!jh->b_jcount && !jh->b_transaction) {
1943 __journal_remove_journal_head(bh);
1944 __brelse(bh);
1945 }
1946 jbd_unlock_bh_journal_head(bh);
1947}
1948
1949/*
1950 * /proc tunables
1951 */
1952#if defined(CONFIG_JBD_DEBUG)
1953int jbd2_journal_enable_debug;
1954EXPORT_SYMBOL(jbd2_journal_enable_debug);
1955#endif
1956
1957#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
1958
1959static struct proc_dir_entry *proc_jbd_debug;
1960
1961static int read_jbd_debug(char *page, char **start, off_t off,
1962 int count, int *eof, void *data)
1963{
1964 int ret;
1965
1966 ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
1967 *eof = 1;
1968 return ret;
1969}
1970
1971static int write_jbd_debug(struct file *file, const char __user *buffer,
1972 unsigned long count, void *data)
1973{
1974 char buf[32];
1975
1976 if (count > ARRAY_SIZE(buf) - 1)
1977 count = ARRAY_SIZE(buf) - 1;
1978 if (copy_from_user(buf, buffer, count))
1979 return -EFAULT;
1980 buf[ARRAY_SIZE(buf) - 1] = '\0';
1981 jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
1982 return count;
1983}
1984
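/*
 * Usage sketch: with CONFIG_JBD_DEBUG and procfs enabled, the debug
 * level can be driven from userspace, e.g.
 *
 *	echo 5 > /proc/sys/fs/jbd2-debug
 *	cat /proc/sys/fs/jbd2-debug
 *
 * (the path follows JBD_PROC_NAME below; the level value 5 is just an
 * illustrative assumption).
 */
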
1985#define JBD_PROC_NAME "sys/fs/jbd2-debug"
1986
1987static void __init create_jbd_proc_entry(void)
1988{
1989 proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
1990 if (proc_jbd_debug) {
1991 /* Why is this so hard? */
1992 proc_jbd_debug->read_proc = read_jbd_debug;
1993 proc_jbd_debug->write_proc = write_jbd_debug;
1994 }
1995}
1996
1997static void __exit jbd2_remove_jbd_proc_entry(void)
1998{
1999 if (proc_jbd_debug)
2000 remove_proc_entry(JBD_PROC_NAME, NULL);
2001}
2002
2003#else
2004
2005#define create_jbd_proc_entry() do {} while (0)
2006#define jbd2_remove_jbd_proc_entry() do {} while (0)
2007
2008#endif
2009
2010kmem_cache_t *jbd2_handle_cache;
2011
2012static int __init journal_init_handle_cache(void)
2013{
2014 jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
2015 sizeof(handle_t),
2016 0, /* offset */
2017 0, /* flags */
2018 NULL, /* ctor */
2019 NULL); /* dtor */
2020 if (jbd2_handle_cache == NULL) {
2021 printk(KERN_EMERG "JBD: failed to create handle cache\n");
2022 return -ENOMEM;
2023 }
2024 return 0;
2025}
2026
2027static void jbd2_journal_destroy_handle_cache(void)
2028{
2029 if (jbd2_handle_cache)
2030 kmem_cache_destroy(jbd2_handle_cache);
2031}
2032
2033/*
2034 * Module startup and shutdown
2035 */
2036
2037static int __init journal_init_caches(void)
2038{
2039 int ret;
2040
2041 ret = jbd2_journal_init_revoke_caches();
2042 if (ret == 0)
2043 ret = journal_init_jbd2_journal_head_cache();
2044 if (ret == 0)
2045 ret = journal_init_handle_cache();
2046 return ret;
2047}
2048
2049static void jbd2_journal_destroy_caches(void)
2050{
2051 jbd2_journal_destroy_revoke_caches();
2052 jbd2_journal_destroy_jbd2_journal_head_cache();
2053 jbd2_journal_destroy_handle_cache();
2054 jbd2_journal_destroy_jbd_slabs();
2055}
2056
2057static int __init journal_init(void)
2058{
2059 int ret;
2060
2061 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2062
2063 ret = journal_init_caches();
2064 if (ret != 0)
2065 jbd2_journal_destroy_caches();
2066 create_jbd_proc_entry();
2067 return ret;
2068}
2069
2070static void __exit journal_exit(void)
2071{
2072#ifdef CONFIG_JBD_DEBUG
2073 int n = atomic_read(&nr_journal_heads);
2074 if (n)
2075 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
2076#endif
2077 jbd2_remove_jbd_proc_entry();
2078 jbd2_journal_destroy_caches();
2079}
2080
2081MODULE_LICENSE("GPL");
2082module_init(journal_init);
2083module_exit(journal_exit);
2084
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
1/*
2 * linux/fs/jbd2/recovery.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#ifndef __KERNEL__
17#include "jfs_user.h"
18#else
19#include <linux/time.h>
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/errno.h>
23#include <linux/slab.h>
24#endif
25
26/*
27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them.
29 */
30struct recovery_info
31{
32 tid_t start_transaction;
33 tid_t end_transaction;
34
35 int nr_replays;
36 int nr_revokes;
37 int nr_revoke_hits;
38};
39
40enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
41static int do_one_pass(journal_t *journal,
42 struct recovery_info *info, enum passtype pass);
43static int scan_revoke_records(journal_t *, struct buffer_head *,
44 tid_t, struct recovery_info *);
45
46#ifdef __KERNEL__
47
48/* Release readahead buffers after use */
49static void journal_brelse_array(struct buffer_head *b[], int n)
50{
51 while (--n >= 0)
52 brelse (b[n]);
53}
54
55
56/*
57 * When reading from the journal, we are going through the block device
58 * layer directly and so there is no readahead being done for us. We
59 * need to implement any readahead ourselves if we want it to happen at
60 * all. Recovery is basically one long sequential read, so make sure we
61 * do the IO in reasonably large chunks.
62 *
63 * This is not so critical that we need to be enormously clever about
64 * the readahead size, though. 128K is a purely arbitrary, good-enough
65 * fixed value.
66 */
67
68#define MAXBUF 8
69static int do_readahead(journal_t *journal, unsigned int start)
70{
71 int err;
72 unsigned int max, nbufs, next;
73 unsigned long long blocknr;
74 struct buffer_head *bh;
75
76 struct buffer_head * bufs[MAXBUF];
77
78 /* Do up to 128K of readahead */
79 max = start + (128 * 1024 / journal->j_blocksize);
80 if (max > journal->j_maxlen)
81 max = journal->j_maxlen;
82
83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
84 * a time to the block device IO layer. */
85
86 nbufs = 0;
87
88 for (next = start; next < max; next++) {
89 err = jbd2_journal_bmap(journal, next, &blocknr);
90
91 if (err) {
92 printk (KERN_ERR "JBD: bad block at offset %u\n",
93 next);
94 goto failed;
95 }
96
97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
98 if (!bh) {
99 err = -ENOMEM;
100 goto failed;
101 }
102
103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
104 bufs[nbufs++] = bh;
105 if (nbufs == MAXBUF) {
106 ll_rw_block(READ, nbufs, bufs);
107 journal_brelse_array(bufs, nbufs);
108 nbufs = 0;
109 }
110 } else
111 brelse(bh);
112 }
113
114 if (nbufs)
115 ll_rw_block(READ, nbufs, bufs);
116 err = 0;
117
118failed:
119 if (nbufs)
120 journal_brelse_array(bufs, nbufs);
121 return err;
122}
123
124#endif /* __KERNEL__ */
125
126
127/*
128 * Read a block from the journal
129 */
130
131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset)
133{
134 int err;
135 unsigned long long blocknr;
136 struct buffer_head *bh;
137
138 *bhp = NULL;
139
140 if (offset >= journal->j_maxlen) {
141 printk(KERN_ERR "JBD: corrupted journal superblock\n");
142 return -EIO;
143 }
144
145 err = jbd2_journal_bmap(journal, offset, &blocknr);
146
147 if (err) {
148 printk (KERN_ERR "JBD: bad block at offset %u\n",
149 offset);
150 return err;
151 }
152
153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
154 if (!bh)
155 return -ENOMEM;
156
157 if (!buffer_uptodate(bh)) {
158 /* If this is a brand new buffer, start readahead.
159 Otherwise, we assume we are already reading it. */
160 if (!buffer_req(bh))
161 do_readahead(journal, offset);
162 wait_on_buffer(bh);
163 }
164
165 if (!buffer_uptodate(bh)) {
166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
167 offset);
168 brelse(bh);
169 return -EIO;
170 }
171
172 *bhp = bh;
173 return 0;
174}
175
176
177/*
178 * Count the number of in-use tags in a journal descriptor block.
179 */
180
181static int count_tags(journal_t *journal, struct buffer_head *bh)
182{
183 char * tagp;
184 journal_block_tag_t * tag;
185 int nr = 0, size = journal->j_blocksize;
186 int tag_bytes = journal_tag_bytes(journal);
187
188 tagp = &bh->b_data[sizeof(journal_header_t)];
189
190 while ((tagp - bh->b_data + tag_bytes) <= size) {
191 tag = (journal_block_tag_t *) tagp;
192
193 nr++;
194 tagp += tag_bytes;
195 if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
196 tagp += 16;
197
198 if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
199 break;
200 }
201
202 return nr;
203}
204
205
206/* Make sure we wrap around the log correctly! */
207#define wrap(journal, var) \
208do { \
209 if (var >= (journal)->j_last) \
210 var -= ((journal)->j_last - (journal)->j_first); \
211} while (0)
212
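/*
 * Worked example of wrap() above (geometry assumed): with j_first == 1
 * and j_last == 8192, a variable that reaches 8192 is pulled back by
 * 8191 to block 1, so the traversal never touches the superblock at
 * block 0 and never runs off the end of the log.
 */
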
213/**
214 * jbd2_journal_recover - recovers an on-disk journal
215 * @journal: the journal to recover
216 *
217 * The primary function for recovering the log contents when mounting a
218 * journaled device.
219 *
220 * Recovery is done in three passes. In the first pass, we look for the
221 * end of the log. In the second, we assemble the list of revoke
222 * blocks. In the third and final pass, we replay any un-revoked blocks
223 * in the log.
224 */
225int jbd2_journal_recover(journal_t *journal)
226{
227 int err;
228 journal_superblock_t * sb;
229
230 struct recovery_info info;
231
232 memset(&info, 0, sizeof(info));
233 sb = journal->j_superblock;
234
235 /*
236 * The journal superblock's s_start field (the current log head)
237 * is always zero if, and only if, the journal was cleanly
238 * unmounted.
239 */
240
241 if (!sb->s_start) {
242 jbd_debug(1, "No recovery required, last transaction %d\n",
243 be32_to_cpu(sb->s_sequence));
244 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
245 return 0;
246 }
247
248 err = do_one_pass(journal, &info, PASS_SCAN);
249 if (!err)
250 err = do_one_pass(journal, &info, PASS_REVOKE);
251 if (!err)
252 err = do_one_pass(journal, &info, PASS_REPLAY);
253
254 jbd_debug(0, "JBD: recovery, exit status %d, "
255 "recovered transactions %u to %u\n",
256 err, info.start_transaction, info.end_transaction);
257 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259
260 /* Restart the log at the next transaction ID, thus invalidating
261 * any existing commit records in the log. */
262 journal->j_transaction_sequence = ++info.end_transaction;
263
264 jbd2_journal_clear_revoke(journal);
265 sync_blockdev(journal->j_fs_dev);
266 return err;
267}
268
269/**
270 * jbd2_journal_skip_recovery - Start journal and wipe existing records
271 * @journal: journal to startup
272 *
273 * Locate any valid recovery information from the journal and set up the
274 * journal structures in memory to ignore it (presumably because the
275 * caller has evidence that it is out of date).
276 * This function doesn't appear to be exported.
277 *
278 * We perform one pass over the journal to allow us to tell the user how
279 * much recovery information is being erased, and to let us initialise
280 * the journal transaction sequence numbers to the next unused ID.
281 */
282int jbd2_journal_skip_recovery(journal_t *journal)
283{
284 int err;
285 journal_superblock_t * sb;
286
287 struct recovery_info info;
288
289 memset (&info, 0, sizeof(info));
290 sb = journal->j_superblock;
291
292 err = do_one_pass(journal, &info, PASS_SCAN);
293
294 if (err) {
295 printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296 ++journal->j_transaction_sequence;
297 } else {
298#ifdef CONFIG_JBD_DEBUG
299 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300#endif
301 jbd_debug(0,
302 "JBD: ignoring %d transaction%s from the journal.\n",
303 dropped, (dropped == 1) ? "" : "s");
304 journal->j_transaction_sequence = ++info.end_transaction;
305 }
306
307 journal->j_tail = 0;
308 return err;
309}
310
311static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
312{
313 unsigned long long block = be32_to_cpu(tag->t_blocknr);
314 if (tag_bytes > JBD_TAG_SIZE32)
315 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
316 return block;
317}
318
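/*
 * Worked example for read_tag_block() above (values assumed): with
 * t_blocknr == cpu_to_be32(5) and t_blocknr_high == cpu_to_be32(1),
 * a 64-bit journal (tag_bytes > JBD_TAG_SIZE32) yields block
 * 0x100000005, while a 32-bit journal yields plain block 5.
 */
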
319static int do_one_pass(journal_t *journal,
320 struct recovery_info *info, enum passtype pass)
321{
322 unsigned int first_commit_ID, next_commit_ID;
323 unsigned long next_log_block;
324 int err, success = 0;
325 journal_superblock_t * sb;
326 journal_header_t * tmp;
327 struct buffer_head * bh;
328 unsigned int sequence;
329 int blocktype;
330 int tag_bytes = journal_tag_bytes(journal);
331
332 /* Precompute the maximum metadata descriptors in a descriptor block */
333 int MAX_BLOCKS_PER_DESC;
334 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
335 / tag_bytes);
336
337 /*
338 * First thing is to establish what we expect to find in the log
339 * (in terms of transaction IDs), and where (in terms of log
340 * block offsets): query the superblock.
341 */
342
343 sb = journal->j_superblock;
344 next_commit_ID = be32_to_cpu(sb->s_sequence);
345 next_log_block = be32_to_cpu(sb->s_start);
346
347 first_commit_ID = next_commit_ID;
348 if (pass == PASS_SCAN)
349 info->start_transaction = first_commit_ID;
350
351 jbd_debug(1, "Starting recovery pass %d\n", pass);
352
353 /*
354 * Now we walk through the log, transaction by transaction,
355 * making sure that each transaction has a commit block in the
356 * expected place. Each complete transaction gets replayed back
357 * into the main filesystem.
358 */
359
360 while (1) {
361 int flags;
362 char * tagp;
363 journal_block_tag_t * tag;
364 struct buffer_head * obh;
365 struct buffer_head * nbh;
366
367 cond_resched(); /* We're under lock_kernel() */
368
369 /* If we already know where to stop the log traversal,
370 * check right now that we haven't gone past the end of
371 * the log. */
372
373 if (pass != PASS_SCAN)
374 if (tid_geq(next_commit_ID, info->end_transaction))
375 break;
376
377 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
378 next_commit_ID, next_log_block, journal->j_last);
379
380 /* Skip over each chunk of the transaction looking for
381 * either the next descriptor block or the final commit
382 * record. */
383
384 jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
385 err = jread(&bh, journal, next_log_block);
386 if (err)
387 goto failed;
388
389 next_log_block++;
390 wrap(journal, next_log_block);
391
392 /* What kind of buffer is it?
393 *
394 * If it is a descriptor block, check that it has the
395 * expected sequence number. Otherwise, we're all done
396 * here. */
397
398 tmp = (journal_header_t *)bh->b_data;
399
400 if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
401 brelse(bh);
402 break;
403 }
404
405 blocktype = be32_to_cpu(tmp->h_blocktype);
406 sequence = be32_to_cpu(tmp->h_sequence);
407 jbd_debug(3, "Found magic %d, sequence %d\n",
408 blocktype, sequence);
409
410 if (sequence != next_commit_ID) {
411 brelse(bh);
412 break;
413 }
414
415 /* OK, we have a valid descriptor block which matches
416 * all of the sequence number checks. What are we going
417 * to do with it? That depends on the pass... */
418
419 switch(blocktype) {
420 case JBD2_DESCRIPTOR_BLOCK:
421 /* If it is a valid descriptor block, replay it
422 * in pass REPLAY; otherwise, just skip over the
423 * blocks it describes. */
424 if (pass != PASS_REPLAY) {
425 next_log_block += count_tags(journal, bh);
426 wrap(journal, next_log_block);
427 brelse(bh);
428 continue;
429 }
430
431 /* A descriptor block: we can now write all of
432 * the data blocks. Yay, useful work is finally
433 * getting done here! */
434
435 tagp = &bh->b_data[sizeof(journal_header_t)];
436 while ((tagp - bh->b_data + tag_bytes)
437 <= journal->j_blocksize) {
438 unsigned long io_block;
439
440 tag = (journal_block_tag_t *) tagp;
441 flags = be32_to_cpu(tag->t_flags);
442
443 io_block = next_log_block++;
444 wrap(journal, next_log_block);
445 err = jread(&obh, journal, io_block);
446 if (err) {
447 /* Recover what we can, but
448 * report failure at the end. */
449 success = err;
450 printk (KERN_ERR
451 "JBD: IO error %d recovering "
452 "block %ld in log\n",
453 err, io_block);
454 } else {
455 unsigned long long blocknr;
456
457 J_ASSERT(obh != NULL);
458 blocknr = read_tag_block(tag_bytes,
459 tag);
460
461 /* If the block has been
462 * revoked, then we're all done
463 * here. */
464 if (jbd2_journal_test_revoke
465 (journal, blocknr,
466 next_commit_ID)) {
467 brelse(obh);
468 ++info->nr_revoke_hits;
469 goto skip_write;
470 }
471
472 /* Find a buffer for the new
473 * data being restored */
474 nbh = __getblk(journal->j_fs_dev,
475 blocknr,
476 journal->j_blocksize);
477 if (nbh == NULL) {
478 printk(KERN_ERR
479 "JBD: Out of memory "
480 "during recovery.\n");
481 err = -ENOMEM;
482 brelse(bh);
483 brelse(obh);
484 goto failed;
485 }
486
487 lock_buffer(nbh);
488 memcpy(nbh->b_data, obh->b_data,
489 journal->j_blocksize);
490 if (flags & JBD2_FLAG_ESCAPE) {
491 *((__be32 *)nbh->b_data) =
492 cpu_to_be32(JBD2_MAGIC_NUMBER);
493 }
494
495 BUFFER_TRACE(nbh, "marking dirty");
496 set_buffer_uptodate(nbh);
497 mark_buffer_dirty(nbh);
498 BUFFER_TRACE(nbh, "marking uptodate");
499 ++info->nr_replays;
500 /* ll_rw_block(WRITE, 1, &nbh); */
501 unlock_buffer(nbh);
502 brelse(obh);
503 brelse(nbh);
504 }
505
506 skip_write:
507 tagp += tag_bytes;
508 if (!(flags & JBD2_FLAG_SAME_UUID))
509 tagp += 16;
510
511 if (flags & JBD2_FLAG_LAST_TAG)
512 break;
513 }
514
515 brelse(bh);
516 continue;
517
518 case JBD2_COMMIT_BLOCK:
519 /* Found an expected commit block: not much to
520 * do other than move on to the next sequence
521 * number. */
522 brelse(bh);
523 next_commit_ID++;
524 continue;
525
526 case JBD2_REVOKE_BLOCK:
527 /* If we aren't in the REVOKE pass, then we can
528 * just skip over this block. */
529 if (pass != PASS_REVOKE) {
530 brelse(bh);
531 continue;
532 }
533
534 err = scan_revoke_records(journal, bh,
535 next_commit_ID, info);
536 brelse(bh);
537 if (err)
538 goto failed;
539 continue;
540
541 default:
542 jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
543 blocktype);
544 brelse(bh);
545 goto done;
546 }
547 }
548
549 done:
550 /*
551 * We broke out of the log scan loop: either we came to the
552 * known end of the log or we found an unexpected block in the
553 * log. If the latter happened, then we know that the "current"
554 * transaction marks the end of the valid log.
555 */
556
557 if (pass == PASS_SCAN)
558 info->end_transaction = next_commit_ID;
559 else {
560 /* It's really bad news if different passes end up at
561 * different places (but possible due to IO errors). */
562 if (info->end_transaction != next_commit_ID) {
563 printk (KERN_ERR "JBD: recovery pass %d ended at "
564 "transaction %u, expected %u\n",
565 pass, next_commit_ID, info->end_transaction);
566 if (!success)
567 success = -EIO;
568 }
569 }
570
571 return success;
572
573 failed:
574 return err;
575}
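
/*
 * Editorial note (not part of the original source): the scan loop ending
 * above is run once per recovery pass, and the pass checks in its switch
 * statement give recovery its shape:
 *
 *	PASS_SCAN:   locate the end of the valid log (the last commit);
 *	PASS_REVOKE: build the revoke table from JBD2_REVOKE_BLOCKs;
 *	PASS_REPLAY: copy descriptor-listed blocks back into place,
 *	             unless jbd2_journal_test_revoke() reports a later
 *	             revoke that supersedes them.
 */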
576
577
578/* Scan a revoke record, marking all blocks mentioned as revoked. */
579
580static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 tid_t sequence, struct recovery_info *info)
582{
583 jbd2_journal_revoke_header_t *header;
584 int offset, max;
585 int record_len = 4;
586
587 header = (jbd2_journal_revoke_header_t *) bh->b_data;
588 offset = sizeof(jbd2_journal_revoke_header_t);
589 max = be32_to_cpu(header->r_count);
590
591 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
592 record_len = 8;
593
594 while (offset + record_len <= max) {
595 unsigned long long blocknr;
596 int err;
597
598 if (record_len == 4)
599 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
600 else
601 blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
602 offset += record_len;
603 err = jbd2_journal_set_revoke(journal, blocknr, sequence);
604 if (err)
605 return err;
606 ++info->nr_revokes;
607 }
608 return 0;
609}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
1/*
2 * linux/fs/jbd2/revoke.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal revoke routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 *
15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places:
18 *
19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal
21 *
22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log
25 * entry for a block beyond the last revoke, then that log entry still
26 * gets replayed.
27 *
28 * We can get interactions between revokes and new log data within a
29 * single transaction:
30 *
31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits.
34 *
35 * Block is journaled and then revoked:
36 * The revoke must take precedence over the write of the block, so we
37 * need either to cancel the journal entry or to write the revoke
38 * later in the log than the log block. In this case, we choose the
39 * latter: journaling a block cancels any revoke record for that block
40 * in the current transaction, so any revoke for that block in the
41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence.
43 *
44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke
48 * bit here.
49 *
50 * Revoke information on buffers is a tri-state value:
51 *
52 * RevokeValid clear: no cached revoke status, need to look it up
53 * RevokeValid set, Revoked clear:
54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing.
56 * RevokeValid set, Revoked set:
57 * buffer has been revoked.
58 */
59
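/*
 * Editorial sketch (not in the original file): the tri-state above
 * collapses to a lookup of roughly this shape, assuming the
 * buffer_revokevalid()/buffer_revoked() test macros from jbd2.h and the
 * find_revoke_record() helper defined later in this file:
 *
 *	int revoked;
 *	if (buffer_revokevalid(bh))
 *		revoked = buffer_revoked(bh);	-- cached answer
 *	else
 *		revoked = find_revoke_record(journal,
 *					     bh->b_blocknr) != NULL;
 */
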
60#ifndef __KERNEL__
61#include "jfs_user.h"
62#else
63#include <linux/time.h>
64#include <linux/fs.h>
65#include <linux/jbd2.h>
66#include <linux/errno.h>
67#include <linux/slab.h>
68#include <linux/list.h>
69#include <linux/smp_lock.h>
70#include <linux/init.h>
71#endif
72
73static kmem_cache_t *jbd2_revoke_record_cache;
74static kmem_cache_t *jbd2_revoke_table_cache;
75
76/* Each revoke record represents one single revoked block. During
77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */
79
80struct jbd2_revoke_record_s
81{
82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */
84 unsigned long long blocknr;
85};
86
87
88/* The revoke table is just a simple hash table of revoke records. */
89struct jbd2_revoke_table_s
90{
91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */
93 int hash_size;
94 int hash_shift;
95 struct list_head *hash_table;
96};
97
98
99#ifdef __KERNEL__
100static void write_one_revoke_record(journal_t *, transaction_t *,
101 struct journal_head **, int *,
102 struct jbd2_revoke_record_s *);
103static void flush_descriptor(journal_t *, struct journal_head *, int);
104#endif
105
106/* Utility functions to maintain the revoke table */
107
108/* Borrowed from buffer.c: this is a tried and tested block hash function */
109static inline int hash(journal_t *journal, unsigned long long block)
110{
111 struct jbd2_revoke_table_s *table = journal->j_revoke;
112 int hash_shift = table->hash_shift;
113 int hash = (int)block ^ (int)((block >> 31) >> 1);
114
115 return ((hash << (hash_shift - 6)) ^
116 (hash >> 13) ^
117 (hash << (hash_shift - 12))) & (table->hash_size - 1);
118}
119
120static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
121 tid_t seq)
122{
123 struct list_head *hash_list;
124 struct jbd2_revoke_record_s *record;
125
126repeat:
127 record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
128 if (!record)
129 goto oom;
130
131 record->sequence = seq;
132 record->blocknr = blocknr;
133 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
134 spin_lock(&journal->j_revoke_lock);
135 list_add(&record->hash, hash_list);
136 spin_unlock(&journal->j_revoke_lock);
137 return 0;
138
139oom:
140 if (!journal_oom_retry)
141 return -ENOMEM;
142 jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
143 yield();
144 goto repeat;
145}
146
147/* Find a revoke record in the journal's hash table. */
148
149static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
150 unsigned long long blocknr)
151{
152 struct list_head *hash_list;
153 struct jbd2_revoke_record_s *record;
154
155 hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
156
157 spin_lock(&journal->j_revoke_lock);
158 record = (struct jbd2_revoke_record_s *) hash_list->next;
159 while (&(record->hash) != hash_list) {
160 if (record->blocknr == blocknr) {
161 spin_unlock(&journal->j_revoke_lock);
162 return record;
163 }
164 record = (struct jbd2_revoke_record_s *) record->hash.next;
165 }
166 spin_unlock(&journal->j_revoke_lock);
167 return NULL;
168}
169
170int __init jbd2_journal_init_revoke_caches(void)
171{
172 jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
173 sizeof(struct jbd2_revoke_record_s),
174 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
175	if (!jbd2_revoke_record_cache)
176 return -ENOMEM;
177
178 jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
179 sizeof(struct jbd2_revoke_table_s),
180 0, 0, NULL, NULL);
181	if (!jbd2_revoke_table_cache) {
182 kmem_cache_destroy(jbd2_revoke_record_cache);
183 jbd2_revoke_record_cache = NULL;
184 return -ENOMEM;
185 }
186 return 0;
187}
188
189void jbd2_journal_destroy_revoke_caches(void)
190{
191 kmem_cache_destroy(jbd2_revoke_record_cache);
192 jbd2_revoke_record_cache = NULL;
193 kmem_cache_destroy(jbd2_revoke_table_cache);
194 jbd2_revoke_table_cache = NULL;
195}
196
197/* Initialise the revoke table for a given journal to a given size. */
198
199int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
200{
201 int shift, tmp;
202
203 J_ASSERT (journal->j_revoke_table[0] == NULL);
204
205 shift = 0;
206 tmp = hash_size;
207 while((tmp >>= 1UL) != 0UL)
208 shift++;
209
210 journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
211 if (!journal->j_revoke_table[0])
212 return -ENOMEM;
213 journal->j_revoke = journal->j_revoke_table[0];
214
215 /* Check that the hash_size is a power of two */
216 J_ASSERT ((hash_size & (hash_size-1)) == 0);
217
218 journal->j_revoke->hash_size = hash_size;
219
220 journal->j_revoke->hash_shift = shift;
221
222 journal->j_revoke->hash_table =
223 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
224 if (!journal->j_revoke->hash_table) {
225 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
226 journal->j_revoke = NULL;
227 return -ENOMEM;
228 }
229
230 for (tmp = 0; tmp < hash_size; tmp++)
231 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
232
233 journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
234 if (!journal->j_revoke_table[1]) {
235 kfree(journal->j_revoke_table[0]->hash_table);
236 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
237 return -ENOMEM;
238 }
239
240 journal->j_revoke = journal->j_revoke_table[1];
241
242 /* Check that the hash_size is a power of two */
243 J_ASSERT ((hash_size & (hash_size-1)) == 0);
244
245 journal->j_revoke->hash_size = hash_size;
246
247 journal->j_revoke->hash_shift = shift;
248
249 journal->j_revoke->hash_table =
250 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
251 if (!journal->j_revoke->hash_table) {
252 kfree(journal->j_revoke_table[0]->hash_table);
253 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
254 kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
255 journal->j_revoke = NULL;
256 return -ENOMEM;
257 }
258
259 for (tmp = 0; tmp < hash_size; tmp++)
260 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
261
262 spin_lock_init(&journal->j_revoke_lock);
263
264 return 0;
265}
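
/*
 * Editorial worked example (assumed default): journal setup customarily
 * passes a hash_size of 256, for which the shift loop above computes
 * shift = 8, and hash() then masks its result with
 * hash_size - 1 == 0xff so every bucket index stays inside the table.
 */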
266
267/* Destroy a journal's revoke table. The table must already be empty! */
268
269void jbd2_journal_destroy_revoke(journal_t *journal)
270{
271 struct jbd2_revoke_table_s *table;
272 struct list_head *hash_list;
273 int i;
274
275 table = journal->j_revoke_table[0];
276 if (!table)
277 return;
278
279 for (i=0; i<table->hash_size; i++) {
280 hash_list = &table->hash_table[i];
281 J_ASSERT (list_empty(hash_list));
282 }
283
284 kfree(table->hash_table);
285 kmem_cache_free(jbd2_revoke_table_cache, table);
286 journal->j_revoke = NULL;
287
288 table = journal->j_revoke_table[1];
289 if (!table)
290 return;
291
292 for (i=0; i<table->hash_size; i++) {
293 hash_list = &table->hash_table[i];
294 J_ASSERT (list_empty(hash_list));
295 }
296
297 kfree(table->hash_table);
298 kmem_cache_free(jbd2_revoke_table_cache, table);
299 journal->j_revoke = NULL;
300}
301
302
303#ifdef __KERNEL__
304
305/*
306 * jbd2_journal_revoke: revoke a given buffer_head from the journal. This
307 * prevents the block from being replayed during recovery if we take a
308 * crash after this current transaction commits. Any subsequent
309 * metadata writes of the buffer in this transaction cancel the
310 * revoke.
311 *
312 * Note that this call may block --- it is up to the caller to make
313 * sure that there are no further calls to journal_write_metadata
314 * before the revoke is complete. In ext3, this implies calling the
315 * revoke before clearing the block bitmap when we are deleting
316 * metadata.
317 *
318 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
319 * parameter, but does _not_ forget the buffer_head if the bh was only
320 * found implicitly.
321 *
322 * bh_in may not be a journalled buffer - it may have come off
323 * the hash tables without an attached journal_head.
324 *
325 * If bh_in is non-NULL, jbd2_journal_revoke() will decrement its b_count
326 * by one.
327 */
328
329int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
330 struct buffer_head *bh_in)
331{
332 struct buffer_head *bh = NULL;
333 journal_t *journal;
334 struct block_device *bdev;
335 int err;
336
337 might_sleep();
338 if (bh_in)
339 BUFFER_TRACE(bh_in, "enter");
340
341 journal = handle->h_transaction->t_journal;
342 if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
343 J_ASSERT (!"Cannot set revoke feature!");
344 return -EINVAL;
345 }
346
347 bdev = journal->j_fs_dev;
348 bh = bh_in;
349
350 if (!bh) {
351 bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
352 if (bh)
353 BUFFER_TRACE(bh, "found on hash");
354 }
355#ifdef JBD_EXPENSIVE_CHECKING
356 else {
357 struct buffer_head *bh2;
358
359 /* If there is a different buffer_head lying around in
360 * memory anywhere... */
361 bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
362 if (bh2) {
363 /* ... and it has RevokeValid status... */
364 if (bh2 != bh && buffer_revokevalid(bh2))
365 /* ...then it better be revoked too,
366 * since it's illegal to create a revoke
367 * record against a buffer_head which is
368 * not marked revoked --- that would
369 * risk missing a subsequent revoke
370 * cancel. */
371 J_ASSERT_BH(bh2, buffer_revoked(bh2));
372 put_bh(bh2);
373 }
374 }
375#endif
376
377 /* We really ought not ever to revoke twice in a row without
378 first having the revoke cancelled: it's illegal to free a
379 block twice without allocating it in between! */
380 if (bh) {
381 if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
382 "inconsistent data on disk")) {
383 if (!bh_in)
384 brelse(bh);
385 return -EIO;
386 }
387 set_buffer_revoked(bh);
388 set_buffer_revokevalid(bh);
389 if (bh_in) {
390 BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
391 jbd2_journal_forget(handle, bh_in);
392 } else {
393 BUFFER_TRACE(bh, "call brelse");
394 __brelse(bh);
395 }
396 }
397
398	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n", blocknr, bh_in);
399 err = insert_revoke_hash(journal, blocknr,
400 handle->h_transaction->t_tid);
401 BUFFER_TRACE(bh_in, "exit");
402 return err;
403}
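
/*
 * Editorial usage sketch (hypothetical filesystem-side pattern, error
 * handling elided): per the rules above, a filesystem deleting metadata
 * revokes the block before clearing its bit in the block bitmap:
 *
 *	err = jbd2_journal_revoke(handle, blocknr, bh);
 *	if (!err)
 *		err = fs_clear_block_bitmap(handle, blocknr);
 *
 * fs_clear_block_bitmap() is a hypothetical helper, not jbd2 API.
 */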
404
405/*
406 * Cancel an outstanding revoke. For use only internally by the
407 * journaling code (called from jbd2_journal_get_write_access).
408 *
409 * We trust buffer_revoked() on the buffer if the buffer is already
410 * being journaled: if there is no revoke pending on the buffer, then we
411 * don't do anything here.
412 *
413 * This would break if it were possible for a buffer to be revoked and
414 * discarded, and then reallocated within the same transaction. In such
415 * a case we would have lost the revoked bit, but when we arrived here
416 * the second time we would still have a pending revoke to cancel. So,
417 * do not trust the Revoked bit on buffers unless RevokeValid is also
418 * set.
419 *
420 * The caller must have the journal locked.
421 */
422int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
423{
424 struct jbd2_revoke_record_s *record;
425 journal_t *journal = handle->h_transaction->t_journal;
426 int need_cancel;
427 int did_revoke = 0; /* akpm: debug */
428 struct buffer_head *bh = jh2bh(jh);
429
430 jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
431
432 /* Is the existing Revoke bit valid? If so, we trust it, and
433 * only perform the full cancel if the revoke bit is set. If
434 * not, we can't trust the revoke bit, and we need to do the
435 * full search for a revoke record. */
436 if (test_set_buffer_revokevalid(bh)) {
437 need_cancel = test_clear_buffer_revoked(bh);
438 } else {
439 need_cancel = 1;
440 clear_buffer_revoked(bh);
441 }
442
443 if (need_cancel) {
444 record = find_revoke_record(journal, bh->b_blocknr);
445 if (record) {
446 jbd_debug(4, "cancelled existing revoke on "
447 "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
448 spin_lock(&journal->j_revoke_lock);
449 list_del(&record->hash);
450 spin_unlock(&journal->j_revoke_lock);
451 kmem_cache_free(jbd2_revoke_record_cache, record);
452 did_revoke = 1;
453 }
454 }
455
456#ifdef JBD_EXPENSIVE_CHECKING
457 /* There better not be one left behind by now! */
458 record = find_revoke_record(journal, bh->b_blocknr);
459 J_ASSERT_JH(jh, record == NULL);
460#endif
461
462 /* Finally, have we just cleared revoke on an unhashed
463 * buffer_head? If so, we'd better make sure we clear the
464 * revoked status on any hashed alias too, otherwise the revoke
465 * state machine will get very upset later on. */
466 if (need_cancel) {
467 struct buffer_head *bh2;
468 bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
469 if (bh2) {
470 if (bh2 != bh)
471 clear_buffer_revoked(bh2);
472 __brelse(bh2);
473 }
474 }
475 return did_revoke;
476}
477
478/* jbd2_journal_switch_revoke_table: select j_revoke for the next
479 * transaction; we do not want to suspend any processing until all
480 * revokes are written. -bzzz
481 */
482void jbd2_journal_switch_revoke_table(journal_t *journal)
483{
484 int i;
485
486 if (journal->j_revoke == journal->j_revoke_table[0])
487 journal->j_revoke = journal->j_revoke_table[1];
488 else
489 journal->j_revoke = journal->j_revoke_table[0];
490
491 for (i = 0; i < journal->j_revoke->hash_size; i++)
492 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
493}
494
495/*
496 * Write revoke records to the journal for all entries in the current
497 * revoke hash, deleting the entries as we go.
498 *
499 * Called with the journal lock held.
500 */
501
502void jbd2_journal_write_revoke_records(journal_t *journal,
503 transaction_t *transaction)
504{
505 struct journal_head *descriptor;
506 struct jbd2_revoke_record_s *record;
507 struct jbd2_revoke_table_s *revoke;
508 struct list_head *hash_list;
509 int i, offset, count;
510
511 descriptor = NULL;
512 offset = 0;
513 count = 0;
514
515 /* select revoke table for committing transaction */
516 revoke = journal->j_revoke == journal->j_revoke_table[0] ?
517 journal->j_revoke_table[1] : journal->j_revoke_table[0];
518
519 for (i = 0; i < revoke->hash_size; i++) {
520 hash_list = &revoke->hash_table[i];
521
522 while (!list_empty(hash_list)) {
523 record = (struct jbd2_revoke_record_s *)
524 hash_list->next;
525 write_one_revoke_record(journal, transaction,
526 &descriptor, &offset,
527 record);
528 count++;
529 list_del(&record->hash);
530 kmem_cache_free(jbd2_revoke_record_cache, record);
531 }
532 }
533 if (descriptor)
534 flush_descriptor(journal, descriptor, offset);
535 jbd_debug(1, "Wrote %d revoke records\n", count);
536}
537
538/*
539 * Write out one revoke record. We need to create a new descriptor
540 * block if the old one is full or if we have not already created one.
541 */
542
543static void write_one_revoke_record(journal_t *journal,
544 transaction_t *transaction,
545 struct journal_head **descriptorp,
546 int *offsetp,
547 struct jbd2_revoke_record_s *record)
548{
549 struct journal_head *descriptor;
550 int offset;
551 journal_header_t *header;
552
553 /* If we are already aborting, this all becomes a noop. We
554 still need to go round the loop in
555 jbd2_journal_write_revoke_records in order to free all of the
556 revoke records: only the IO to the journal is omitted. */
557 if (is_journal_aborted(journal))
558 return;
559
560 descriptor = *descriptorp;
561 offset = *offsetp;
562
563 /* Make sure we have a descriptor with space left for the record */
564 if (descriptor) {
565 if (offset == journal->j_blocksize) {
566 flush_descriptor(journal, descriptor, offset);
567 descriptor = NULL;
568 }
569 }
570
571 if (!descriptor) {
572 descriptor = jbd2_journal_get_descriptor_buffer(journal);
573 if (!descriptor)
574 return;
575 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
576 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
577 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
578 header->h_sequence = cpu_to_be32(transaction->t_tid);
579
580 /* Record it so that we can wait for IO completion later */
581 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
582 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
583
584 offset = sizeof(jbd2_journal_revoke_header_t);
585 *descriptorp = descriptor;
586 }
587
588 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
589 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
590 cpu_to_be64(record->blocknr);
591 offset += 8;
592
593 } else {
594 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
595 cpu_to_be32(record->blocknr);
596 offset += 4;
597 }
598
599 *offsetp = offset;
600}
601
602/*
603 * Flush a revoke descriptor out to the journal. If we are aborting,
604 * this is a noop; otherwise we are generating a buffer which needs to
605 * be waited for during commit, so it has to go onto the appropriate
606 * journal buffer list.
607 */
608
609static void flush_descriptor(journal_t *journal,
610 struct journal_head *descriptor,
611 int offset)
612{
613 jbd2_journal_revoke_header_t *header;
614 struct buffer_head *bh = jh2bh(descriptor);
615
616 if (is_journal_aborted(journal)) {
617 put_bh(bh);
618 return;
619 }
620
621 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
622 header->r_count = cpu_to_be32(offset);
623 set_buffer_jwrite(bh);
624 BUFFER_TRACE(bh, "write");
625 set_buffer_dirty(bh);
626 ll_rw_block(SWRITE, 1, &bh);
627}
628#endif
629
630/*
631 * Revoke support for recovery.
632 *
633 * Recovery needs to be able to:
634 *
635 * record all revoke records, including the tid of the latest instance
636 * of each revoke in the journal
637 *
638 * check whether a given block in a given transaction should be replayed
639 * (ie. has not been revoked by a revoke record in that or a subsequent
640 * transaction)
641 *
642 * empty the revoke table after recovery.
643 */
644
645/*
646 * First, setting revoke records. We create a new revoke record for
647 * every block ever revoked in the log as we scan it for recovery, and
648 * we update the existing records if we find multiple revokes for a
649 * single block.
650 */
651
652int jbd2_journal_set_revoke(journal_t *journal,
653 unsigned long long blocknr,
654 tid_t sequence)
655{
656 struct jbd2_revoke_record_s *record;
657
658 record = find_revoke_record(journal, blocknr);
659 if (record) {
660 /* If we have multiple occurrences, only record the
661 * latest sequence number in the hashed record */
662 if (tid_gt(sequence, record->sequence))
663 record->sequence = sequence;
664 return 0;
665 }
666 return insert_revoke_hash(journal, blocknr, sequence);
667}
668
669/*
670 * Test revoke records. For a given block referenced in the log, has
671 * that block been revoked? A revoke record with a given transaction
672 * sequence number revokes all blocks in that transaction and earlier
673 * ones, but later transactions still need to be replayed.
674 */
675
676int jbd2_journal_test_revoke(journal_t *journal,
677 unsigned long long blocknr,
678 tid_t sequence)
679{
680 struct jbd2_revoke_record_s *record;
681
682 record = find_revoke_record(journal, blocknr);
683 if (!record)
684 return 0;
685 if (tid_gt(sequence, record->sequence))
686 return 0;
687 return 1;
688}
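
/*
 * Editorial worked example (assumed numbers): if block 123 carries a
 * revoke record with sequence 12, then during replay:
 *
 *	jbd2_journal_test_revoke(journal, 123, 10) == 1   (skip: 10 <= 12)
 *	jbd2_journal_test_revoke(journal, 123, 12) == 1   (skip)
 *	jbd2_journal_test_revoke(journal, 123, 13) == 0   (replay: 13 > 12)
 */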
689
690/*
691 * Finally, once recovery is over, we need to clear the revoke table so
692 * that it can be reused by the running filesystem.
693 */
694
695void jbd2_journal_clear_revoke(journal_t *journal)
696{
697 int i;
698 struct list_head *hash_list;
699 struct jbd2_revoke_record_s *record;
700 struct jbd2_revoke_table_s *revoke;
701
702 revoke = journal->j_revoke;
703
704 for (i = 0; i < revoke->hash_size; i++) {
705 hash_list = &revoke->hash_table[i];
706 while (!list_empty(hash_list)) {
707 record = (struct jbd2_revoke_record_s*) hash_list->next;
708 list_del(&record->hash);
709 kmem_cache_free(jbd2_revoke_record_cache, record);
710 }
711 }
712}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..b6cf2be845a1
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2081 @@
1/*
2 * linux/fs/jbd2/transaction.c
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system.
14 *
15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the
17 * filesystem).
18 */
19
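/*
 * Editorial sketch (not in the original file): the handle lifecycle this
 * file implements, using the public API defined below (error handling
 * elided; jbd2_journal_stop() appears later in this file):
 *
 *	handle = jbd2_journal_start(journal, credits);
 *	err = jbd2_journal_get_write_access(handle, bh);
 *	... modify bh->b_data under the handle ...
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 *	err = jbd2_journal_stop(handle);
 */
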
20#include <linux/time.h>
21#include <linux/fs.h>
22#include <linux/jbd2.h>
23#include <linux/errno.h>
24#include <linux/slab.h>
25#include <linux/timer.h>
26#include <linux/smp_lock.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29
30/*
31 * jbd2_get_transaction: obtain a new transaction_t object.
32 *
33 * Simply allocate and initialise a new transaction. Create it in
34 * RUNNING state and add it to the current journal (which should not
35 * have an existing running transaction: we only make a new transaction
36 * once we have started to commit the old one).
37 *
38 * Preconditions:
39 * The journal MUST be locked. We don't perform atomic mallocs on the
40 * new transaction and we can't block without protecting against other
41 * processes trying to touch the journal while it is in transition.
42 *
43 * Called under j_state_lock
44 */
45
46static transaction_t *
47jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{
49 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING;
51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock);
54
55 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = transaction->t_expires;
57 add_timer(&journal->j_commit_timer);
58
59 J_ASSERT(journal->j_running_transaction == NULL);
60 journal->j_running_transaction = transaction;
61
62 return transaction;
63}
64
65/*
66 * Handle management.
67 *
68 * A handle_t is an object which represents a single atomic update to a
69 * filesystem, and which tracks all of the modifications which form part
70 * of that one update.
71 */
72
73/*
74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits.
78 */
79
80static int start_this_handle(journal_t *journal, handle_t *handle)
81{
82 transaction_t *transaction;
83 int needed;
84 int nblocks = handle->h_buffer_credits;
85 transaction_t *new_transaction = NULL;
86 int ret = 0;
87
88 if (nblocks > journal->j_max_transaction_buffers) {
89 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
90 current->comm, nblocks,
91 journal->j_max_transaction_buffers);
92 ret = -ENOSPC;
93 goto out;
94 }
95
96alloc_transaction:
97 if (!journal->j_running_transaction) {
98 new_transaction = jbd_kmalloc(sizeof(*new_transaction),
99 GFP_NOFS);
100 if (!new_transaction) {
101 ret = -ENOMEM;
102 goto out;
103 }
104 memset(new_transaction, 0, sizeof(*new_transaction));
105 }
106
107 jbd_debug(3, "New handle %p going live.\n", handle);
108
109repeat:
110
111 /*
112 * We need to hold j_state_lock until t_updates has been incremented,
113 * for proper journal barrier handling
114 */
115 spin_lock(&journal->j_state_lock);
116repeat_locked:
117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS;
121 goto out;
122 }
123
124 /* Wait on the journal's transaction barrier if necessary */
125 if (journal->j_barrier_count) {
126 spin_unlock(&journal->j_state_lock);
127 wait_event(journal->j_wait_transaction_locked,
128 journal->j_barrier_count == 0);
129 goto repeat;
130 }
131
132 if (!journal->j_running_transaction) {
133 if (!new_transaction) {
134 spin_unlock(&journal->j_state_lock);
135 goto alloc_transaction;
136 }
137 jbd2_get_transaction(journal, new_transaction);
138 new_transaction = NULL;
139 }
140
141 transaction = journal->j_running_transaction;
142
143 /*
144 * If the current transaction is locked down for commit, wait for the
145 * lock to be released.
146 */
147 if (transaction->t_state == T_LOCKED) {
148 DEFINE_WAIT(wait);
149
150 prepare_to_wait(&journal->j_wait_transaction_locked,
151 &wait, TASK_UNINTERRUPTIBLE);
152 spin_unlock(&journal->j_state_lock);
153 schedule();
154 finish_wait(&journal->j_wait_transaction_locked, &wait);
155 goto repeat;
156 }
157
158 /*
159 * If there is not enough space left in the log to write all potential
160 * buffers requested by this operation, we need to stall pending a log
161 * checkpoint to free some more log space.
162 */
163 spin_lock(&transaction->t_handle_lock);
164 needed = transaction->t_outstanding_credits + nblocks;
165
166 if (needed > journal->j_max_transaction_buffers) {
167 /*
168 * If the current transaction is already too large, then start
169 * to commit it: we can then go back and attach this handle to
170 * a new transaction.
171 */
172 DEFINE_WAIT(wait);
173
174 jbd_debug(2, "Handle %p starting new commit...\n", handle);
175 spin_unlock(&transaction->t_handle_lock);
176 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
177 TASK_UNINTERRUPTIBLE);
178 __jbd2_log_start_commit(journal, transaction->t_tid);
179 spin_unlock(&journal->j_state_lock);
180 schedule();
181 finish_wait(&journal->j_wait_transaction_locked, &wait);
182 goto repeat;
183 }
184
185 /*
186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also
189 * associated with a committing transaction creates a deadlock,
190 * so commit simply cannot force through checkpoints.
191 *
192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction.
195 *
196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for
198 * those buffers when checkpointing.
199 */
200
201 /*
202 * @@@ AKPM: This seems rather over-defensive. We're giving commit
203 * a _lot_ of headroom: 1/4 of the journal plus the size of
204 * the committing transaction. Really, we only need to give it
205 * committing_transaction->t_outstanding_credits plus "enough" for
206 * the log control blocks.
207 * Also, this test is inconsistent with the matching one in
208 * jbd2_journal_extend().
209 */
210 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
211 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
212 spin_unlock(&transaction->t_handle_lock);
213 __jbd2_log_wait_for_space(journal);
214 goto repeat_locked;
215 }
216
217 /* OK, account for the buffers that this operation expects to
218 * use and add the handle to the running transaction. */
219
220 handle->h_transaction = transaction;
221 transaction->t_outstanding_credits += nblocks;
222 transaction->t_updates++;
223 transaction->t_handle_count++;
224 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
225 handle, nblocks, transaction->t_outstanding_credits,
226 __jbd2_log_space_left(journal));
227 spin_unlock(&transaction->t_handle_lock);
228 spin_unlock(&journal->j_state_lock);
229out:
230 if (unlikely(new_transaction)) /* It's usually NULL */
231 kfree(new_transaction);
232 return ret;
233}
234
235/* Allocate a new handle. This should probably be in a slab... */
236static handle_t *new_handle(int nblocks)
237{
238 handle_t *handle = jbd_alloc_handle(GFP_NOFS);
239 if (!handle)
240 return NULL;
241 memset(handle, 0, sizeof(*handle));
242 handle->h_buffer_credits = nblocks;
243 handle->h_ref = 1;
244
245 return handle;
246}
247
248/**
249 * handle_t *jbd2_journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffers we might modify
252 *
253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee
255 * that much space.
256 *
257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked.
259 *
260 * Return a pointer to a newly allocated handle, or NULL on failure
261 */
262handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
263{
264 handle_t *handle = journal_current_handle();
265 int err;
266
267 if (!journal)
268 return ERR_PTR(-EROFS);
269
270 if (handle) {
271 J_ASSERT(handle->h_transaction->t_journal == journal);
272 handle->h_ref++;
273 return handle;
274 }
275
276 handle = new_handle(nblocks);
277 if (!handle)
278 return ERR_PTR(-ENOMEM);
279
280 current->journal_info = handle;
281
282 err = start_this_handle(journal, handle);
283 if (err < 0) {
284 jbd_free_handle(handle);
285 current->journal_info = NULL;
286 handle = ERR_PTR(err);
287 }
288 return handle;
289}
290
291/**
292 * int jbd2_journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by.
295 *
296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modifications in advance, but can
299 * extend its credit if it needs more.
300 *
301 * jbd2_journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee the allocation - this is best-effort only.
303 * The calling process MUST be able to deal cleanly with a failure to
304 * extend here.
305 *
306 * Return 0 on success, non-zero otherwise:
307 *
308 * return code < 0 implies an error
309 * return code > 0 implies normal transaction-full status.
310 */
311int jbd2_journal_extend(handle_t *handle, int nblocks)
312{
313 transaction_t *transaction = handle->h_transaction;
314 journal_t *journal = transaction->t_journal;
315 int result;
316 int wanted;
317
318 result = -EIO;
319 if (is_handle_aborted(handle))
320 goto out;
321
322 result = 1;
323
324 spin_lock(&journal->j_state_lock);
325
326 /* Don't extend a locked-down transaction! */
327 if (handle->h_transaction->t_state != T_RUNNING) {
328 jbd_debug(3, "denied handle %p %d blocks: "
329 "transaction not running\n", handle, nblocks);
330 goto error_out;
331 }
332
333 spin_lock(&transaction->t_handle_lock);
334 wanted = transaction->t_outstanding_credits + nblocks;
335
336 if (wanted > journal->j_max_transaction_buffers) {
337 jbd_debug(3, "denied handle %p %d blocks: "
338 "transaction too large\n", handle, nblocks);
339 goto unlock;
340 }
341
342 if (wanted > __jbd2_log_space_left(journal)) {
343 jbd_debug(3, "denied handle %p %d blocks: "
344 "insufficient log space\n", handle, nblocks);
345 goto unlock;
346 }
347
348 handle->h_buffer_credits += nblocks;
349 transaction->t_outstanding_credits += nblocks;
350 result = 0;
351
352 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
353unlock:
354 spin_unlock(&transaction->t_handle_lock);
355error_out:
356 spin_unlock(&journal->j_state_lock);
357out:
358 return result;
359}
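
/*
 * Editorial usage sketch (the customary caller-side pattern, error
 * handling elided): handle the tri-state return by falling back to
 * jbd2_journal_restart(), defined just below, when the running
 * transaction is full:
 *
 *	err = jbd2_journal_extend(handle, nblocks);
 *	if (err > 0)
 *		err = jbd2_journal_restart(handle, nblocks);
 *	if (err)
 *		goto fail;
 */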
360
361
362/**
363 * int jbd2_journal_restart() - restart a handle.
364 * @handle: handle to restart
365 * @nblocks: nr credits requested
366 *
367 * Restart a handle for a multi-transaction filesystem
368 * operation.
369 *
370 * If the jbd2_journal_extend() call above fails to grant new buffer credits
371 * to a running handle, a call to jbd2_journal_restart will commit the
372 * handle's transaction so far and reattach the handle to a new
373 * transaction capable of guaranteeing the requested number of
374 * credits.
375 */
376
377int jbd2_journal_restart(handle_t *handle, int nblocks)
378{
379 transaction_t *transaction = handle->h_transaction;
380 journal_t *journal = transaction->t_journal;
381 int ret;
382
383 /* If we've had an abort of any type, don't even think about
384 * actually doing the restart! */
385 if (is_handle_aborted(handle))
386 return 0;
387
388 /*
389 * First unlink the handle from its current transaction, and start the
390 * commit on that.
391 */
392 J_ASSERT(transaction->t_updates > 0);
393 J_ASSERT(journal_current_handle() == handle);
394
395 spin_lock(&journal->j_state_lock);
396 spin_lock(&transaction->t_handle_lock);
397 transaction->t_outstanding_credits -= handle->h_buffer_credits;
398 transaction->t_updates--;
399
400 if (!transaction->t_updates)
401 wake_up(&journal->j_wait_updates);
402 spin_unlock(&transaction->t_handle_lock);
403
404 jbd_debug(2, "restarting handle %p\n", handle);
405 __jbd2_log_start_commit(journal, transaction->t_tid);
406 spin_unlock(&journal->j_state_lock);
407
408 handle->h_buffer_credits = nblocks;
409 ret = start_this_handle(journal, handle);
410 return ret;
411}
412
413
414/**
415 * void jbd2_journal_lock_updates () - establish a transaction barrier.
416 * @journal: Journal to establish a barrier on.
417 *
418 * This locks out any further updates from being started, and blocks
419 * until all existing updates have completed, returning only once the
420 * journal is in a quiescent state with no updates running.
421 *
422 * The journal lock should not be held on entry.
423 */
424void jbd2_journal_lock_updates(journal_t *journal)
425{
426 DEFINE_WAIT(wait);
427
428 spin_lock(&journal->j_state_lock);
429 ++journal->j_barrier_count;
430
431 /* Wait until there are no running updates */
432 while (1) {
433 transaction_t *transaction = journal->j_running_transaction;
434
435 if (!transaction)
436 break;
437
438 spin_lock(&transaction->t_handle_lock);
439 if (!transaction->t_updates) {
440 spin_unlock(&transaction->t_handle_lock);
441 break;
442 }
443 prepare_to_wait(&journal->j_wait_updates, &wait,
444 TASK_UNINTERRUPTIBLE);
445 spin_unlock(&transaction->t_handle_lock);
446 spin_unlock(&journal->j_state_lock);
447 schedule();
448 finish_wait(&journal->j_wait_updates, &wait);
449 spin_lock(&journal->j_state_lock);
450 }
451 spin_unlock(&journal->j_state_lock);
452
453 /*
454 * We have now established a barrier against other normal updates, but
455 * we also need to barrier against other jbd2_journal_lock_updates() calls
456 * to make sure that we serialise special journal-locked operations
457 * too.
458 */
459 mutex_lock(&journal->j_barrier);
460}
461
462/**
463 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on.
465 *
466 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
467 *
468 * Should be called without the journal lock held.
469 */
470void jbd2_journal_unlock_updates (journal_t *journal)
471{
472 J_ASSERT(journal->j_barrier_count != 0);
473
474 mutex_unlock(&journal->j_barrier);
475 spin_lock(&journal->j_state_lock);
476 --journal->j_barrier_count;
477 spin_unlock(&journal->j_state_lock);
478 wake_up(&journal->j_wait_transaction_locked);
479}
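
/*
 * Editorial sketch (assumed freeze-style caller): the barrier pair
 * brackets journal-wide operations, typically together with
 * jbd2_journal_flush() from journal.c:
 *
 *	jbd2_journal_lock_updates(journal);
 *	err = jbd2_journal_flush(journal);
 *	... operate with a quiescent journal ...
 *	jbd2_journal_unlock_updates(journal);
 */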
480
481/*
482 * Report any unexpected dirty buffers which turn up. Normally those
483 * indicate an error, but they can occur if the user is running (say)
484 * tune2fs to modify the live filesystem, so we need the option of
485 * continuing as gracefully as possible.
486 *
487 * The caller should already hold the journal lock and
488 * j_list_lock spinlock: most callers will need those anyway
489 * in order to probe the buffer's journaling state safely.
490 */
491static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
492{
493 int jlist;
494
495 /* If this buffer is one which might reasonably be dirty
496 * --- ie. data, or not part of this journal --- then
497 * we're OK to leave it alone, but otherwise we need to
498 * move the dirty bit to the journal's own internal
499 * JBDDirty bit. */
500 jlist = jh->b_jlist;
501
502 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
503 jlist == BJ_Shadow || jlist == BJ_Forget) {
504 struct buffer_head *bh = jh2bh(jh);
505
506 if (test_clear_buffer_dirty(bh))
507 set_buffer_jbddirty(bh);
508 }
509}
510
511/*
512 * If the buffer is already part of the current transaction, then there
513 * is nothing we need to do. If it is already part of a prior
514 * transaction which we are still committing to disk, then we need to
515 * make sure that we do not overwrite the old copy: we do copy-out to
516 * preserve the copy going to disk. We also account the buffer against
517 * the handle's metadata buffer credits (unless the buffer is already
518 * part of the transaction, that is).
519 *
520 */
521static int
522do_get_write_access(handle_t *handle, struct journal_head *jh,
523 int force_copy)
524{
525 struct buffer_head *bh;
526 transaction_t *transaction;
527 journal_t *journal;
528 int error;
529 char *frozen_buffer = NULL;
530 int need_copy = 0;
531
532 if (is_handle_aborted(handle))
533 return -EROFS;
534
535 transaction = handle->h_transaction;
536 journal = transaction->t_journal;
537
538 jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
539
540 JBUFFER_TRACE(jh, "entry");
541repeat:
542 bh = jh2bh(jh);
543
544 /* @@@ Need to check for errors here at some point. */
545
546 lock_buffer(bh);
547 jbd_lock_bh_state(bh);
548
549 /* We now hold the buffer lock so it is safe to query the buffer
550 * state. Is the buffer dirty?
551 *
552 * If so, there are two possibilities. The buffer may be
553 * non-journaled, and undergoing a quite legitimate writeback.
554 * Otherwise, it is journaled, and we don't expect dirty buffers
555 * in that state (the buffers should be marked JBDDirty
556 * instead.) So either the IO is being done under our own
557 * control and this is a bug, or it's a third party IO such as
558 * dump(8) (which may leave the buffer scheduled for read ---
559 * ie. locked but not dirty) or tune2fs (which may actually have
560 * the buffer dirtied, ugh.) */
561
562 if (buffer_dirty(bh)) {
563 /*
564 * First question: is this buffer already part of the current
565 * transaction or the existing committing transaction?
566 */
567 if (jh->b_transaction) {
568 J_ASSERT_JH(jh,
569 jh->b_transaction == transaction ||
570 jh->b_transaction ==
571 journal->j_committing_transaction);
572 if (jh->b_next_transaction)
573 J_ASSERT_JH(jh, jh->b_next_transaction ==
574 transaction);
575 }
576 /*
577 * In any case we need to clean the dirty flag and we must
578 * do it under the buffer lock to be sure we don't race
579 * with running write-out.
580 */
581 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
582 jbd_unexpected_dirty_buffer(jh);
583 }
584
585 unlock_buffer(bh);
586
587 error = -EROFS;
588 if (is_handle_aborted(handle)) {
589 jbd_unlock_bh_state(bh);
590 goto out;
591 }
592 error = 0;
593
594 /*
595 * The buffer is already part of this transaction if b_transaction or
596 * b_next_transaction points to it
597 */
598 if (jh->b_transaction == transaction ||
599 jh->b_next_transaction == transaction)
600 goto done;
601
602 /*
603 * If there is already a copy-out version of this buffer, then we don't
604 * need to make another one
605 */
606 if (jh->b_frozen_data) {
607 JBUFFER_TRACE(jh, "has frozen data");
608 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
609 jh->b_next_transaction = transaction;
610 goto done;
611 }
612
613 /* Is there data here we need to preserve? */
614
615 if (jh->b_transaction && jh->b_transaction != transaction) {
616 JBUFFER_TRACE(jh, "owned by older transaction");
617 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
618 J_ASSERT_JH(jh, jh->b_transaction ==
619 journal->j_committing_transaction);
620
621 /* There is one case we have to be very careful about.
622 * If the committing transaction is currently writing
623 * this buffer out to disk and has NOT made a copy-out,
624 * then we cannot modify the buffer contents at all
625 * right now. The essence of copy-out is that it is the
626 * extra copy, not the primary copy, which gets
627 * journaled. If the primary copy is already going to
628 * disk then we cannot do copy-out here. */
629
630 if (jh->b_jlist == BJ_Shadow) {
631 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
632 wait_queue_head_t *wqh;
633
634 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
635
636 JBUFFER_TRACE(jh, "on shadow: sleep");
637 jbd_unlock_bh_state(bh);
638 /* commit wakes up all shadow buffers after IO */
639 for ( ; ; ) {
640 prepare_to_wait(wqh, &wait.wait,
641 TASK_UNINTERRUPTIBLE);
642 if (jh->b_jlist != BJ_Shadow)
643 break;
644 schedule();
645 }
646 finish_wait(wqh, &wait.wait);
647 goto repeat;
648 }
649
650 /* Only do the copy if the currently-owning transaction
651 * still needs it. If it is on the Forget list, the
652 * committing transaction is past that stage. The
653 * buffer had better remain locked during the kmalloc,
654 * but that should be true --- we hold the journal lock
655 * still and the buffer is already on the BUF_JOURNAL
656 * list so won't be flushed.
657 *
658 * Subtle point, though: if this is a get_undo_access,
659 * then we will be relying on the frozen_data to contain
660 * the new value of the committed_data record after the
661 * transaction, so we HAVE to force the frozen_data copy
662 * in that case. */
663
664 if (jh->b_jlist != BJ_Forget || force_copy) {
665 JBUFFER_TRACE(jh, "generate frozen data");
666 if (!frozen_buffer) {
667 JBUFFER_TRACE(jh, "allocate memory for buffer");
668 jbd_unlock_bh_state(bh);
669 frozen_buffer =
670 jbd2_slab_alloc(jh2bh(jh)->b_size,
671 GFP_NOFS);
672 if (!frozen_buffer) {
673 printk(KERN_EMERG
674 "%s: OOM for frozen_buffer\n",
675 __FUNCTION__);
676 JBUFFER_TRACE(jh, "oom!");
677 error = -ENOMEM;
678 jbd_lock_bh_state(bh);
679 goto done;
680 }
681 goto repeat;
682 }
683 jh->b_frozen_data = frozen_buffer;
684 frozen_buffer = NULL;
685 need_copy = 1;
686 }
687 jh->b_next_transaction = transaction;
688 }
689
690
691 /*
692 * Finally, if the buffer is not journaled right now, we need to make
693 * sure it doesn't get written to disk before the caller actually
694 * commits the new data
695 */
696 if (!jh->b_transaction) {
697 JBUFFER_TRACE(jh, "no transaction");
698 J_ASSERT_JH(jh, !jh->b_next_transaction);
699 jh->b_transaction = transaction;
700 JBUFFER_TRACE(jh, "file as BJ_Reserved");
701 spin_lock(&journal->j_list_lock);
702 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
703 spin_unlock(&journal->j_list_lock);
704 }
705
706done:
707 if (need_copy) {
708 struct page *page;
709 int offset;
710 char *source;
711
712 J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
713 "Possible IO failure.\n");
714 page = jh2bh(jh)->b_page;
715 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
716 source = kmap_atomic(page, KM_USER0);
717 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
718 kunmap_atomic(source, KM_USER0);
719 }
720 jbd_unlock_bh_state(bh);
721
722 /*
723 * If we are about to journal a buffer, then any revoke pending on it is
724 * no longer valid
725 */
726 jbd2_journal_cancel_revoke(handle, jh);
727
728out:
729 if (unlikely(frozen_buffer)) /* It's usually NULL */
730 jbd2_slab_free(frozen_buffer, bh->b_size);
731
732 JBUFFER_TRACE(jh, "exit");
733 return error;
734}
735
736/**
737 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
738 * @handle: transaction to add buffer modifications to
739 * @bh: bh to be used for metadata writes
740 *
741 *
742 * Returns an error code or 0 on success.
743 *
744 * In full data journalling mode the buffer may be of type BJ_SyncData,
745 * because we're write()ing a buffer which is also part of a shared mapping.
746 */
747
748int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
749{
750 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
751 int rc;
752
753 /* We do not want to get caught playing with fields which the
754 * log thread also manipulates. Make sure that the buffer
755 * completes any outstanding IO before proceeding. */
756 rc = do_get_write_access(handle, jh, 0);
757 jbd2_journal_put_journal_head(jh);
758 return rc;
759}
760
761
762/*
763 * When the user wants to journal a newly created buffer_head
764 * (ie. getblk() returned a new buffer and we are going to populate it
765 * manually rather than reading off disk), then we need to keep the
766 * buffer_head locked until it has been completely filled with new
767 * data. In this case, we should be able to make the assertion that
768 * the bh is not already part of an existing transaction.
769 *
770 * The buffer should already be locked by the caller by this point.
771 * There is no lock ranking violation: it was a newly created,
772 * unlocked buffer beforehand. */
773
774/**
775 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
776 * @handle: transaction to add the new buffer to
777 * @bh: new buffer.
778 *
779 * Call this if you create a new bh.
780 */
781int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
782{
783 transaction_t *transaction = handle->h_transaction;
784 journal_t *journal = transaction->t_journal;
785 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
786 int err;
787
788 jbd_debug(5, "journal_head %p\n", jh);
789 err = -EROFS;
790 if (is_handle_aborted(handle))
791 goto out;
792 err = 0;
793
794 JBUFFER_TRACE(jh, "entry");
795 /*
796 * The buffer may already belong to this transaction due to pre-zeroing
797 * in the filesystem's new_block code. It may also be on the previous,
798 * committing transaction's lists, but it HAS to be in Forget state in
799 * that case: the transaction must have deleted the buffer for it to be
800 * reused here.
801 */
802 jbd_lock_bh_state(bh);
803 spin_lock(&journal->j_list_lock);
804 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
805 jh->b_transaction == NULL ||
806 (jh->b_transaction == journal->j_committing_transaction &&
807 jh->b_jlist == BJ_Forget)));
808
809 J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
810 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
811
812 if (jh->b_transaction == NULL) {
813 jh->b_transaction = transaction;
814 JBUFFER_TRACE(jh, "file as BJ_Reserved");
815 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
816 } else if (jh->b_transaction == journal->j_committing_transaction) {
817 JBUFFER_TRACE(jh, "set next transaction");
818 jh->b_next_transaction = transaction;
819 }
820 spin_unlock(&journal->j_list_lock);
821 jbd_unlock_bh_state(bh);
822
823 /*
824 * akpm: I added this. ext3_alloc_branch can pick up new indirect
825 * blocks which contain freed but then revoked metadata. We need
826 * to cancel the revoke in case we end up freeing it yet again
827 * and then reallocating it as data - this would cause a second revoke,
828 * which hits an assertion error.
829 */
830 JBUFFER_TRACE(jh, "cancelling revoke");
831 jbd2_journal_cancel_revoke(handle, jh);
832 jbd2_journal_put_journal_head(jh);
833out:
834 return err;
835}
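
/*
 * Editorial usage sketch (hypothetical allocation path, error handling
 * elided): a filesystem populating a freshly allocated block by hand
 * keeps the buffer locked across the call, as asserted above:
 *
 *	bh = sb_getblk(sb, blocknr);
 *	lock_buffer(bh);
 *	err = jbd2_journal_get_create_access(handle, bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	err = jbd2_journal_dirty_metadata(handle, bh);
 */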
836
837/**
838 * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
839 * non-rewindable consequences
840 * @handle: transaction
841 * @bh: buffer to undo
842 *
843 *
844 * Sometimes there is a need to distinguish between metadata which has
845 * been committed to disk and that which has not. The ext3fs code uses
846 * this for freeing and allocating space, we have to make sure that we
847 * do not reuse freed space until the deallocation has been committed,
848 * since if we overwrote that space we would make the delete
849 * un-rewindable in case of a crash.
850 *
851 * To deal with that, jbd2_journal_get_undo_access requests write access to a
852 * buffer for parts of non-rewindable operations such as delete
853 * operations on the bitmaps. The journaling code must keep a copy of
854 * the buffer's contents prior to the undo_access call until such time
855 * as we know that the buffer has definitely been committed to disk.
856 *
857 * We never need to know which transaction the committed data is part
858 * of, buffers touched here are guaranteed to be dirtied later and so
859 * will be committed to a new transaction in due course, at which point
860 * we can discard the old committed data pointer.
861 *
862 * Returns error number or 0 on success.
863 */
864int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
865{
866 int err;
867 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
868 char *committed_data = NULL;
869
870 JBUFFER_TRACE(jh, "entry");
871
872 /*
873 * Do this first --- it can drop the journal lock, so we want to
874 * make sure that obtaining the committed_data is done
875 * atomically wrt. completion of any outstanding commits.
876 */
877 err = do_get_write_access(handle, jh, 1);
878 if (err)
879 goto out;
880
881repeat:
882 if (!jh->b_committed_data) {
883 committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
884 if (!committed_data) {
885 printk(KERN_EMERG "%s: No memory for committed data\n",
886 __FUNCTION__);
887 err = -ENOMEM;
888 goto out;
889 }
890 }
891
892 jbd_lock_bh_state(bh);
893 if (!jh->b_committed_data) {
894 /* Copy out the current buffer contents into the
895 * preserved, committed copy. */
896 JBUFFER_TRACE(jh, "generate b_committed data");
897 if (!committed_data) {
898 jbd_unlock_bh_state(bh);
899 goto repeat;
900 }
901
902 jh->b_committed_data = committed_data;
903 committed_data = NULL;
904 memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
905 }
906 jbd_unlock_bh_state(bh);
907out:
908 jbd2_journal_put_journal_head(jh);
909 if (unlikely(committed_data))
910 jbd2_slab_free(committed_data, bh->b_size);
911 return err;
912}
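
/*
 * Editorial usage sketch (hypothetical bitmap-freeing caller, per the
 * comment above; error handling elided):
 *
 *	err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *	... clear the freed bits in bitmap_bh->b_data ...
 *	err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 *
 * The b_committed_data copy made above preserves the pre-deletion
 * bitmap until the transaction is safely on disk.
 */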
913
914/**
915 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
916 * needs to be flushed before we can commit the
917 * current transaction.
918 * @handle: transaction
919 * @bh: bufferhead to mark
920 *
921 * The buffer is placed on the transaction's data list and is marked as
922 * belonging to the transaction.
923 *
924 * Returns error number or 0 on success.
925 *
926 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
927 * by kswapd.
928 */
929int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
930{
931 journal_t *journal = handle->h_transaction->t_journal;
932 int need_brelse = 0;
933 struct journal_head *jh;
934
935 if (is_handle_aborted(handle))
936 return 0;
937
938 jh = jbd2_journal_add_journal_head(bh);
939 JBUFFER_TRACE(jh, "entry");
940
941 /*
942 * The buffer could *already* be dirty. Writeout can start
943 * at any time.
944 */
945 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
946
947 /*
948 * What if the buffer is already part of a running transaction?
949 *
950 * There are two cases:
951 * 1) It is part of the current running transaction. Refile it,
952 * just in case we have allocated it as metadata, deallocated
953 * it, then reallocated it as data.
954 * 2) It is part of the previous, still-committing transaction.
955 * If all we want to do is to guarantee that the buffer will be
956 * written to disk before this new transaction commits, then
957 * being sure that the *previous* transaction has this same
958 * property is sufficient for us! Just leave it on its old
959 * transaction.
960 *
961 * In case (2), the buffer must not already exist as metadata
962 * --- that would violate write ordering (a transaction is free
963 * to write its data at any point, even before the previous
964 * committing transaction has committed). The caller must
965 * never, ever allow this to happen: there's nothing we can do
966 * about it in this layer.
967 */
968 jbd_lock_bh_state(bh);
969 spin_lock(&journal->j_list_lock);
970 if (jh->b_transaction) {
971 JBUFFER_TRACE(jh, "has transaction");
972 if (jh->b_transaction != handle->h_transaction) {
973 JBUFFER_TRACE(jh, "belongs to older transaction");
974 J_ASSERT_JH(jh, jh->b_transaction ==
975 journal->j_committing_transaction);
976
977 /* @@@ IS THIS TRUE ? */
978 /*
979 * Not any more. Scenario: someone does a write()
980 * in data=journal mode. The buffer's transaction has
981 * moved into commit. Then someone does another
982 * write() to the file. We do the frozen data copyout
983 * and set b_next_transaction to point to j_running_t.
984 * And while we're in that state, someone does a
985 * writepage() in an attempt to pageout the same area
986 * of the file via a shared mapping. At present that
987 * calls jbd2_journal_dirty_data(), and we get right here.
988 * It may be too late to journal the data. Simply
989 * falling through to the next test will suffice: the
990	 * data will be dirty and will be checkpointed. The
991 * ordering comments in the next comment block still
992 * apply.
993 */
994 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
995
996 /*
997 * If we're journalling data, and this buffer was
998 * subject to a write(), it could be metadata, forget
999 * or shadow against the committing transaction. Now,
1000 * someone has dirtied the same darn page via a mapping
1001 * and it is being writepage()'d.
1002 * We *could* just steal the page from commit, with some
1003 * fancy locking there. Instead, we just skip it -
1004 * don't tie the page's buffers to the new transaction
1005 * at all.
1006 * Implication: if we crash before the writepage() data
1007 * is written into the filesystem, recovery will replay
1008 * the write() data.
1009 */
1010 if (jh->b_jlist != BJ_None &&
1011 jh->b_jlist != BJ_SyncData &&
1012 jh->b_jlist != BJ_Locked) {
1013 JBUFFER_TRACE(jh, "Not stealing");
1014 goto no_journal;
1015 }
1016
1017 /*
1018 * This buffer may be undergoing writeout in commit. We
1019 * can't return from here and let the caller dirty it
1020 * again because that can cause the write-out loop in
1021 * commit to never terminate.
1022 */
1023 if (buffer_dirty(bh)) {
1024 get_bh(bh);
1025 spin_unlock(&journal->j_list_lock);
1026 jbd_unlock_bh_state(bh);
1027 need_brelse = 1;
1028 sync_dirty_buffer(bh);
1029 jbd_lock_bh_state(bh);
1030 spin_lock(&journal->j_list_lock);
1031 /* The buffer may become locked again at any
1032 time if it is redirtied */
1033 }
1034
1035 /* journal_clean_data_list() may have got there first */
1036 if (jh->b_transaction != NULL) {
1037 JBUFFER_TRACE(jh, "unfile from commit");
1038 __jbd2_journal_temp_unlink_buffer(jh);
1039 /* It still points to the committing
1040 * transaction; move it to this one so
1041 * that the refile assert checks are
1042 * happy. */
1043 jh->b_transaction = handle->h_transaction;
1044 }
1045 /* The buffer will be refiled below */
1046
1047 }
1048 /*
1049 * Special case --- the buffer might actually have been
1050 * allocated and then immediately deallocated in the previous,
1051 * committing transaction, so might still be left on that
1052 * transaction's metadata lists.
1053 */
1054 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1055 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1056 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1057 __jbd2_journal_temp_unlink_buffer(jh);
1058 jh->b_transaction = handle->h_transaction;
1059 JBUFFER_TRACE(jh, "file as data");
1060 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1061 BJ_SyncData);
1062 }
1063 } else {
1064 JBUFFER_TRACE(jh, "not on a transaction");
1065 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1066 }
1067no_journal:
1068 spin_unlock(&journal->j_list_lock);
1069 jbd_unlock_bh_state(bh);
1070 if (need_brelse) {
1071 BUFFER_TRACE(bh, "brelse");
1072 __brelse(bh);
1073 }
1074 JBUFFER_TRACE(jh, "exit");
1075 jbd2_journal_put_journal_head(jh);
1076 return 0;
1077}
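/*
 * Illustrative sketch, not part of the original source: a typical
 * ordered-mode caller walks a page's buffers and marks each one
 * dirty against the running handle.  The function name here is
 * hypothetical; ext3's writepage path performs the equivalent job.
 */
static int example_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	/* Buffers never mapped, or already freed, need no journaling */
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return jbd2_journal_dirty_data(handle, bh);
}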
1078
1079/**
1080 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark
1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction.
1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction.
1089 *
1090 * Returns error number or 0 on success.
1091 *
1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen
1094 * data present for that commit). In that case, we don't relink the
1095 * buffer: that only gets done when the old transaction finally
1096 * completes its commit.
1097 */
1098int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1099{
1100 transaction_t *transaction = handle->h_transaction;
1101 journal_t *journal = transaction->t_journal;
1102 struct journal_head *jh = bh2jh(bh);
1103
1104 jbd_debug(5, "journal_head %p\n", jh);
1105 JBUFFER_TRACE(jh, "entry");
1106 if (is_handle_aborted(handle))
1107 goto out;
1108
1109 jbd_lock_bh_state(bh);
1110
1111 if (jh->b_modified == 0) {
1112 /*
1113	 * This buffer has been modified and is becoming part
1114	 * of the transaction. The credit accounting below only
1115	 * needs to be done once per transaction. -bzzz
1116 */
1117 jh->b_modified = 1;
1118 J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1119 handle->h_buffer_credits--;
1120 }
1121
1122 /*
1123 * fastpath, to avoid expensive locking. If this buffer is already
1124 * on the running transaction's metadata list there is nothing to do.
1125 * Nobody can take it off again because there is a handle open.
1126 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1127 * result in this test being false, so we go in and take the locks.
1128 */
1129 if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1130 JBUFFER_TRACE(jh, "fastpath");
1131 J_ASSERT_JH(jh, jh->b_transaction ==
1132 journal->j_running_transaction);
1133 goto out_unlock_bh;
1134 }
1135
1136 set_buffer_jbddirty(bh);
1137
1138 /*
1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now.
1143 */
1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction");
1146 J_ASSERT_JH(jh, jh->b_transaction ==
1147 journal->j_committing_transaction);
1148 J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1149 /* And this case is illegal: we can't reuse another
1150 * transaction's data buffer, ever. */
1151 goto out_unlock_bh;
1152 }
1153
1154 /* That test should have eliminated the following case: */
1155 J_ASSERT_JH(jh, jh->b_frozen_data == 0);
1156
1157 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1158 spin_lock(&journal->j_list_lock);
1159 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1160 spin_unlock(&journal->j_list_lock);
1161out_unlock_bh:
1162 jbd_unlock_bh_state(bh);
1163out:
1164 JBUFFER_TRACE(jh, "exit");
1165 return 0;
1166}
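/*
 * Illustrative sketch, not part of the original source: the minimal
 * metadata-update sequence around jbd2_journal_dirty_metadata().
 * Assumes the caller already holds a reference on bh; the function
 * name is hypothetical.
 */
static int example_update_metadata(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	handle = jbd2_journal_start(journal, 1);	/* reserve one buffer credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data while holding write access ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}
	jbd2_journal_stop(handle);
	return err;
}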
1167
1168/*
1169 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1170 * updates, if the update decided in the end that it didn't need access.
1171 *
1172 */
1173void
1174jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1175{
1176 BUFFER_TRACE(bh, "entry");
1177}
1178
1179/**
1180 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1181 * @handle: transaction handle
1182 * @bh: bh to 'forget'
1183 *
1184 * We can only do the bforget if there are no commits pending against the
1185 * buffer. If the buffer is dirty in the current running transaction we
1186 * can safely unlink it.
1187 *
1188 * bh may not be a journalled buffer at all - it may be a non-JBD
1189 * buffer which came off the hashtable. Check for this.
1190 *
1191 * Decrements bh->b_count by one.
1192 *
1193 * Allow this call even if the handle has aborted --- it may be part of
1194 * the caller's cleanup after an abort.
1195 */
1196int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1197{
1198 transaction_t *transaction = handle->h_transaction;
1199 journal_t *journal = transaction->t_journal;
1200 struct journal_head *jh;
1201 int drop_reserve = 0;
1202 int err = 0;
1203
1204 BUFFER_TRACE(bh, "entry");
1205
1206 jbd_lock_bh_state(bh);
1207 spin_lock(&journal->j_list_lock);
1208
1209 if (!buffer_jbd(bh))
1210 goto not_jbd;
1211 jh = bh2jh(bh);
1212
1213 /* Critical error: attempting to delete a bitmap buffer, maybe?
1214 * Don't do any jbd operations, and return an error. */
1215 if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1216 "inconsistent data on disk")) {
1217 err = -EIO;
1218 goto not_jbd;
1219 }
1220
1221 /*
1222 * The buffer's going from the transaction, we must drop
1223 * all references -bzzz
1224 */
1225 jh->b_modified = 0;
1226
1227 if (jh->b_transaction == handle->h_transaction) {
1228 J_ASSERT_JH(jh, !jh->b_frozen_data);
1229
1230 /* If we are forgetting a buffer which is already part
1231 * of this transaction, then we can just drop it from
1232 * the transaction immediately. */
1233 clear_buffer_dirty(bh);
1234 clear_buffer_jbddirty(bh);
1235
1236 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1237
1238 drop_reserve = 1;
1239
1240 /*
1241 * We are no longer going to journal this buffer.
1242 * However, the commit of this transaction is still
1243 * important to the buffer: the delete that we are now
1244 * processing might obsolete an old log entry, so by
1245 * committing, we can satisfy the buffer's checkpoint.
1246 *
1247 * So, if we have a checkpoint on the buffer, we should
1248 * now refile the buffer on our BJ_Forget list so that
1249 * we know to remove the checkpoint after we commit.
1250 */
1251
1252 if (jh->b_cp_transaction) {
1253 __jbd2_journal_temp_unlink_buffer(jh);
1254 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1255 } else {
1256 __jbd2_journal_unfile_buffer(jh);
1257 jbd2_journal_remove_journal_head(bh);
1258 __brelse(bh);
1259 if (!buffer_jbd(bh)) {
1260 spin_unlock(&journal->j_list_lock);
1261 jbd_unlock_bh_state(bh);
1262 __bforget(bh);
1263 goto drop;
1264 }
1265 }
1266 } else if (jh->b_transaction) {
1267 J_ASSERT_JH(jh, (jh->b_transaction ==
1268 journal->j_committing_transaction));
1269 /* However, if the buffer is still owned by a prior
1270 * (committing) transaction, we can't drop it yet... */
1271 JBUFFER_TRACE(jh, "belongs to older transaction");
1272 /* ... but we CAN drop it from the new transaction if we
1273 * have also modified it since the original commit. */
1274
1275 if (jh->b_next_transaction) {
1276 J_ASSERT(jh->b_next_transaction == transaction);
1277 jh->b_next_transaction = NULL;
1278 drop_reserve = 1;
1279 }
1280 }
1281
1282not_jbd:
1283 spin_unlock(&journal->j_list_lock);
1284 jbd_unlock_bh_state(bh);
1285 __brelse(bh);
1286drop:
1287 if (drop_reserve) {
1288 /* no need to reserve log space for this block -bzzz */
1289 handle->h_buffer_credits++;
1290 }
1291 return err;
1292}
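/*
 * Illustrative note, not part of the original source: a filesystem
 * calls jbd2_journal_forget() in place of bforget() when it
 * deallocates a journaled block, cancelling any pending journaling
 * of the stale contents.  As documented above, the call consumes
 * the caller's reference on bh.
 */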
1293
1294/**
1295 * int jbd2_journal_stop() - complete a transaction
1296	 * @handle: transaction to complete.
1297 *
1298 * All done for a particular handle.
1299 *
1300 * There is not much action needed here. We just return any remaining
1301 * buffer credits to the transaction and remove the handle. The only
1302 * complication is that we need to start a commit operation if the
1303 * filesystem is marked for synchronous update.
1304 *
1305 * jbd2_journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a jbd2_journal_abort has been executed since the
1308 * transaction began.
1309 */
1310int jbd2_journal_stop(handle_t *handle)
1311{
1312 transaction_t *transaction = handle->h_transaction;
1313 journal_t *journal = transaction->t_journal;
1314 int old_handle_count, err;
1315 pid_t pid;
1316
1317 J_ASSERT(journal_current_handle() == handle);
1318
1319 if (is_handle_aborted(handle))
1320 err = -EIO;
1321 else {
1322 J_ASSERT(transaction->t_updates > 0);
1323 err = 0;
1324 }
1325
1326 if (--handle->h_ref > 0) {
1327 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1328 handle->h_ref);
1329 return err;
1330 }
1331
1332 jbd_debug(4, "Handle %p going down\n", handle);
1333
1334 /*
1335 * Implement synchronous transaction batching. If the handle
1336 * was synchronous, don't force a commit immediately. Let's
1337 * yield and let another thread piggyback onto this transaction.
1338 * Keep doing that while new threads continue to arrive.
1339 * It doesn't cost much - we're about to run a commit and sleep
1340 * on IO anyway. Speeds up many-threaded, many-dir operations
1341 * by 30x or more...
1342 *
1343 * But don't do this if this process was the most recent one to
1344 * perform a synchronous write. We do this to detect the case where a
1345 * single process is doing a stream of sync writes. No point in waiting
1346 * for joiners in that case.
1347 */
1348 pid = current->pid;
1349 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1350 journal->j_last_sync_writer = pid;
1351 do {
1352 old_handle_count = transaction->t_handle_count;
1353 schedule_timeout_uninterruptible(1);
1354 } while (old_handle_count != transaction->t_handle_count);
1355 }
1356
1357 current->journal_info = NULL;
1358 spin_lock(&journal->j_state_lock);
1359 spin_lock(&transaction->t_handle_lock);
1360 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1361 transaction->t_updates--;
1362 if (!transaction->t_updates) {
1363 wake_up(&journal->j_wait_updates);
1364 if (journal->j_barrier_count)
1365 wake_up(&journal->j_wait_transaction_locked);
1366 }
1367
1368 /*
1369 * If the handle is marked SYNC, we need to set another commit
1370 * going! We also want to force a commit if the current
1371 * transaction is occupying too much of the log, or if the
1372 * transaction is too old now.
1373 */
1374 if (handle->h_sync ||
1375 transaction->t_outstanding_credits >
1376 journal->j_max_transaction_buffers ||
1377 time_after_eq(jiffies, transaction->t_expires)) {
1378 /* Do this even for aborted journals: an abort still
1379 * completes the commit thread, it just doesn't write
1380 * anything to disk. */
1381 tid_t tid = transaction->t_tid;
1382
1383 spin_unlock(&transaction->t_handle_lock);
1384 jbd_debug(2, "transaction too old, requesting commit for "
1385 "handle %p\n", handle);
1386 /* This is non-blocking */
1387 __jbd2_log_start_commit(journal, transaction->t_tid);
1388 spin_unlock(&journal->j_state_lock);
1389
1390 /*
1391 * Special case: JBD2_SYNC synchronous updates require us
1392 * to wait for the commit to complete.
1393 */
1394 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1395 err = jbd2_log_wait_commit(journal, tid);
1396 } else {
1397 spin_unlock(&transaction->t_handle_lock);
1398 spin_unlock(&journal->j_state_lock);
1399 }
1400
1401 jbd_free_handle(handle);
1402 return err;
1403}
1404
1405/**
 * int jbd2_journal_force_commit() - force any uncommitted transactions
1406 * @journal: journal to force
1407 *
1408 * For synchronous operations: force any uncommitted transactions
1409 * to disk. May seem kludgy, but it reuses all the handle batching
1410 * code in a very simple manner.
1411 */
1412int jbd2_journal_force_commit(journal_t *journal)
1413{
1414 handle_t *handle;
1415 int ret;
1416
1417 handle = jbd2_journal_start(journal, 1);
1418 if (IS_ERR(handle)) {
1419 ret = PTR_ERR(handle);
1420 } else {
1421 handle->h_sync = 1;
1422 ret = jbd2_journal_stop(handle);
1423 }
1424 return ret;
1425}
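/*
 * Illustrative note, not part of the original source: a sync(2)-style
 * path can flush all pending updates with a single call, assuming the
 * journal pointer is fetched from the filesystem's private superblock
 * info:
 *
 *	err = jbd2_journal_force_commit(sbi->s_journal);
 */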
1426
1427/*
1428 *
1429 * List management code snippets: various functions for manipulating the
1430 * transaction buffer lists.
1431 *
1432 */
1433
1434/*
1435 * Append a buffer to a transaction list, given the transaction's list head
1436 * pointer.
1437 *
1438 * j_list_lock is held.
1439 *
1440 * jbd_lock_bh_state(jh2bh(jh)) is held.
1441 */
1442
1443static inline void
1444__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1445{
1446 if (!*list) {
1447 jh->b_tnext = jh->b_tprev = jh;
1448 *list = jh;
1449 } else {
1450 /* Insert at the tail of the list to preserve order */
1451 struct journal_head *first = *list, *last = first->b_tprev;
1452 jh->b_tprev = last;
1453 jh->b_tnext = first;
1454 last->b_tnext = first->b_tprev = jh;
1455 }
1456}
1457
1458/*
1459 * Remove a buffer from a transaction list, given the transaction's list
1460 * head pointer.
1461 *
1462 * Called with j_list_lock held, and the journal may not be locked.
1463 *
1464 * jbd_lock_bh_state(jh2bh(jh)) is held.
1465 */
1466
1467static inline void
1468__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1469{
1470 if (*list == jh) {
1471 *list = jh->b_tnext;
1472 if (*list == jh)
1473 *list = NULL;
1474 }
1475 jh->b_tprev->b_tnext = jh->b_tnext;
1476 jh->b_tnext->b_tprev = jh->b_tprev;
1477}
1478
1479/*
1480 * Remove a buffer from the appropriate transaction list.
1481 *
1482 * Note that this function can *change* the value of
1483 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
1484 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
1485	 * is holding onto a copy of one of these pointers, it could go bad.
1486 * Generally the caller needs to re-read the pointer from the transaction_t.
1487 *
1488 * Called under j_list_lock. The journal may not be locked.
1489 */
1490void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1491{
1492 struct journal_head **list = NULL;
1493 transaction_t *transaction;
1494 struct buffer_head *bh = jh2bh(jh);
1495
1496 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1497 transaction = jh->b_transaction;
1498 if (transaction)
1499 assert_spin_locked(&transaction->t_journal->j_list_lock);
1500
1501 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1502 if (jh->b_jlist != BJ_None)
1503 J_ASSERT_JH(jh, transaction != 0);
1504
1505 switch (jh->b_jlist) {
1506 case BJ_None:
1507 return;
1508 case BJ_SyncData:
1509 list = &transaction->t_sync_datalist;
1510 break;
1511 case BJ_Metadata:
1512 transaction->t_nr_buffers--;
1513 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1514 list = &transaction->t_buffers;
1515 break;
1516 case BJ_Forget:
1517 list = &transaction->t_forget;
1518 break;
1519 case BJ_IO:
1520 list = &transaction->t_iobuf_list;
1521 break;
1522 case BJ_Shadow:
1523 list = &transaction->t_shadow_list;
1524 break;
1525 case BJ_LogCtl:
1526 list = &transaction->t_log_list;
1527 break;
1528 case BJ_Reserved:
1529 list = &transaction->t_reserved_list;
1530 break;
1531 case BJ_Locked:
1532 list = &transaction->t_locked_list;
1533 break;
1534 }
1535
1536 __blist_del_buffer(list, jh);
1537 jh->b_jlist = BJ_None;
1538 if (test_clear_buffer_jbddirty(bh))
1539 mark_buffer_dirty(bh); /* Expose it to the VM */
1540}
1541
1542void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1543{
1544 __jbd2_journal_temp_unlink_buffer(jh);
1545 jh->b_transaction = NULL;
1546}
1547
1548void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1549{
1550 jbd_lock_bh_state(jh2bh(jh));
1551 spin_lock(&journal->j_list_lock);
1552 __jbd2_journal_unfile_buffer(jh);
1553 spin_unlock(&journal->j_list_lock);
1554 jbd_unlock_bh_state(jh2bh(jh));
1555}
1556
1557/*
1558 * Called from jbd2_journal_try_to_free_buffers().
1559 *
1560 * Called under jbd_lock_bh_state(bh)
1561 */
1562static void
1563__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1564{
1565 struct journal_head *jh;
1566
1567 jh = bh2jh(bh);
1568
1569 if (buffer_locked(bh) || buffer_dirty(bh))
1570 goto out;
1571
1572 if (jh->b_next_transaction != 0)
1573 goto out;
1574
1575 spin_lock(&journal->j_list_lock);
1576 if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
1577 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1578 /* A written-back ordered data buffer */
1579 JBUFFER_TRACE(jh, "release data");
1580 __jbd2_journal_unfile_buffer(jh);
1581 jbd2_journal_remove_journal_head(bh);
1582 __brelse(bh);
1583 }
1584 } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
1585 /* written-back checkpointed metadata buffer */
1586 if (jh->b_jlist == BJ_None) {
1587 JBUFFER_TRACE(jh, "remove from checkpoint list");
1588 __jbd2_journal_remove_checkpoint(jh);
1589 jbd2_journal_remove_journal_head(bh);
1590 __brelse(bh);
1591 }
1592 }
1593 spin_unlock(&journal->j_list_lock);
1594out:
1595 return;
1596}
1597
1598
1599/**
1600 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1601 * @journal: journal for operation
1602	 * @page: the page to try to free
1603 * @unused_gfp_mask: unused
1604 *
1605 *
1606 * For all the buffers on this page,
1607 * if they are fully written out ordered data, move them onto BUF_CLEAN
1608 * so try_to_free_buffers() can reap them.
1609 *
1610 * This function returns non-zero if we wish try_to_free_buffers()
1611 * to be called. We do this if the page is releasable by try_to_free_buffers().
1612 * We also do it if the page has locked or dirty buffers and the caller wants
1613 * us to perform sync or async writeout.
1614 *
1615 * This complicates JBD locking somewhat. We aren't protected by the
1616 * BKL here. We wish to remove the buffer from its committing or
1617 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1618 *
1619 * This may *change* the value of transaction_t->t_datalist, so anyone
1620 * who looks at t_datalist needs to lock against this function.
1621 *
1622 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
1623 * buffer. So we need to lock against that. jbd2_journal_dirty_data()
1624 * will come out of the lock with the buffer dirty, which makes it
1625 * ineligible for release here.
1626 *
1627 * Who else is affected by this? hmm... Really the only contender
1628 * is do_get_write_access() - it could be looking at the buffer while
1629 * journal_try_to_free_buffer() is changing its state. But that
1630 * cannot happen because we never reallocate freed data as metadata
1631 * while the data is part of a transaction. Yes?
1632 */
1633int jbd2_journal_try_to_free_buffers(journal_t *journal,
1634 struct page *page, gfp_t unused_gfp_mask)
1635{
1636 struct buffer_head *head;
1637 struct buffer_head *bh;
1638 int ret = 0;
1639
1640 J_ASSERT(PageLocked(page));
1641
1642 head = page_buffers(page);
1643 bh = head;
1644 do {
1645 struct journal_head *jh;
1646
1647 /*
1648 * We take our own ref against the journal_head here to avoid
1649 * having to add tons of locking around each instance of
1650 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
1651 */
1652 jh = jbd2_journal_grab_journal_head(bh);
1653 if (!jh)
1654 continue;
1655
1656 jbd_lock_bh_state(bh);
1657 __journal_try_to_free_buffer(journal, bh);
1658 jbd2_journal_put_journal_head(jh);
1659 jbd_unlock_bh_state(bh);
1660 if (buffer_jbd(bh))
1661 goto busy;
1662 } while ((bh = bh->b_this_page) != head);
1663 ret = try_to_free_buffers(page);
1664busy:
1665 return ret;
1666}
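/*
 * Illustrative note, not part of the original source: this function is
 * the natural backend for a filesystem's ->releasepage address_space
 * operation, along the lines of (helper name hypothetical):
 *
 *	static int example_releasepage(struct page *page, gfp_t wait)
 *	{
 *		return jbd2_journal_try_to_free_buffers(
 *				example_page_journal(page), page, wait);
 *	}
 */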
1667
1668/*
1669 * This buffer is no longer needed. If it is on an older transaction's
1670 * checkpoint list we need to record it on this transaction's forget list
1671 * to pin this buffer (and hence its checkpointing transaction) down until
1672 * this transaction commits. If the buffer isn't on a checkpoint list, we
1673 * release it.
1674 * Returns non-zero if JBD no longer has an interest in the buffer.
1675 *
1676 * Called under j_list_lock.
1677 *
1678 * Called under jbd_lock_bh_state(bh).
1679 */
1680static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1681{
1682 int may_free = 1;
1683 struct buffer_head *bh = jh2bh(jh);
1684
1685 __jbd2_journal_unfile_buffer(jh);
1686
1687 if (jh->b_cp_transaction) {
1688 JBUFFER_TRACE(jh, "on running+cp transaction");
1689 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1690 clear_buffer_jbddirty(bh);
1691 may_free = 0;
1692 } else {
1693 JBUFFER_TRACE(jh, "on running transaction");
1694 jbd2_journal_remove_journal_head(bh);
1695 __brelse(bh);
1696 }
1697 return may_free;
1698}
1699
1700/*
1701 * jbd2_journal_invalidatepage
1702 *
1703 * This code is tricky. It has a number of cases to deal with.
1704 *
1705 * There are two invariants which this code relies on:
1706 *
1707 * i_size must be updated on disk before we start calling invalidatepage on the
1708 * data.
1709 *
1710 * This is done in ext3 by defining an ext3_setattr method which
1711 * updates i_size before truncate gets going. By maintaining this
1712 * invariant, we can be sure that it is safe to throw away any buffers
1713 * attached to the current transaction: once the transaction commits,
1714 * we know that the data will not be needed.
1715 *
1716 * Note however that we can *not* throw away data belonging to the
1717 * previous, committing transaction!
1718 *
1719 * Any disk blocks which *are* part of the previous, committing
1720 * transaction (and which therefore cannot be discarded immediately) are
1721 * not going to be reused in the new running transaction
1722 *
1723 * The bitmap committed_data images guarantee this: any block which is
1724 * allocated in one transaction and removed in the next will be marked
1725 * as in-use in the committed_data bitmap, so cannot be reused until
1726 * the next transaction to delete the block commits. This means that
1727 * leaving committing buffers dirty is quite safe: the disk blocks
1728 * cannot be reallocated to a different file and so buffer aliasing is
1729 * not possible.
1730 *
1731 *
1732 * The above applies mainly to ordered data mode. In writeback mode we
1733 * don't make guarantees about the order in which data hits disk --- in
1734 * particular we don't guarantee that new dirty data is flushed before
1735 * transaction commit --- so it is always safe just to discard data
1736 * immediately in that mode. --sct
1737 */
1738
1739/*
1740 * The journal_unmap_buffer helper function returns zero if the buffer
1741 * concerned remains pinned as an anonymous buffer belonging to an older
1742 * transaction.
1743 *
1744 * We're outside-transaction here. Either or both of j_running_transaction
1745 * and j_committing_transaction may be NULL.
1746 */
1747static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1748{
1749 transaction_t *transaction;
1750 struct journal_head *jh;
1751 int may_free = 1;
1752 int ret;
1753
1754 BUFFER_TRACE(bh, "entry");
1755
1756 /*
1757 * It is safe to proceed here without the j_list_lock because the
1758 * buffers cannot be stolen by try_to_free_buffers as long as we are
1759 * holding the page lock. --sct
1760 */
1761
1762 if (!buffer_jbd(bh))
1763 goto zap_buffer_unlocked;
1764
1765 spin_lock(&journal->j_state_lock);
1766 jbd_lock_bh_state(bh);
1767 spin_lock(&journal->j_list_lock);
1768
1769 jh = jbd2_journal_grab_journal_head(bh);
1770 if (!jh)
1771 goto zap_buffer_no_jh;
1772
1773 transaction = jh->b_transaction;
1774 if (transaction == NULL) {
1775 /* First case: not on any transaction. If it
1776 * has no checkpoint link, then we can zap it:
1777 * it's a writeback-mode buffer so we don't care
1778 * if it hits disk safely. */
1779 if (!jh->b_cp_transaction) {
1780 JBUFFER_TRACE(jh, "not on any transaction: zap");
1781 goto zap_buffer;
1782 }
1783
1784 if (!buffer_dirty(bh)) {
1785 /* bdflush has written it. We can drop it now */
1786 goto zap_buffer;
1787 }
1788
1789 /* OK, it must be in the journal but still not
1790 * written fully to disk: it's metadata or
1791 * journaled data... */
1792
1793 if (journal->j_running_transaction) {
1794 /* ... and once the current transaction has
1795 * committed, the buffer won't be needed any
1796 * longer. */
1797 JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
1798 ret = __dispose_buffer(jh,
1799 journal->j_running_transaction);
1800 jbd2_journal_put_journal_head(jh);
1801 spin_unlock(&journal->j_list_lock);
1802 jbd_unlock_bh_state(bh);
1803 spin_unlock(&journal->j_state_lock);
1804 return ret;
1805 } else {
1806 /* There is no currently-running transaction. So the
1807 * orphan record which we wrote for this file must have
1808 * passed into commit. We must attach this buffer to
1809 * the committing transaction, if it exists. */
1810 if (journal->j_committing_transaction) {
1811 JBUFFER_TRACE(jh, "give to committing trans");
1812 ret = __dispose_buffer(jh,
1813 journal->j_committing_transaction);
1814 jbd2_journal_put_journal_head(jh);
1815 spin_unlock(&journal->j_list_lock);
1816 jbd_unlock_bh_state(bh);
1817 spin_unlock(&journal->j_state_lock);
1818 return ret;
1819 } else {
1820 /* The orphan record's transaction has
1821 * committed. We can cleanse this buffer */
1822 clear_buffer_jbddirty(bh);
1823 goto zap_buffer;
1824 }
1825 }
1826 } else if (transaction == journal->j_committing_transaction) {
1827 if (jh->b_jlist == BJ_Locked) {
1828 /*
1829 * The buffer is on the committing transaction's locked
1830 * list. We have the buffer locked, so I/O has
1831 * completed. So we can nail the buffer now.
1832 */
1833 may_free = __dispose_buffer(jh, transaction);
1834 goto zap_buffer;
1835 }
1836 /*
1837 * If it is committing, we simply cannot touch it. We
1838	 * can remove its next_transaction pointer from the
1839 * running transaction if that is set, but nothing
1840 * else. */
1841 JBUFFER_TRACE(jh, "on committing transaction");
1842 set_buffer_freed(bh);
1843 if (jh->b_next_transaction) {
1844 J_ASSERT(jh->b_next_transaction ==
1845 journal->j_running_transaction);
1846 jh->b_next_transaction = NULL;
1847 }
1848 jbd2_journal_put_journal_head(jh);
1849 spin_unlock(&journal->j_list_lock);
1850 jbd_unlock_bh_state(bh);
1851 spin_unlock(&journal->j_state_lock);
1852 return 0;
1853 } else {
1854 /* Good, the buffer belongs to the running transaction.
1855 * We are writing our own transaction's data, not any
1856 * previous one's, so it is safe to throw it away
1857 * (remember that we expect the filesystem to have set
1858 * i_size already for this truncate so recovery will not
1859 * expose the disk blocks we are discarding here.) */
1860 J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
1861 may_free = __dispose_buffer(jh, transaction);
1862 }
1863
1864zap_buffer:
1865 jbd2_journal_put_journal_head(jh);
1866zap_buffer_no_jh:
1867 spin_unlock(&journal->j_list_lock);
1868 jbd_unlock_bh_state(bh);
1869 spin_unlock(&journal->j_state_lock);
1870zap_buffer_unlocked:
1871 clear_buffer_dirty(bh);
1872 J_ASSERT_BH(bh, !buffer_jbddirty(bh));
1873 clear_buffer_mapped(bh);
1874 clear_buffer_req(bh);
1875 clear_buffer_new(bh);
1876 bh->b_bdev = NULL;
1877 return may_free;
1878}
1879
1880/**
1881 * void jbd2_journal_invalidatepage() - reap journaled buffers after a given offset
1882 * @journal: journal to use for flush...
1883 * @page: page to flush
1884 * @offset: length of page to invalidate.
1885 *
1886 * Reap page buffers containing data after offset in page.
1887 *
1888 */
1889void jbd2_journal_invalidatepage(journal_t *journal,
1890 struct page *page,
1891 unsigned long offset)
1892{
1893 struct buffer_head *head, *bh, *next;
1894 unsigned int curr_off = 0;
1895 int may_free = 1;
1896
1897 if (!PageLocked(page))
1898 BUG();
1899 if (!page_has_buffers(page))
1900 return;
1901
1902 /* We will potentially be playing with lists other than just the
1903 * data lists (especially for journaled data mode), so be
1904 * cautious in our locking. */
1905
1906 head = bh = page_buffers(page);
1907 do {
1908 unsigned int next_off = curr_off + bh->b_size;
1909 next = bh->b_this_page;
1910
1911 if (offset <= curr_off) {
1912 /* This block is wholly outside the truncation point */
1913 lock_buffer(bh);
1914 may_free &= journal_unmap_buffer(journal, bh);
1915 unlock_buffer(bh);
1916 }
1917 curr_off = next_off;
1918 bh = next;
1919
1920 } while (bh != head);
1921
1922 if (!offset) {
1923 if (may_free && try_to_free_buffers(page))
1924 J_ASSERT(!page_has_buffers(page));
1925 }
1926}
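/*
 * Illustrative note, not part of the original source: a filesystem
 * reaches this through its ->invalidatepage address_space operation,
 * along the lines of (helper name hypothetical):
 *
 *	static void example_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		jbd2_journal_invalidatepage(example_page_journal(page), page, offset);
 *	}
 */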
1927
1928/*
1929 * File a buffer on the given transaction list.
1930 */
1931void __jbd2_journal_file_buffer(struct journal_head *jh,
1932 transaction_t *transaction, int jlist)
1933{
1934 struct journal_head **list = NULL;
1935 int was_dirty = 0;
1936 struct buffer_head *bh = jh2bh(jh);
1937
1938 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1939 assert_spin_locked(&transaction->t_journal->j_list_lock);
1940
1941 J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1942 J_ASSERT_JH(jh, jh->b_transaction == transaction ||
1943 jh->b_transaction == 0);
1944
1945 if (jh->b_transaction && jh->b_jlist == jlist)
1946 return;
1947
1948 /* The following list of buffer states needs to be consistent
1949 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1950 * state. */
1951
1952 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1953 jlist == BJ_Shadow || jlist == BJ_Forget) {
1954 if (test_clear_buffer_dirty(bh) ||
1955 test_clear_buffer_jbddirty(bh))
1956 was_dirty = 1;
1957 }
1958
1959 if (jh->b_transaction)
1960 __jbd2_journal_temp_unlink_buffer(jh);
1961 jh->b_transaction = transaction;
1962
1963 switch (jlist) {
1964 case BJ_None:
1965 J_ASSERT_JH(jh, !jh->b_committed_data);
1966 J_ASSERT_JH(jh, !jh->b_frozen_data);
1967 return;
1968 case BJ_SyncData:
1969 list = &transaction->t_sync_datalist;
1970 break;
1971 case BJ_Metadata:
1972 transaction->t_nr_buffers++;
1973 list = &transaction->t_buffers;
1974 break;
1975 case BJ_Forget:
1976 list = &transaction->t_forget;
1977 break;
1978 case BJ_IO:
1979 list = &transaction->t_iobuf_list;
1980 break;
1981 case BJ_Shadow:
1982 list = &transaction->t_shadow_list;
1983 break;
1984 case BJ_LogCtl:
1985 list = &transaction->t_log_list;
1986 break;
1987 case BJ_Reserved:
1988 list = &transaction->t_reserved_list;
1989 break;
1990 case BJ_Locked:
1991 list = &transaction->t_locked_list;
1992 break;
1993 }
1994
1995 __blist_add_buffer(list, jh);
1996 jh->b_jlist = jlist;
1997
1998 if (was_dirty)
1999 set_buffer_jbddirty(bh);
2000}
2001
2002void jbd2_journal_file_buffer(struct journal_head *jh,
2003 transaction_t *transaction, int jlist)
2004{
2005 jbd_lock_bh_state(jh2bh(jh));
2006 spin_lock(&transaction->t_journal->j_list_lock);
2007 __jbd2_journal_file_buffer(jh, transaction, jlist);
2008 spin_unlock(&transaction->t_journal->j_list_lock);
2009 jbd_unlock_bh_state(jh2bh(jh));
2010}
2011
2012/*
2013 * Remove a buffer from its current buffer list in preparation for
2014 * dropping it from its current transaction entirely. If the buffer has
2015 * already started to be used by a subsequent transaction, refile the
2016 * buffer on that transaction's metadata list.
2017 *
2018 * Called under journal->j_list_lock
2019 *
2020 * Called under jbd_lock_bh_state(jh2bh(jh))
2021 */
2022void __jbd2_journal_refile_buffer(struct journal_head *jh)
2023{
2024 int was_dirty;
2025 struct buffer_head *bh = jh2bh(jh);
2026
2027 J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
2028 if (jh->b_transaction)
2029 assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
2030
2031 /* If the buffer is now unused, just drop it. */
2032 if (jh->b_next_transaction == NULL) {
2033 __jbd2_journal_unfile_buffer(jh);
2034 return;
2035 }
2036
2037 /*
2038 * It has been modified by a later transaction: add it to the new
2039 * transaction's metadata list.
2040 */
2041
2042 was_dirty = test_clear_buffer_jbddirty(bh);
2043 __jbd2_journal_temp_unlink_buffer(jh);
2044 jh->b_transaction = jh->b_next_transaction;
2045 jh->b_next_transaction = NULL;
2046 __jbd2_journal_file_buffer(jh, jh->b_transaction,
2047 was_dirty ? BJ_Metadata : BJ_Reserved);
2048 J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
2049
2050 if (was_dirty)
2051 set_buffer_jbddirty(bh);
2052}
2053
2054/*
2055 * For the unlocked version of this call, also make sure that any
2056 * hanging journal_head is cleaned up if necessary.
2057 *
2058 * __jbd2_journal_refile_buffer is usually called as part of a single locked
2059 * operation on a buffer_head, in which the caller is probably going to
2060 * be hooking the journal_head onto other lists. In that case it is up
2061 * to the caller to remove the journal_head if necessary. For the
2062 * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
2063 * doing anything else to the buffer so we need to do the cleanup
2064 * ourselves to avoid a jh leak.
2065 *
2066 * *** The journal_head may be freed by this call! ***
2067 */
2068void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2069{
2070 struct buffer_head *bh = jh2bh(jh);
2071
2072 jbd_lock_bh_state(bh);
2073 spin_lock(&journal->j_list_lock);
2074
2075 __jbd2_journal_refile_buffer(jh);
2076 jbd_unlock_bh_state(bh);
2077 jbd2_journal_remove_journal_head(bh);
2078
2079 spin_unlock(&journal->j_list_lock);
2080 __brelse(bh);
2081}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 6de374513c01..bc4b8106a490 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -334,10 +334,10 @@ static int __init init_jffs2_fs(void)
 	   which means just 'no padding', without the alignment
 	   thing. But GCC doesn't have that -- we have to just
 	   hope the structs are the right sizes, instead. */
-	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
-	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
-	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
-	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+	BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
 
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 87e1d03e8267..b85a0ad2cfb6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -100,12 +100,12 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 {
 	const struct file_lock *fl = &lock->fl;
 	const struct nfs_fh *fh = &lock->fh;
 	struct nlm_wait	*block;
-	u32 res = nlm_lck_denied;
+	__be32 res = nlm_lck_denied;
 
 	/*
 	 * Look up blocked request based on arguments.
@@ -144,42 +144,12 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  */
 
 /*
- * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
- * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
- */
-static void nlmclnt_prepare_reclaim(struct nlm_host *host)
-{
-	down_write(&host->h_rwsem);
-	host->h_monitored = 0;
-	host->h_state++;
-	host->h_nextrebind = 0;
-	nlm_rebind_host(host);
-
-	/*
-	 * Mark the locks for reclaiming.
-	 */
-	list_splice_init(&host->h_granted, &host->h_reclaim);
-
-	dprintk("NLM: reclaiming locks for host %s\n", host->h_name);
-}
-
-static void nlmclnt_finish_reclaim(struct nlm_host *host)
-{
-	host->h_reclaiming = 0;
-	up_write(&host->h_rwsem);
-	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
-}
-
-/*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
  */
 void
-nlmclnt_recovery(struct nlm_host *host, u32 newstate)
+nlmclnt_recovery(struct nlm_host *host)
 {
-	if (host->h_nsmstate == newstate)
-		return;
-	host->h_nsmstate = newstate;
 	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
@@ -199,18 +169,30 @@ reclaimer(void *ptr)
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
 
+	down_write(&host->h_rwsem);
+
 	/* This one ensures that our parent doesn't terminate while the
 	 * reclaim is in progress */
 	lock_kernel();
 	lockd_up(0); /* note: this cannot fail as lockd is already running */
 
-	nlmclnt_prepare_reclaim(host);
-	/* First, reclaim all locks that have been marked. */
+	dprintk("lockd: reclaiming locks for host %s", host->h_name);
+
 restart:
 	nsmstate = host->h_nsmstate;
+
+	/* Force a portmap getport - the peer's lockd will
+	 * most likely end up on a different port.
+	 */
+	host->h_nextrebind = jiffies;
+	nlm_rebind_host(host);
+
+	/* First, reclaim all locks that have been granted. */
+	list_splice_init(&host->h_granted, &host->h_reclaim);
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
+		/* Why are we leaking memory here? --okir */
 		if (signalled())
 			continue;
 		if (nlmclnt_reclaim(host, fl) != 0)
@@ -218,11 +200,13 @@ restart:
 			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
 		if (host->h_nsmstate != nsmstate) {
 			/* Argh! The server rebooted again! */
-			list_splice_init(&host->h_granted, &host->h_reclaim);
 			goto restart;
 		}
 	}
-	nlmclnt_finish_reclaim(host);
+
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 0116729cec5f..3d84f600b633 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -36,14 +36,14 @@ static const struct rpc_call_ops nlmclnt_cancel_ops;
 /*
  * Cookie counter for NLM requests
  */
-static u32	nlm_cookie = 0x1234;
+static atomic_t	nlm_cookie = ATOMIC_INIT(0x1234);
 
-static inline void nlmclnt_next_cookie(struct nlm_cookie *c)
+void nlmclnt_next_cookie(struct nlm_cookie *c)
 {
-	memcpy(c->data, &nlm_cookie, 4);
-	memset(c->data+4, 0, 4);
+	u32	cookie = atomic_inc_return(&nlm_cookie);
+
+	memcpy(c->data, &cookie, 4);
 	c->len=4;
-	nlm_cookie++;
 }
 
 static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
@@ -153,6 +153,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 {
 	struct rpc_clnt	*client = NFS_CLIENT(inode);
 	struct sockaddr_in addr;
+	struct nfs_server *nfssrv = NFS_SERVER(inode);
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
 	sigset_t	oldset;
@@ -166,7 +167,9 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	}
 
 	rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers);
+	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
+				   nfssrv->nfs_client->cl_hostname,
+				   strlen(nfssrv->nfs_client->cl_hostname));
 	if (host == NULL)
 		return -ENOLCK;
 
@@ -499,7 +502,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	unsigned char fl_flags = fl->fl_flags;
 	int status = -ENOLCK;
 
-	if (!host->h_monitored && nsm_monitor(host) < 0) {
+	if (nsm_monitor(host) < 0) {
 		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
 			host->h_name);
 		goto out;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a0d0b58ce7a4..fb24a9730345 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -27,46 +27,60 @@
 #define NLM_HOST_EXPIRE		((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
 #define NLM_HOST_COLLECT	((nrhosts > NLM_HOST_MAX)? 120 * HZ :  60 * HZ)
 
-static struct nlm_host *	nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_hosts[NLM_HOST_NRHASH];
 static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 
 static void			nlm_gc_hosts(void);
+static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
+					const char *, int, int);
 
 /*
  * Find an NLM server handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmclnt_lookup_host(struct sockaddr_in *sin, int proto, int version)
+nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
+			const char *hostname, int hostname_len)
 {
-	return nlm_lookup_host(0, sin, proto, version);
+	return nlm_lookup_host(0, sin, proto, version,
+			       hostname, hostname_len);
 }
 
 /*
  * Find an NLM client handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp)
+nlmsvc_lookup_host(struct svc_rqst *rqstp,
+			const char *hostname, int hostname_len)
 {
 	return nlm_lookup_host(1, &rqstp->rq_addr,
-			       rqstp->rq_prot, rqstp->rq_vers);
+			       rqstp->rq_prot, rqstp->rq_vers,
+			       hostname, hostname_len);
 }
 
 /*
  * Common host lookup routine for server & client
  */
 struct nlm_host *
-nlm_lookup_host(int server, struct sockaddr_in *sin,
-					int proto, int version)
+nlm_lookup_host(int server, const struct sockaddr_in *sin,
+					int proto, int version,
+					const char *hostname,
+					int hostname_len)
 {
-	struct nlm_host	*host, **hp;
-	u32		addr;
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host	*host;
+	struct nsm_handle *nsm = NULL;
 	int		hash;
 
-	dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n",
-			(unsigned)(sin? ntohl(sin->sin_addr.s_addr) : 0), proto, version);
+	dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n",
+			NIPQUAD(sin->sin_addr.s_addr), proto, version,
+			server? "server" : "client",
+			hostname_len,
+			hostname? hostname : "<none>");
+
 
 	hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
 
@@ -76,7 +90,22 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (time_after_eq(jiffies, next_gc))
 		nlm_gc_hosts();
 
-	for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
+	/* We may keep several nlm_host objects for a peer, because each
+	 * nlm_host is identified by
+	 * (address, protocol, version, server/client)
+	 * We could probably simplify this a little by putting all those
+	 * different NLM rpc_clients into one single nlm_host object.
+	 * This would allow us to have one nlm_host per address.
+	 */
+	chain = &nlm_hosts[hash];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!nlm_cmp_addr(&host->h_addr, sin))
+			continue;
+
+		/* See if we have an NSM handle for this client */
+		if (!nsm)
+			nsm = host->h_nsmhandle;
+
 		if (host->h_proto != proto)
 			continue;
 		if (host->h_version != version)
@@ -84,28 +113,30 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 		if (host->h_server != server)
 			continue;
 
-		if (nlm_cmp_addr(&host->h_addr, sin)) {
-			if (hp != nlm_hosts + hash) {
-				*hp = host->h_next;
-				host->h_next = nlm_hosts[hash];
-				nlm_hosts[hash] = host;
-			}
-			nlm_get_host(host);
-			mutex_unlock(&nlm_host_mutex);
-			return host;
-		}
-	}
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
 
-	/* Ooops, no host found, create it */
-	dprintk("lockd: creating host entry\n");
+		nlm_get_host(host);
+		goto out;
+	}
+	if (nsm)
+		atomic_inc(&nsm->sm_count);
 
-	host = kzalloc(sizeof(*host), GFP_KERNEL);
-	if (!host)
-		goto nohost;
+	host = NULL;
 
-	addr = sin->sin_addr.s_addr;
-	sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr));
+	/* Sadly, the host isn't in our hash table yet. See if
+	 * we have an NSM handle for it. If not, create one.
+	 */
+	if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
+		goto out;
 
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host) {
+		nsm_release(nsm);
+		goto out;
+	}
+	host->h_name	   = nsm->sm_name;
 	host->h_addr       = *sin;
 	host->h_addr.sin_port = 0;	/* ouch! */
 	host->h_version    = version;
@@ -119,9 +150,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
+	host->h_nsmhandle  = nsm;
 	host->h_server	   = server;
-	host->h_next       = nlm_hosts[hash];
-	nlm_hosts[hash]    = host;
+	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
@@ -130,35 +161,39 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (++nrhosts > NLM_HOST_MAX)
 		next_gc = 0;
 
-nohost:
+out:
 	mutex_unlock(&nlm_host_mutex);
 	return host;
 }
 
-struct nlm_host *
-nlm_find_client(void)
+/*
+ * Destroy a host
+ */
+static void
+nlm_destroy_host(struct nlm_host *host)
 {
-	/* find a nlm_host for a client for which h_killed == 0.
-	 * and return it
+	struct rpc_clnt	*clnt;
+
+	BUG_ON(!list_empty(&host->h_lockowners));
+	BUG_ON(atomic_read(&host->h_count));
+
+	/*
+	 * Release NSM handle and unmonitor host.
 	 */
-	int hash;
-	mutex_lock(&nlm_host_mutex);
-	for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) {
-		struct nlm_host *host, **hp;
-		for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
-			if (host->h_server &&
-			    host->h_killed == 0) {
-				nlm_get_host(host);
-				mutex_unlock(&nlm_host_mutex);
-				return host;
-			}
+	nsm_unmonitor(host);
+
+	if ((clnt = host->h_rpcclnt) != NULL) {
+		if (atomic_read(&clnt->cl_users)) {
+			printk(KERN_WARNING
+				"lockd: active RPC handle\n");
+			clnt->cl_dead = 1;
+		} else {
+			rpc_destroy_client(host->h_rpcclnt);
 		}
 	}
-	mutex_unlock(&nlm_host_mutex);
-	return NULL;
+	kfree(host);
 }
 
-
 /*
  * Create the NLM RPC client for an NLM peer
  */
@@ -260,22 +295,82 @@ void nlm_release_host(struct nlm_host *host)
 }
 
 /*
+ * We were notified that the host indicated by address &sin
+ * has rebooted.
+ * Release all resources held by that peer.
+ */
+void nlm_host_rebooted(const struct sockaddr_in *sin,
+				const char *hostname, int hostname_len,
+				u32 new_state)
+{
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nsm_handle *nsm;
+	struct nlm_host	*host;
+
+	dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
+			hostname, NIPQUAD(sin->sin_addr));
+
+	/* Find the NSM handle for this peer */
+	if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+		return;
+
+	/* When reclaiming locks on this peer, make sure that
+	 * we set up a new notification */
+	nsm->sm_monitored = 0;
+
+	/* Mark all hosts tied to this NSM state as having rebooted.
+	 * We run the loop repeatedly, because we drop the host table
+	 * lock for this.
+	 * To avoid processing a host several times, we match the nsmstate.
+	 */
+again:	mutex_lock(&nlm_host_mutex);
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash) {
+			if (host->h_nsmhandle == nsm
+			 && host->h_nsmstate != new_state) {
+				host->h_nsmstate = new_state;
+				host->h_state++;
+
+				nlm_get_host(host);
+				mutex_unlock(&nlm_host_mutex);
+
+				if (host->h_server) {
+					/* We're server for this guy, just ditch
+					 * all the locks he held. */
+					nlmsvc_free_host_resources(host);
+				} else {
+					/* He's the server, initiate lock recovery. */
+					nlmclnt_recovery(host);
+				}
+
+				nlm_release_host(host);
+				goto again;
+			}
+		}
+	}
+
+	mutex_unlock(&nlm_host_mutex);
+}
+
+/*
  * Shut down the hosts module.
  * Note that this routine is called only at server shutdown time.
  */
 void
 nlm_shutdown_hosts(void)
 {
+	struct hlist_head *chain;
+	struct hlist_node *pos;
 	struct nlm_host	*host;
-	int		i;
 
 	dprintk("lockd: shutting down host module\n");
 	mutex_lock(&nlm_host_mutex);
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_expires = jiffies - 1;
 	}
 
@@ -287,8 +382,8 @@ nlm_shutdown_hosts(void)
 	if (nrhosts) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
 		dprintk("lockd: %d hosts left:\n", nrhosts);
-		for (i = 0; i < NLM_HOST_NRHASH; i++) {
-			for (host = nlm_hosts[i]; host; host = host->h_next) {
+		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+			hlist_for_each_entry(host, pos, chain, h_hash) {
 				dprintk("       %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
@@ -305,45 +400,32 @@ nlm_shutdown_hosts(void)
 static void
 nlm_gc_hosts(void)
 {
-	struct nlm_host	**q, *host;
-	struct rpc_clnt	*clnt;
-	int		i;
+	struct hlist_head *chain;
+	struct hlist_node *pos, *next;
+	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_inuse = 0;
 	}
 
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources();
 
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		q = &nlm_hosts[i];
-		while ((host = *q) != NULL) {
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
 			if (atomic_read(&host->h_count) || host->h_inuse
 			 || time_before(jiffies, host->h_expires)) {
 				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
-				q = &host->h_next;
 				continue;
 			}
 			dprintk("lockd: delete host %s\n", host->h_name);
-			*q = host->h_next;
-			/* Don't unmonitor hosts that have been invalidated */
-			if (host->h_monitored && !host->h_killed)
-				nsm_unmonitor(host);
-			if ((clnt = host->h_rpcclnt) != NULL) {
-				if (atomic_read(&clnt->cl_users)) {
-					printk(KERN_WARNING
-						"lockd: active RPC handle\n");
-					clnt->cl_dead = 1;
-				} else {
-					rpc_destroy_client(host->h_rpcclnt);
-				}
-			}
-			kfree(host);
+			hlist_del_init(&host->h_hash);
+
+			nlm_destroy_host(host);
 			nrhosts--;
 		}
 	}
@@ -351,3 +433,88 @@ nlm_gc_hosts(void)
351 next_gc = jiffies + NLM_HOST_COLLECT; 433 next_gc = jiffies + NLM_HOST_COLLECT;
352} 434}
353 435
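nlm_gc_hosts() above is a two-pass mark-and-sweep: pass one clears h_inuse on every host, nlmsvc_mark_resources() re-marks the hosts that still own locks, blocks or shares, and pass two unlinks whatever stayed unmarked. The switch to hlist_for_each_entry_safe matters because entries are deleted mid-walk; the safe variant caches the next pointer before the current node can be freed. A userspace model of that sweep over a plain singly linked chain (illustrative only):

#include <stdlib.h>

struct host {
	struct host *next;
	int refcount, inuse;
	long expires;
};

static void sweep(struct host **chain, long now)
{
	struct host **prevp = chain;
	struct host *h, *next;

	for (h = *chain; h != NULL; h = next) {
		next = h->next;			/* cache before h can be freed */
		if (h->refcount || h->inuse || now < h->expires) {
			prevp = &h->next;	/* still alive: step over it */
			continue;
		}
		*prevp = next;			/* unlink first, then free is safe */
		free(h);
	}
}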
436
437/*
438 * Manage NSM handles
439 */
440static LIST_HEAD(nsm_handles);
441static DEFINE_MUTEX(nsm_mutex);
442
443static struct nsm_handle *
444__nsm_find(const struct sockaddr_in *sin,
445 const char *hostname, int hostname_len,
446 int create)
447{
448 struct nsm_handle *nsm = NULL;
449 struct list_head *pos;
450
451 if (!sin)
452 return NULL;
453
454 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
455 if (printk_ratelimit()) {
456 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
457 "in NFS lock request\n",
458 hostname_len, hostname);
459 }
460 return NULL;
461 }
462
463 mutex_lock(&nsm_mutex);
464 list_for_each(pos, &nsm_handles) {
465 nsm = list_entry(pos, struct nsm_handle, sm_link);
466
467 if (hostname && nsm_use_hostnames) {
468 if (strlen(nsm->sm_name) != hostname_len
469 || memcmp(nsm->sm_name, hostname, hostname_len))
470 continue;
471 } else if (!nlm_cmp_addr(&nsm->sm_addr, sin))
472 continue;
473 atomic_inc(&nsm->sm_count);
474 goto out;
475 }
476
477 if (!create) {
478 nsm = NULL;
479 goto out;
480 }
481
482 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
483 if (nsm != NULL) {
484 nsm->sm_addr = *sin;
485 nsm->sm_name = (char *) (nsm + 1);
486 memcpy(nsm->sm_name, hostname, hostname_len);
487 nsm->sm_name[hostname_len] = '\0';
488 atomic_set(&nsm->sm_count, 1);
489
490 list_add(&nsm->sm_link, &nsm_handles);
491 }
492
493out:
494 mutex_unlock(&nsm_mutex);
495 return nsm;
496}
497
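__nsm_find() deduplicates handles under nsm_mutex, comparing by hostname when nsm_use_hostnames is set and by address otherwise. Note the allocation trick on the create path: kzalloc reserves sizeof(*nsm) + hostname_len + 1 bytes, so the name string lives in the same allocation as the handle and a single kfree() in nsm_release() frees both. The same trick in plain C (a sketch, not the kernel code):

#include <stdlib.h>
#include <string.h>

struct handle {
	char *name;	/* points just past the struct itself */
	int count;
};

static struct handle *handle_alloc(const char *hostname, size_t len)
{
	/* one allocation for struct + name + NUL, like "nsm + 1" above */
	struct handle *h = calloc(1, sizeof(*h) + len + 1);

	if (h) {
		h->name = (char *)(h + 1);
		memcpy(h->name, hostname, len);
		h->name[len] = '\0';
		h->count = 1;
	}
	return h;
}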
498struct nsm_handle *
499nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
500{
501 return __nsm_find(sin, hostname, hostname_len, 1);
502}
503
504/*
505 * Release an NSM handle
506 */
507void
508nsm_release(struct nsm_handle *nsm)
509{
510 if (!nsm)
511 return;
512 if (atomic_dec_and_test(&nsm->sm_count)) {
513 mutex_lock(&nsm_mutex);
514 if (atomic_read(&nsm->sm_count) == 0) {
515 list_del(&nsm->sm_link);
516 kfree(nsm);
517 }
518 mutex_unlock(&nsm_mutex);
519 }
520}
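nsm_release() is careful about a narrow race: the count can drop to zero while a concurrent __nsm_find() is about to take a new reference, so the handle is unlinked and freed only if the count is still zero once nsm_mutex is held. That stays correct because __nsm_find() bumps the count only while holding the same mutex. A userspace model of the pattern, assuming C11 atomics (illustrative names):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct handle { atomic_int count; };

static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_release(struct handle *h)
{
	if (!h)
		return;
	if (atomic_fetch_sub(&h->count, 1) == 1) {	/* dropped the last ref */
		pthread_mutex_lock(&handle_lock);
		/* A lookup may have re-taken a reference (under handle_lock)
		 * between our decrement and this point, so re-check. */
		if (atomic_load(&h->count) == 0)
			free(h);	/* would also list_del() in the kernel */
		pthread_mutex_unlock(&handle_lock);
	}
}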
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index a816b920d431..eb243edf8932 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -24,13 +24,13 @@ static struct rpc_program nsm_program;
24/* 24/*
25 * Local NSM state 25 * Local NSM state
26 */ 26 */
27u32 nsm_local_state; 27int nsm_local_state;
28 28
29/* 29/*
30 * Common procedure for SM_MON/SM_UNMON calls 30 * Common procedure for SM_MON/SM_UNMON calls
31 */ 31 */
32static int 32static int
33nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res) 33nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
34{ 34{
35 struct rpc_clnt *clnt; 35 struct rpc_clnt *clnt;
36 int status; 36 int status;
@@ -46,10 +46,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
46 goto out; 46 goto out;
47 } 47 }
48 48
49 args.addr = host->h_addr.sin_addr.s_addr; 49 memset(&args, 0, sizeof(args));
50 args.proto= (host->h_proto<<1) | host->h_server; 50 args.mon_name = nsm->sm_name;
51 args.addr = nsm->sm_addr.sin_addr.s_addr;
51 args.prog = NLM_PROGRAM; 52 args.prog = NLM_PROGRAM;
52 args.vers = host->h_version; 53 args.vers = 3;
53 args.proc = NLMPROC_NSM_NOTIFY; 54 args.proc = NLMPROC_NSM_NOTIFY;
54 memset(res, 0, sizeof(*res)); 55 memset(res, 0, sizeof(*res));
55 56
@@ -70,17 +71,22 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
70int 71int
71nsm_monitor(struct nlm_host *host) 72nsm_monitor(struct nlm_host *host)
72{ 73{
74 struct nsm_handle *nsm = host->h_nsmhandle;
73 struct nsm_res res; 75 struct nsm_res res;
74 int status; 76 int status;
75 77
76 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 78 dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
79 BUG_ON(nsm == NULL);
77 80
78 status = nsm_mon_unmon(host, SM_MON, &res); 81 if (nsm->sm_monitored)
82 return 0;
83
84 status = nsm_mon_unmon(nsm, SM_MON, &res);
79 85
80 if (status < 0 || res.status != 0) 86 if (status < 0 || res.status != 0)
81 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 87 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
82 else 88 else
83 host->h_monitored = 1; 89 nsm->sm_monitored = 1;
84 return status; 90 return status;
85} 91}
86 92
@@ -90,16 +96,26 @@ nsm_monitor(struct nlm_host *host)
90int 96int
91nsm_unmonitor(struct nlm_host *host) 97nsm_unmonitor(struct nlm_host *host)
92{ 98{
99 struct nsm_handle *nsm = host->h_nsmhandle;
93 struct nsm_res res; 100 struct nsm_res res;
94 int status; 101 int status = 0;
95 102
96 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 103 if (nsm == NULL)
97 104 return 0;
98 status = nsm_mon_unmon(host, SM_UNMON, &res); 105 host->h_nsmhandle = NULL;
99 if (status < 0) 106
100 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", host->h_name); 107 if (atomic_read(&nsm->sm_count) == 1
101 else 108 && nsm->sm_monitored && !nsm->sm_sticky) {
102 host->h_monitored = 0; 109 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
110
111 status = nsm_mon_unmon(nsm, SM_UNMON, &res);
112 if (status < 0)
113 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
114 host->h_name);
115 else
116 nsm->sm_monitored = 0;
117 }
118 nsm_release(nsm);
103 return status; 119 return status;
104} 120}
105 121
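The monitoring state thus moves from the per-host h_monitored flag onto the shared nsm_handle: nsm_monitor() becomes idempotent (it returns early if sm_monitored is already set), and nsm_unmonitor() only sends SM_UNMON once the caller holds the last reference and sm_sticky has not pinned the registration. A compact restatement of that rule (field names follow the patch; the struct itself is a sketch):

struct nsm_handle_model {
	int sm_count;			/* references from nlm_host objects */
	unsigned sm_monitored : 1;	/* statd currently watches this peer */
	unsigned sm_sticky : 1;		/* keep the registration at last put */
};

/* SM_UNMON is only worth sending when nobody else still needs statd
 * to watch this peer and the handle is not pinned. */
static int should_unmonitor(const struct nsm_handle_model *nsm)
{
	return nsm->sm_count == 1 && nsm->sm_monitored && !nsm->sm_sticky;
}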
@@ -132,10 +148,10 @@ nsm_create(void)
132 * XDR functions for NSM. 148 * XDR functions for NSM.
133 */ 149 */
134 150
135static u32 * 151static __be32 *
136xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 152xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
137{ 153{
138 char buffer[20]; 154 char buffer[20], *name;
139 155
140 /* 156 /*
141 * Use the dotted-quad IP address of the remote host as 157 * Use the dotted-quad IP address of the remote host as
@@ -143,8 +159,13 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
143 * hostname first for whatever remote hostname it receives, 159 * hostname first for whatever remote hostname it receives,
144 * so this works alright. 160 * so this works alright.
145 */ 161 */
146 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); 162 if (nsm_use_hostnames) {
147 if (!(p = xdr_encode_string(p, buffer)) 163 name = argp->mon_name;
164 } else {
165 sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
166 name = buffer;
167 }
168 if (!(p = xdr_encode_string(p, name))
148 || !(p = xdr_encode_string(p, utsname()->nodename))) 169 || !(p = xdr_encode_string(p, utsname()->nodename)))
149 return ERR_PTR(-EIO); 170 return ERR_PTR(-EIO);
150 *p++ = htonl(argp->prog); 171 *p++ = htonl(argp->prog);
@@ -155,21 +176,23 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
155} 176}
156 177
157static int 178static int
158xdr_encode_mon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 179xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
159{ 180{
160 p = xdr_encode_common(rqstp, p, argp); 181 p = xdr_encode_common(rqstp, p, argp);
161 if (IS_ERR(p)) 182 if (IS_ERR(p))
162 return PTR_ERR(p); 183 return PTR_ERR(p);
184
185 /* Surprise - there may even be room for an IPv6 address now */
163 *p++ = argp->addr; 186 *p++ = argp->addr;
164 *p++ = argp->vers; 187 *p++ = 0;
165 *p++ = argp->proto; 188 *p++ = 0;
166 *p++ = 0; 189 *p++ = 0;
167 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 190 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
168 return 0; 191 return 0;
169} 192}
170 193
171static int 194static int
172xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) 195xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
173{ 196{
174 p = xdr_encode_common(rqstp, p, argp); 197 p = xdr_encode_common(rqstp, p, argp);
175 if (IS_ERR(p)) 198 if (IS_ERR(p))
@@ -179,7 +202,7 @@ xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
179} 202}
180 203
181static int 204static int
182xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp) 205xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
183{ 206{
184 resp->status = ntohl(*p++); 207 resp->status = ntohl(*p++);
185 resp->state = ntohl(*p++); 208 resp->state = ntohl(*p++);
@@ -189,7 +212,7 @@ xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
189} 212}
190 213
191static int 214static int
192xdr_decode_stat(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp) 215xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
193{ 216{
194 resp->state = ntohl(*p++); 217 resp->state = ntohl(*p++);
195 return 0; 218 return 0;
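The XDR side of these calls is simple: two counted strings (the mon_name chosen above plus our own nodename), three 32-bit words for prog/vers/proc, and 16 bytes of opaque private data that now carry the address plus three zero words. An XDR string is a 4-byte big-endian length, the bytes, then zero padding to a 4-byte boundary; a hand-rolled userspace encoder for that piece (a sketch of what xdr_encode_string() produces on the wire, not its actual implementation):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static uint32_t *xdr_put_string(uint32_t *p, const char *s)
{
	size_t len = strlen(s);

	*p++ = htonl((uint32_t)len);			/* big-endian length word */
	memcpy(p, s, len);				/* the bytes themselves */
	memset((char *)p + len, 0, (4 - len % 4) % 4);	/* pad to 4 bytes */
	return p + (len + 3) / 4;			/* advance past padded data */
}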
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 3cc369e5693f..634139232aaf 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -33,6 +33,7 @@
33#include <linux/sunrpc/svcsock.h> 33#include <linux/sunrpc/svcsock.h>
34#include <net/ip.h> 34#include <net/ip.h>
35#include <linux/lockd/lockd.h> 35#include <linux/lockd/lockd.h>
36#include <linux/lockd/sm_inter.h>
36#include <linux/nfs.h> 37#include <linux/nfs.h>
37 38
38#define NLMDBG_FACILITY NLMDBG_SVC 39#define NLMDBG_FACILITY NLMDBG_SVC
@@ -61,6 +62,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
61static unsigned long nlm_grace_period; 62static unsigned long nlm_grace_period;
62static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 63static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
63static int nlm_udpport, nlm_tcpport; 64static int nlm_udpport, nlm_tcpport;
65int nsm_use_hostnames = 0;
64 66
65/* 67/*
66 * Constants needed for the sysctl interface. 68 * Constants needed for the sysctl interface.
@@ -395,6 +397,22 @@ static ctl_table nlm_sysctls[] = {
395 .extra1 = (int *) &nlm_port_min, 397 .extra1 = (int *) &nlm_port_min,
396 .extra2 = (int *) &nlm_port_max, 398 .extra2 = (int *) &nlm_port_max,
397 }, 399 },
400 {
401 .ctl_name = CTL_UNNUMBERED,
402 .procname = "nsm_use_hostnames",
403 .data = &nsm_use_hostnames,
404 .maxlen = sizeof(int),
405 .mode = 0644,
406 .proc_handler = &proc_dointvec,
407 },
408 {
409 .ctl_name = CTL_UNNUMBERED,
410 .procname = "nsm_local_state",
411 .data = &nsm_local_state,
412 .maxlen = sizeof(int),
413 .mode = 0644,
414 .proc_handler = &proc_dointvec,
415 },
398 { .ctl_name = 0 } 416 { .ctl_name = 0 }
399}; 417};
400 418
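Both new knobs are plain integers served by proc_dointvec, so once lockd registers its table they can be flipped at run time through procfs; on kernels of this vintage the lockd sysctls live under /proc/sys/fs/nfs/ (treat the exact path as an assumption and verify on your system). A trivial userspace toggle:

#include <stdio.h>

int main(void)
{
	/* Path assumes the usual lockd sysctl directory. */
	FILE *f = fopen("/proc/sys/fs/nfs/nsm_use_hostnames", "w");

	if (!f) {
		perror("nsm_use_hostnames");
		return 1;
	}
	fputs("1\n", f);	/* match NSM handles by hostname */
	return fclose(f) ? 1 : 0;
}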
@@ -483,6 +501,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
483 &nlm_udpport, 0644); 501 &nlm_udpport, 0644);
484module_param_call(nlm_tcpport, param_set_port, param_get_int, 502module_param_call(nlm_tcpport, param_set_port, param_get_int,
485 &nlm_tcpport, 0644); 503 &nlm_tcpport, 0644);
504module_param(nsm_use_hostnames, bool, 0644);
486 505
487/* 506/*
488 * Initialising and terminating the module. 507 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a2dd9ccb9b32..0ce5c81ff507 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -24,22 +24,22 @@
24/* 24/*
25 * Obtain client and file from arguments 25 * Obtain client and file from arguments
26 */ 26 */
27static u32 27static __be32
28nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, 28nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
29 struct nlm_host **hostp, struct nlm_file **filp) 29 struct nlm_host **hostp, struct nlm_file **filp)
30{ 30{
31 struct nlm_host *host = NULL; 31 struct nlm_host *host = NULL;
32 struct nlm_file *file = NULL; 32 struct nlm_file *file = NULL;
33 struct nlm_lock *lock = &argp->lock; 33 struct nlm_lock *lock = &argp->lock;
34 u32 error = 0; 34 __be32 error = 0;
35 35
36 /* nfsd callbacks must have been installed for this procedure */ 36 /* nfsd callbacks must have been installed for this procedure */
37 if (!nlmsvc_ops) 37 if (!nlmsvc_ops)
38 return nlm_lck_denied_nolocks; 38 return nlm_lck_denied_nolocks;
39 39
40 /* Obtain host handle */ 40 /* Obtain host handle */
41 if (!(host = nlmsvc_lookup_host(rqstp)) 41 if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
42 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) 42 || (argp->monitor && nsm_monitor(host) < 0))
43 goto no_locks; 43 goto no_locks;
44 *hostp = host; 44 *hostp = host;
45 45
@@ -68,7 +68,7 @@ no_locks:
68/* 68/*
69 * NULL: Test for presence of service 69 * NULL: Test for presence of service
70 */ 70 */
71static int 71static __be32
72nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 72nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
73{ 73{
74 dprintk("lockd: NULL called\n"); 74 dprintk("lockd: NULL called\n");
@@ -78,7 +78,7 @@ nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
78/* 78/*
79 * TEST: Check for conflicting lock 79 * TEST: Check for conflicting lock
80 */ 80 */
81static int 81static __be32
82nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, 82nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
83 struct nlm_res *resp) 83 struct nlm_res *resp)
84{ 84{
@@ -96,7 +96,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
96 96
97 /* Obtain client and file */ 97 /* Obtain client and file */
98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
99 return rpc_success; 99 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
100 100
101 /* Now check for conflicting locks */ 101 /* Now check for conflicting locks */
102 resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock); 102 resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
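This mapping recurs in every procedure below: when nlm4svc_retrieve_args() hands back nlm_drop_reply, the service function returns rpc_drop_reply, which tells the sunrpc layer to send no answer at all and let the client retransmit, instead of the old behaviour of always answering rpc_success with an error status inside the NLM reply. The shape of the test, isolated (a sketch; the enum values stand in for the kernel's constants):

#include <stdint.h>

enum disposition { RPC_SUCCESS, RPC_DROP_REPLY };

static enum disposition classify(uint32_t status, uint32_t nlm_drop_reply)
{
	/* Drop silently only for the one sentinel status; every other
	 * failure still travels inside a normal NLM reply. */
	return status == nlm_drop_reply ? RPC_DROP_REPLY : RPC_SUCCESS;
}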
@@ -107,7 +107,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
107 return rpc_success; 107 return rpc_success;
108} 108}
109 109
110static int 110static __be32
111nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 111nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
112 struct nlm_res *resp) 112 struct nlm_res *resp)
113{ 113{
@@ -126,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
126 126
127 /* Obtain client and file */ 127 /* Obtain client and file */
128 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 128 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
129 return rpc_success; 129 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
130 130
131#if 0 131#if 0
132 /* If supplied state doesn't match current state, we assume it's 132 /* If supplied state doesn't match current state, we assume it's
@@ -150,7 +150,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
150 return rpc_success; 150 return rpc_success;
151} 151}
152 152
153static int 153static __be32
154nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, 154nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
155 struct nlm_res *resp) 155 struct nlm_res *resp)
156{ 156{
@@ -169,7 +169,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
169 169
170 /* Obtain client and file */ 170 /* Obtain client and file */
171 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 171 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
172 return rpc_success; 172 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
173 173
174 /* Try to cancel request. */ 174 /* Try to cancel request. */
175 resp->status = nlmsvc_cancel_blocked(file, &argp->lock); 175 resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
@@ -183,7 +183,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
183/* 183/*
184 * UNLOCK: release a lock 184 * UNLOCK: release a lock
185 */ 185 */
186static int 186static __be32
187nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, 187nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
188 struct nlm_res *resp) 188 struct nlm_res *resp)
189{ 189{
@@ -202,7 +202,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
202 202
203 /* Obtain client and file */ 203 /* Obtain client and file */
204 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 204 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
205 return rpc_success; 205 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
206 206
207 /* Now try to remove the lock */ 207 /* Now try to remove the lock */
208 resp->status = nlmsvc_unlock(file, &argp->lock); 208 resp->status = nlmsvc_unlock(file, &argp->lock);
@@ -217,7 +217,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
217 * GRANTED: A server calls us to report that a process' lock request 217 * GRANTED: A server calls us to report that a process' lock request
218 * was granted 218 * was granted
219 */ 219 */
220static int 220static __be32
221nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, 221nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
222 struct nlm_res *resp) 222 struct nlm_res *resp)
223{ 223{
@@ -253,14 +253,16 @@ static const struct rpc_call_ops nlm4svc_callback_ops = {
253 * because we send the callback before the reply proper. I hope this 253 * because we send the callback before the reply proper. I hope this
254 * doesn't break any clients. 254 * doesn't break any clients.
255 */ 255 */
256static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, 256static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
257 int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) 257 __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
258{ 258{
259 struct nlm_host *host; 259 struct nlm_host *host;
260 struct nlm_rqst *call; 260 struct nlm_rqst *call;
261 int stat; 261 __be32 stat;
262 262
263 host = nlmsvc_lookup_host(rqstp); 263 host = nlmsvc_lookup_host(rqstp,
264 argp->lock.caller,
265 argp->lock.len);
264 if (host == NULL) 266 if (host == NULL)
265 return rpc_system_err; 267 return rpc_system_err;
266 268
@@ -280,35 +282,35 @@ static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *a
280 return rpc_success; 282 return rpc_success;
281} 283}
282 284
283static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 285static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
284 void *resp) 286 void *resp)
285{ 287{
286 dprintk("lockd: TEST_MSG called\n"); 288 dprintk("lockd: TEST_MSG called\n");
287 return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test); 289 return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
288} 290}
289 291
290static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 292static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
291 void *resp) 293 void *resp)
292{ 294{
293 dprintk("lockd: LOCK_MSG called\n"); 295 dprintk("lockd: LOCK_MSG called\n");
294 return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock); 296 return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
295} 297}
296 298
297static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 299static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
298 void *resp) 300 void *resp)
299{ 301{
300 dprintk("lockd: CANCEL_MSG called\n"); 302 dprintk("lockd: CANCEL_MSG called\n");
301 return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel); 303 return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
302} 304}
303 305
304static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 306static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
305 void *resp) 307 void *resp)
306{ 308{
307 dprintk("lockd: UNLOCK_MSG called\n"); 309 dprintk("lockd: UNLOCK_MSG called\n");
308 return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock); 310 return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
309} 311}
310 312
311static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 313static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
312 void *resp) 314 void *resp)
313{ 315{
314 dprintk("lockd: GRANTED_MSG called\n"); 316 dprintk("lockd: GRANTED_MSG called\n");
@@ -318,7 +320,7 @@ static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *arg
318/* 320/*
319 * SHARE: create a DOS share or alter existing share. 321 * SHARE: create a DOS share or alter existing share.
320 */ 322 */
321static int 323static __be32
322nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, 324nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
323 struct nlm_res *resp) 325 struct nlm_res *resp)
324{ 326{
@@ -337,7 +339,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
337 339
338 /* Obtain client and file */ 340 /* Obtain client and file */
339 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 341 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
340 return rpc_success; 342 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
341 343
342 /* Now try to create the share */ 344 /* Now try to create the share */
343 resp->status = nlmsvc_share_file(host, file, argp); 345 resp->status = nlmsvc_share_file(host, file, argp);
@@ -351,7 +353,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
351/* 353/*
352 * UNSHARE: Release a DOS share. 354 * UNSHARE: Release a DOS share.
353 */ 355 */
354static int 356static __be32
355nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, 357nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
356 struct nlm_res *resp) 358 struct nlm_res *resp)
357{ 359{
@@ -370,7 +372,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
370 372
371 /* Obtain client and file */ 373 /* Obtain client and file */
372 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 374 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
373 return rpc_success; 375 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
374 376
375 /* Now try to lock the file */ 377 /* Now try to lock the file */
376 resp->status = nlmsvc_unshare_file(host, file, argp); 378 resp->status = nlmsvc_unshare_file(host, file, argp);
@@ -384,7 +386,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
384/* 386/*
385 * NM_LOCK: Create an unmonitored lock 387 * NM_LOCK: Create an unmonitored lock
386 */ 388 */
387static int 389static __be32
388nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 390nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
389 struct nlm_res *resp) 391 struct nlm_res *resp)
390{ 392{
@@ -397,7 +399,7 @@ nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
397/* 399/*
398 * FREE_ALL: Release all locks and shares held by client 400 * FREE_ALL: Release all locks and shares held by client
399 */ 401 */
400static int 402static __be32
401nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, 403nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
402 void *resp) 404 void *resp)
403{ 405{
@@ -415,15 +417,11 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
415/* 417/*
416 * SM_NOTIFY: private callback from statd (not part of official NLM proto) 418 * SM_NOTIFY: private callback from statd (not part of official NLM proto)
417 */ 419 */
418static int 420static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 421nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 422 void *resp)
421{ 423{
422 struct sockaddr_in saddr = rqstp->rq_addr; 424 struct sockaddr_in saddr = rqstp->rq_addr;
423 int vers = argp->vers;
424 int prot = argp->proto >> 1;
425
426 struct nlm_host *host;
427 425
428 dprintk("lockd: SM_NOTIFY called\n"); 426 dprintk("lockd: SM_NOTIFY called\n");
429 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 427 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -438,28 +436,17 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
438 /* Obtain the host pointer for this NFS server and try to 436 /* Obtain the host pointer for this NFS server and try to
439 * reclaim all locks we hold on this server. 437 * reclaim all locks we hold on this server.
440 */ 438 */
439 memset(&saddr, 0, sizeof(saddr));
441 saddr.sin_addr.s_addr = argp->addr; 440 saddr.sin_addr.s_addr = argp->addr;
441 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
442 442
443 if ((argp->proto & 1)==0) {
444 if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
445 nlmclnt_recovery(host, argp->state);
446 nlm_release_host(host);
447 }
448 } else {
449 /* If we run on an NFS server, delete all locks held by the client */
450
451 if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
452 nlmsvc_free_host_resources(host);
453 nlm_release_host(host);
454 }
455 }
456 return rpc_success; 443 return rpc_success;
457} 444}
458 445
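Worth noting in the rewritten SM_NOTIFY handler: saddr is now memset() before only the address is filled in, so the sockaddr_in passed to nlm_host_rebooted() carries no stale port, family or padding bytes from the loopback request that delivered the notification (the exact motivation is an inference from the code). Building such a lookup key in isolation (a sketch):

#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

static struct sockaddr_in make_reboot_key(uint32_t s_addr)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));	/* no stale port/family/padding */
	sin.sin_addr.s_addr = s_addr;	/* only the address identifies the peer */
	return sin;
}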
459/* 446/*
460 * client sent a GRANTED_RES, let's remove the associated block 447 * client sent a GRANTED_RES, let's remove the associated block
461 */ 448 */
462static int 449static __be32
463nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, 450nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
464 void *resp) 451 void *resp)
465{ 452{
@@ -468,7 +455,7 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
468 455
469 dprintk("lockd: GRANTED_RES called\n"); 456 dprintk("lockd: GRANTED_RES called\n");
470 457
471 nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); 458 nlmsvc_grant_reply(&argp->cookie, argp->status);
472 return rpc_success; 459 return rpc_success;
473} 460}
474 461
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 93c00ee7189d..7e219b938552 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -40,7 +40,7 @@
40 40
41static void nlmsvc_release_block(struct nlm_block *block); 41static void nlmsvc_release_block(struct nlm_block *block);
42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); 42static void nlmsvc_insert_block(struct nlm_block *block, unsigned long);
43static int nlmsvc_remove_block(struct nlm_block *block); 43static void nlmsvc_remove_block(struct nlm_block *block);
44 44
45static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); 45static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
46static void nlmsvc_freegrantargs(struct nlm_rqst *call); 46static void nlmsvc_freegrantargs(struct nlm_rqst *call);
@@ -49,7 +49,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
49/* 49/*
50 * The list of blocked locks to retry 50 * The list of blocked locks to retry
51 */ 51 */
52static struct nlm_block * nlm_blocked; 52static LIST_HEAD(nlm_blocked);
53 53
54/* 54/*
55 * Insert a blocked lock into the global list 55 * Insert a blocked lock into the global list
@@ -57,48 +57,44 @@ static struct nlm_block * nlm_blocked;
57static void 57static void
58nlmsvc_insert_block(struct nlm_block *block, unsigned long when) 58nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
59{ 59{
60 struct nlm_block **bp, *b; 60 struct nlm_block *b;
61 struct list_head *pos;
61 62
62 dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); 63 dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
63 kref_get(&block->b_count); 64 if (list_empty(&block->b_list)) {
64 if (block->b_queued) 65 kref_get(&block->b_count);
65 nlmsvc_remove_block(block); 66 } else {
66 bp = &nlm_blocked; 67 list_del_init(&block->b_list);
68 }
69
70 pos = &nlm_blocked;
67 if (when != NLM_NEVER) { 71 if (when != NLM_NEVER) {
68 if ((when += jiffies) == NLM_NEVER) 72 if ((when += jiffies) == NLM_NEVER)
69 when ++; 73 when ++;
70 while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER) 74 list_for_each(pos, &nlm_blocked) {
71 bp = &b->b_next; 75 b = list_entry(pos, struct nlm_block, b_list);
72 } else 76 if (time_after(b->b_when,when) || b->b_when == NLM_NEVER)
73 while ((b = *bp) != 0) 77 break;
74 bp = &b->b_next; 78 }
79 /* On normal exit from the loop, pos == &nlm_blocked,
80 * so we will be adding to the end of the list - good
81 */
82 }
75 83
76 block->b_queued = 1; 84 list_add_tail(&block->b_list, pos);
77 block->b_when = when; 85 block->b_when = when;
78 block->b_next = b;
79 *bp = block;
80} 86}
81 87
82/* 88/*
83 * Remove a block from the global list 89 * Remove a block from the global list
84 */ 90 */
85static int 91static inline void
86nlmsvc_remove_block(struct nlm_block *block) 92nlmsvc_remove_block(struct nlm_block *block)
87{ 93{
88 struct nlm_block **bp, *b; 94 if (!list_empty(&block->b_list)) {
89 95 list_del_init(&block->b_list);
90 if (!block->b_queued) 96 nlmsvc_release_block(block);
91 return 1;
92 for (bp = &nlm_blocked; (b = *bp) != 0; bp = &b->b_next) {
93 if (b == block) {
94 *bp = block->b_next;
95 block->b_queued = 0;
96 nlmsvc_release_block(block);
97 return 1;
98 }
99 } 97 }
100
101 return 0;
102} 98}
103 99
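Two things changed shape here: nlm_blocked became a list_head, and an empty b_list now doubles as the old b_queued flag (hence list_empty() deciding whether a new reference is needed). The insertion keeps the list sorted by wake-up time with NLM_NEVER entries parked at the tail, so the retry loop only ever needs to look at the head. The same sorted insert over a bare circular list with a head sentinel (userspace model, not the kernel's list.h):

struct node { struct node *prev, *next; long when; };

/* head is a circular sentinel: head->next == head when the list is empty */
static void insert_sorted(struct node *head, struct node *b, long never)
{
	struct node *pos;

	for (pos = head->next; pos != head; pos = pos->next) {
		if (b->when != never &&
		    (pos->when > b->when || pos->when == never))
			break;	/* first entry that wakes up later than b */
	}
	/* link b just before pos; pos == head appends at the tail */
	b->prev = pos->prev;
	b->next = pos;
	pos->prev->next = b;
	pos->prev = b;
}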
104/* 100/*
@@ -107,14 +103,14 @@ nlmsvc_remove_block(struct nlm_block *block)
107static struct nlm_block * 103static struct nlm_block *
108nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) 104nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
109{ 105{
110 struct nlm_block **head, *block; 106 struct nlm_block *block;
111 struct file_lock *fl; 107 struct file_lock *fl;
112 108
113 dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", 109 dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
114 file, lock->fl.fl_pid, 110 file, lock->fl.fl_pid,
115 (long long)lock->fl.fl_start, 111 (long long)lock->fl.fl_start,
116 (long long)lock->fl.fl_end, lock->fl.fl_type); 112 (long long)lock->fl.fl_end, lock->fl.fl_type);
117 for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) { 113 list_for_each_entry(block, &nlm_blocked, b_list) {
118 fl = &block->b_call->a_args.lock.fl; 114 fl = &block->b_call->a_args.lock.fl;
119 dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", 115 dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
120 block->b_file, fl->fl_pid, 116 block->b_file, fl->fl_pid,
@@ -143,20 +139,20 @@ static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
143 * Find a block with a given NLM cookie. 139 * Find a block with a given NLM cookie.
144 */ 140 */
145static inline struct nlm_block * 141static inline struct nlm_block *
146nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) 142nlmsvc_find_block(struct nlm_cookie *cookie)
147{ 143{
148 struct nlm_block *block; 144 struct nlm_block *block;
149 145
150 for (block = nlm_blocked; block; block = block->b_next) { 146 list_for_each_entry(block, &nlm_blocked, b_list) {
151 dprintk("cookie: head of blocked queue %p, block %p\n", 147 if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
152 nlm_blocked, block); 148 goto found;
153 if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)
154 && nlm_cmp_addr(sin, &block->b_host->h_addr))
155 break;
156 } 149 }
157 150
158 if (block != NULL) 151 return NULL;
159 kref_get(&block->b_count); 152
153found:
154 dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
155 kref_get(&block->b_count);
160 return block; 156 return block;
161} 157}
162 158
@@ -169,6 +165,11 @@ nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin)
169 * request, but (as I found out later) that's because some implementations 165 * request, but (as I found out later) that's because some implementations
170 * do just this. Never mind the standards committees, they support our 166 * do just this. Never mind the standards committees, they support our
171 * logging industries. 167 * logging industries.
168 *
169 * 10 years later: I hope we can safely ignore these old and broken
170 * clients by now. Let's fix this so we can uniquely identify an incoming
171 * GRANTED_RES message by cookie, without having to rely on the client's IP
172 * address. --okir
172 */ 173 */
173static inline struct nlm_block * 174static inline struct nlm_block *
174nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, 175nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
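With blocks now keyed by a cookie the server generates itself (nlmclnt_next_cookie() in the hunk below) rather than by whatever cookie the client sent, GRANTED_RES matching no longer needs the client's IP address, which is what okir's comment above is alluding to. The essence of a monotonic server-side cookie (a sketch; the kernel version lives in lockd's client code and runs under its own locking):

#include <stdint.h>
#include <string.h>

struct cookie { unsigned char data[8]; unsigned int len; };

static void next_cookie(struct cookie *c)
{
	static uint32_t seq;	/* needs a lock if called concurrently */

	seq++;
	memcpy(c->data, &seq, sizeof(seq));	/* opaque on the wire */
	c->len = sizeof(seq);
}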
@@ -179,7 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
179 struct nlm_rqst *call = NULL; 180 struct nlm_rqst *call = NULL;
180 181
181 /* Create host handle for callback */ 182 /* Create host handle for callback */
182 host = nlmsvc_lookup_host(rqstp); 183 host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
183 if (host == NULL) 184 if (host == NULL)
184 return NULL; 185 return NULL;
185 186
@@ -192,6 +193,8 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
192 if (block == NULL) 193 if (block == NULL)
193 goto failed; 194 goto failed;
194 kref_init(&block->b_count); 195 kref_init(&block->b_count);
196 INIT_LIST_HEAD(&block->b_list);
197 INIT_LIST_HEAD(&block->b_flist);
195 198
196 if (!nlmsvc_setgrantargs(call, lock)) 199 if (!nlmsvc_setgrantargs(call, lock))
197 goto failed_free; 200 goto failed_free;
@@ -199,7 +202,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
199 /* Set notifier function for VFS, and init args */ 202 /* Set notifier function for VFS, and init args */
200 call->a_args.lock.fl.fl_flags |= FL_SLEEP; 203 call->a_args.lock.fl.fl_flags |= FL_SLEEP;
201 call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; 204 call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
202 call->a_args.cookie = *cookie; /* see above */ 205 nlmclnt_next_cookie(&call->a_args.cookie);
203 206
204 dprintk("lockd: created block %p...\n", block); 207 dprintk("lockd: created block %p...\n", block);
205 208
@@ -210,8 +213,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
210 file->f_count++; 213 file->f_count++;
211 214
212 /* Add to file's list of blocks */ 215 /* Add to file's list of blocks */
213 block->b_fnext = file->f_blocks; 216 list_add(&block->b_flist, &file->f_blocks);
214 file->f_blocks = block;
215 217
216 /* Set up RPC arguments for callback */ 218 /* Set up RPC arguments for callback */
217 block->b_call = call; 219 block->b_call = call;
@@ -248,19 +250,13 @@ static void nlmsvc_free_block(struct kref *kref)
248{ 250{
249 struct nlm_block *block = container_of(kref, struct nlm_block, b_count); 251 struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
250 struct nlm_file *file = block->b_file; 252 struct nlm_file *file = block->b_file;
251 struct nlm_block **bp;
252 253
253 dprintk("lockd: freeing block %p...\n", block); 254 dprintk("lockd: freeing block %p...\n", block);
254 255
255 down(&file->f_sema);
256 /* Remove block from file's list of blocks */ 256 /* Remove block from file's list of blocks */
257 for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) { 257 mutex_lock(&file->f_mutex);
258 if (*bp == block) { 258 list_del_init(&block->b_flist);
259 *bp = block->b_fnext; 259 mutex_unlock(&file->f_mutex);
260 break;
261 }
262 }
263 up(&file->f_sema);
264 260
265 nlmsvc_freegrantargs(block->b_call); 261 nlmsvc_freegrantargs(block->b_call);
266 nlm_release_call(block->b_call); 262 nlm_release_call(block->b_call);
@@ -274,47 +270,32 @@ static void nlmsvc_release_block(struct nlm_block *block)
274 kref_put(&block->b_count, nlmsvc_free_block); 270 kref_put(&block->b_count, nlmsvc_free_block);
275} 271}
276 272
277static void nlmsvc_act_mark(struct nlm_host *host, struct nlm_file *file) 273/*
278{ 274 * Loop over all blocks and delete blocks held by
279 struct nlm_block *block; 275 * a matching host.
280 276 */
281 down(&file->f_sema); 277void nlmsvc_traverse_blocks(struct nlm_host *host,
282 for (block = file->f_blocks; block != NULL; block = block->b_fnext) 278 struct nlm_file *file,
283 block->b_host->h_inuse = 1; 279 nlm_host_match_fn_t match)
284 up(&file->f_sema);
285}
286
287static void nlmsvc_act_unlock(struct nlm_host *host, struct nlm_file *file)
288{ 280{
289 struct nlm_block *block; 281 struct nlm_block *block, *next;
290 282
291restart: 283restart:
292 down(&file->f_sema); 284 mutex_lock(&file->f_mutex);
293 for (block = file->f_blocks; block != NULL; block = block->b_fnext) { 285 list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
294 if (host != NULL && host != block->b_host) 286 if (!match(block->b_host, host))
295 continue; 287 continue;
296 if (!block->b_queued) 288 /* Do not destroy blocks that are not on
289 * the global retry list - why? */
290 if (list_empty(&block->b_list))
297 continue; 291 continue;
298 kref_get(&block->b_count); 292 kref_get(&block->b_count);
299 up(&file->f_sema); 293 mutex_unlock(&file->f_mutex);
300 nlmsvc_unlink_block(block); 294 nlmsvc_unlink_block(block);
301 nlmsvc_release_block(block); 295 nlmsvc_release_block(block);
302 goto restart; 296 goto restart;
303 } 297 }
304 up(&file->f_sema); 298 mutex_unlock(&file->f_mutex);
305}
306
307/*
308 * Loop over all blocks and perform the action specified.
309 * (NLM_ACT_CHECK handled by nlmsvc_inspect_file).
310 */
311void
312nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
313{
314 if (action == NLM_ACT_MARK)
315 nlmsvc_act_mark(host, file);
316 else
317 nlmsvc_act_unlock(host, file);
318} 299}
319 300
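The NLM_ACT_MARK / NLM_ACT_UNLOCK action codes are gone: marking gets its own trivial loop elsewhere, and destruction is now parameterised by an nlm_host_match_fn_t, so one traversal can serve "everything for this host", "everything", and "everything whose host shares an NSM handle". The callback shape, spelled out (a sketch; names other than the typedef's intent are illustrative):

#include <stdbool.h>

struct host;	/* opaque here */
typedef bool (*host_match_fn_t)(struct host *candidate, struct host *ref);

static bool match_any(struct host *c, struct host *ref)
{
	(void)c; (void)ref;
	return true;		/* free_all: every block matches */
}

static bool match_exact(struct host *c, struct host *ref)
{
	return c == ref;	/* free only one host's blocks */
}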
320/* 301/*
@@ -353,13 +334,13 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
353 * Attempt to establish a lock, and if it can't be granted, block it 334 * Attempt to establish a lock, and if it can't be granted, block it
354 * if required. 335 * if required.
355 */ 336 */
356u32 337__be32
357nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, 338nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
358 struct nlm_lock *lock, int wait, struct nlm_cookie *cookie) 339 struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
359{ 340{
360 struct nlm_block *block, *newblock = NULL; 341 struct nlm_block *block, *newblock = NULL;
361 int error; 342 int error;
362 u32 ret; 343 __be32 ret;
363 344
364 dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", 345 dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
365 file->f_file->f_dentry->d_inode->i_sb->s_id, 346 file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -373,7 +354,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
373 lock->fl.fl_flags &= ~FL_SLEEP; 354 lock->fl.fl_flags &= ~FL_SLEEP;
374again: 355again:
375 /* Lock file against concurrent access */ 356 /* Lock file against concurrent access */
376 down(&file->f_sema); 357 mutex_lock(&file->f_mutex);
377 /* Get existing block (in case client is busy-waiting) */ 358 /* Get existing block (in case client is busy-waiting) */
378 block = nlmsvc_lookup_block(file, lock); 359 block = nlmsvc_lookup_block(file, lock);
379 if (block == NULL) { 360 if (block == NULL) {
@@ -411,10 +392,10 @@ again:
411 392
412 /* If we don't have a block, create and initialize it. Then 393 /* If we don't have a block, create and initialize it. Then
413 * retry because we may have slept in kmalloc. */ 394 * retry because we may have slept in kmalloc. */
414 /* We have to release f_sema as nlmsvc_create_block may try 395 /* We have to release f_mutex as nlmsvc_create_block may try
415 * to claim it while doing host garbage collection */ 396 * to claim it while doing host garbage collection */
416 if (newblock == NULL) { 397 if (newblock == NULL) {
417 up(&file->f_sema); 398 mutex_unlock(&file->f_mutex);
418 dprintk("lockd: blocking on this lock (allocating).\n"); 399 dprintk("lockd: blocking on this lock (allocating).\n");
419 if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie))) 400 if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
420 return nlm_lck_denied_nolocks; 401 return nlm_lck_denied_nolocks;
@@ -424,7 +405,7 @@ again:
424 /* Append to list of blocked */ 405 /* Append to list of blocked */
425 nlmsvc_insert_block(newblock, NLM_NEVER); 406 nlmsvc_insert_block(newblock, NLM_NEVER);
426out: 407out:
427 up(&file->f_sema); 408 mutex_unlock(&file->f_mutex);
428 nlmsvc_release_block(newblock); 409 nlmsvc_release_block(newblock);
429 nlmsvc_release_block(block); 410 nlmsvc_release_block(block);
430 dprintk("lockd: nlmsvc_lock returned %u\n", ret); 411 dprintk("lockd: nlmsvc_lock returned %u\n", ret);
@@ -434,7 +415,7 @@ out:
434/* 415/*
435 * Test for presence of a conflicting lock. 416 * Test for presence of a conflicting lock.
436 */ 417 */
437u32 418__be32
438nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock, 419nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
439 struct nlm_lock *conflock) 420 struct nlm_lock *conflock)
440{ 421{
@@ -451,6 +432,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
451 (long long)conflock->fl.fl_start, 432 (long long)conflock->fl.fl_start,
452 (long long)conflock->fl.fl_end); 433 (long long)conflock->fl.fl_end);
453 conflock->caller = "somehost"; /* FIXME */ 434 conflock->caller = "somehost"; /* FIXME */
435 conflock->len = strlen(conflock->caller);
454 conflock->oh.len = 0; /* don't return OH info */ 436 conflock->oh.len = 0; /* don't return OH info */
455 conflock->svid = conflock->fl.fl_pid; 437 conflock->svid = conflock->fl.fl_pid;
456 return nlm_lck_denied; 438 return nlm_lck_denied;
@@ -466,7 +448,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
466 * afterwards. In this case the block will still be there, and hence 448 * afterwards. In this case the block will still be there, and hence
467 * must be removed. 449 * must be removed.
468 */ 450 */
469u32 451__be32
470nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock) 452nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
471{ 453{
472 int error; 454 int error;
@@ -494,7 +476,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
494 * be in progress. 476 * be in progress.
495 * The calling procedure must check whether the file can be closed. 477 * The calling procedure must check whether the file can be closed.
496 */ 478 */
497u32 479__be32
498nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) 480nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
499{ 481{
500 struct nlm_block *block; 482 struct nlm_block *block;
@@ -507,9 +489,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
507 (long long)lock->fl.fl_start, 489 (long long)lock->fl.fl_start,
508 (long long)lock->fl.fl_end); 490 (long long)lock->fl.fl_end);
509 491
510 down(&file->f_sema); 492 mutex_lock(&file->f_mutex);
511 block = nlmsvc_lookup_block(file, lock); 493 block = nlmsvc_lookup_block(file, lock);
512 up(&file->f_sema); 494 mutex_unlock(&file->f_mutex);
513 if (block != NULL) { 495 if (block != NULL) {
514 status = nlmsvc_unlink_block(block); 496 status = nlmsvc_unlink_block(block);
515 nlmsvc_release_block(block); 497 nlmsvc_release_block(block);
@@ -527,10 +509,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
527static void 509static void
528nlmsvc_notify_blocked(struct file_lock *fl) 510nlmsvc_notify_blocked(struct file_lock *fl)
529{ 511{
530 struct nlm_block **bp, *block; 512 struct nlm_block *block;
531 513
532 dprintk("lockd: VFS unblock notification for block %p\n", fl); 514 dprintk("lockd: VFS unblock notification for block %p\n", fl);
533 for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) { 515 list_for_each_entry(block, &nlm_blocked, b_list) {
534 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { 516 if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
535 nlmsvc_insert_block(block, 0); 517 nlmsvc_insert_block(block, 0);
536 svc_wake_up(block->b_daemon); 518 svc_wake_up(block->b_daemon);
@@ -663,17 +645,14 @@ static const struct rpc_call_ops nlmsvc_grant_ops = {
663 * block. 645 * block.
664 */ 646 */
665void 647void
666nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status) 648nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status)
667{ 649{
668 struct nlm_block *block; 650 struct nlm_block *block;
669 struct nlm_file *file;
670 651
671 dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", 652 dprintk("grant_reply: looking for cookie %x, s=%d \n",
672 *(unsigned int *)(cookie->data), 653 *(unsigned int *)(cookie->data), status);
673 ntohl(rqstp->rq_addr.sin_addr.s_addr), status); 654 if (!(block = nlmsvc_find_block(cookie)))
674 if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr)))
675 return; 655 return;
676 file = block->b_file;
677 656
678 if (block) { 657 if (block) {
679 if (status == NLM_LCK_DENIED_GRACE_PERIOD) { 658 if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
@@ -696,16 +675,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
696unsigned long 675unsigned long
697nlmsvc_retry_blocked(void) 676nlmsvc_retry_blocked(void)
698{ 677{
699 struct nlm_block *block; 678 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
679 struct nlm_block *block;
680
681 while (!list_empty(&nlm_blocked)) {
682 block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
700 683
701 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
702 nlm_blocked,
703 nlm_blocked? nlm_blocked->b_when : 0);
704 while ((block = nlm_blocked) != 0) {
705 if (block->b_when == NLM_NEVER) 684 if (block->b_when == NLM_NEVER)
706 break; 685 break;
707 if (time_after(block->b_when,jiffies)) 686 if (time_after(block->b_when,jiffies)) {
687 timeout = block->b_when - jiffies;
708 break; 688 break;
689 }
690
709 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", 691 dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
710 block, block->b_when); 692 block, block->b_when);
711 kref_get(&block->b_count); 693 kref_get(&block->b_count);
@@ -713,8 +695,5 @@ nlmsvc_retry_blocked(void)
713 nlmsvc_release_block(block); 695 nlmsvc_release_block(block);
714 } 696 }
715 697
716 if ((block = nlm_blocked) && block->b_when != NLM_NEVER) 698 return timeout;
717 return (block->b_when - jiffies);
718
719 return MAX_SCHEDULE_TIMEOUT;
720} 699}
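The list rewrite pays off here: because nlm_blocked is sorted, nlmsvc_retry_blocked() just pops expired entries off the head and can report exactly how long lockd's main loop may sleep before the next deadline, instead of the old rescan-and-guess over a hand-rolled chain. The control flow in miniature (userspace model; retry() is expected to unlink or requeue the entry, as the grant path does in the kernel):

#include <limits.h>

struct node { struct node *prev, *next; long when; };

static long retry_expired(struct node *head, long now, long never,
			  void (*retry)(struct node *))
{
	while (head->next != head) {
		struct node *b = head->next;	/* earliest deadline first */

		if (b->when == never)
			break;			/* only unexpirable entries left */
		if (b->when > now)
			return b->when - now;	/* sleep until this one fires */
		retry(b);	/* must unlink or requeue b, or we spin */
	}
	return LONG_MAX;	/* nothing pending: sleep indefinitely */
}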
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index dbb66a3b5cd9..32e99a6e8dca 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -22,8 +22,8 @@
22#define NLMDBG_FACILITY NLMDBG_CLIENT 22#define NLMDBG_FACILITY NLMDBG_CLIENT
23 23
24#ifdef CONFIG_LOCKD_V4 24#ifdef CONFIG_LOCKD_V4
25static u32 25static __be32
26cast_to_nlm(u32 status, u32 vers) 26cast_to_nlm(__be32 status, u32 vers)
27{ 27{
28 /* Note: status is assumed to be in network byte order !!! */ 28 /* Note: status is assumed to be in network byte order !!! */
29 if (vers != 4){ 29 if (vers != 4){
@@ -52,22 +52,22 @@ cast_to_nlm(u32 status, u32 vers)
52/* 52/*
53 * Obtain client and file from arguments 53 * Obtain client and file from arguments
54 */ 54 */
55static u32 55static __be32
56nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, 56nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
57 struct nlm_host **hostp, struct nlm_file **filp) 57 struct nlm_host **hostp, struct nlm_file **filp)
58{ 58{
59 struct nlm_host *host = NULL; 59 struct nlm_host *host = NULL;
60 struct nlm_file *file = NULL; 60 struct nlm_file *file = NULL;
61 struct nlm_lock *lock = &argp->lock; 61 struct nlm_lock *lock = &argp->lock;
62 u32 error; 62 __be32 error = 0;
63 63
64 /* nfsd callbacks must have been installed for this procedure */ 64 /* nfsd callbacks must have been installed for this procedure */
65 if (!nlmsvc_ops) 65 if (!nlmsvc_ops)
66 return nlm_lck_denied_nolocks; 66 return nlm_lck_denied_nolocks;
67 67
68 /* Obtain host handle */ 68 /* Obtain host handle */
69 if (!(host = nlmsvc_lookup_host(rqstp)) 69 if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
70 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) 70 || (argp->monitor && nsm_monitor(host) < 0))
71 goto no_locks; 71 goto no_locks;
72 *hostp = host; 72 *hostp = host;
73 73
@@ -88,13 +88,15 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
88no_locks: 88no_locks:
89 if (host) 89 if (host)
90 nlm_release_host(host); 90 nlm_release_host(host);
91 if (error)
92 return error;
91 return nlm_lck_denied_nolocks; 93 return nlm_lck_denied_nolocks;
92} 94}
93 95
94/* 96/*
95 * NULL: Test for presence of service 97 * NULL: Test for presence of service
96 */ 98 */
97static int 99static __be32
98nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 100nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
99{ 101{
100 dprintk("lockd: NULL called\n"); 102 dprintk("lockd: NULL called\n");
@@ -104,7 +106,7 @@ nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
104/* 106/*
105 * TEST: Check for conflicting lock 107 * TEST: Check for conflicting lock
106 */ 108 */
107static int 109static __be32
108nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, 110nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
109 struct nlm_res *resp) 111 struct nlm_res *resp)
110{ 112{
@@ -122,7 +124,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
122 124
123 /* Obtain client and file */ 125 /* Obtain client and file */
124 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 126 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
125 return rpc_success; 127 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
126 128
127 /* Now check for conflicting locks */ 129 /* Now check for conflicting locks */
128 resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock)); 130 resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
@@ -134,7 +136,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
134 return rpc_success; 136 return rpc_success;
135} 137}
136 138
137static int 139static __be32
138nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 140nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
139 struct nlm_res *resp) 141 struct nlm_res *resp)
140{ 142{
@@ -153,7 +155,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
153 155
154 /* Obtain client and file */ 156 /* Obtain client and file */
155 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 157 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
156 return rpc_success; 158 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
157 159
158#if 0 160#if 0
159 /* If supplied state doesn't match current state, we assume it's 161 /* If supplied state doesn't match current state, we assume it's
@@ -177,7 +179,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
177 return rpc_success; 179 return rpc_success;
178} 180}
179 181
180static int 182static __be32
181nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, 183nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
182 struct nlm_res *resp) 184 struct nlm_res *resp)
183{ 185{
@@ -196,7 +198,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
196 198
197 /* Obtain client and file */ 199 /* Obtain client and file */
198 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 200 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
199 return rpc_success; 201 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
200 202
201 /* Try to cancel request. */ 203 /* Try to cancel request. */
202 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock)); 204 resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
@@ -210,7 +212,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
210/* 212/*
211 * UNLOCK: release a lock 213 * UNLOCK: release a lock
212 */ 214 */
213static int 215static __be32
214nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, 216nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
215 struct nlm_res *resp) 217 struct nlm_res *resp)
216{ 218{
@@ -229,7 +231,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
229 231
230 /* Obtain client and file */ 232 /* Obtain client and file */
231 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 233 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
232 return rpc_success; 234 return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
233 235
234 /* Now try to remove the lock */ 236 /* Now try to remove the lock */
235 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock)); 237 resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
@@ -244,7 +246,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
244 * GRANTED: A server calls us to report that a process' lock request 246 * GRANTED: A server calls us to report that a process' lock request
245 * was granted 247 * was granted
246 */ 248 */
247static int 249static __be32
248nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, 250nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
249 struct nlm_res *resp) 251 struct nlm_res *resp)
250{ 252{
@@ -280,14 +282,16 @@ static const struct rpc_call_ops nlmsvc_callback_ops = {
280 * because we send the callback before the reply proper. I hope this 282 * because we send the callback before the reply proper. I hope this
281 * doesn't break any clients. 283 * doesn't break any clients.
282 */ 284 */
283static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp, 285static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
284 int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *)) 286 __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
285{ 287{
286 struct nlm_host *host; 288 struct nlm_host *host;
287 struct nlm_rqst *call; 289 struct nlm_rqst *call;
288 int stat; 290 __be32 stat;
289 291
290 host = nlmsvc_lookup_host(rqstp); 292 host = nlmsvc_lookup_host(rqstp,
293 argp->lock.caller,
294 argp->lock.len);
291 if (host == NULL) 295 if (host == NULL)
292 return rpc_system_err; 296 return rpc_system_err;
293 297
@@ -307,28 +311,28 @@ static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *ar
307 return rpc_success; 311 return rpc_success;
308} 312}
309 313
310static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 314static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
311 void *resp) 315 void *resp)
312{ 316{
313 dprintk("lockd: TEST_MSG called\n"); 317 dprintk("lockd: TEST_MSG called\n");
314 return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test); 318 return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
315} 319}
316 320
317static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 321static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
318 void *resp) 322 void *resp)
319{ 323{
320 dprintk("lockd: LOCK_MSG called\n"); 324 dprintk("lockd: LOCK_MSG called\n");
321 return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock); 325 return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
322} 326}
323 327
324static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 328static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
325 void *resp) 329 void *resp)
326{ 330{
327 dprintk("lockd: CANCEL_MSG called\n"); 331 dprintk("lockd: CANCEL_MSG called\n");
328 return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel); 332 return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
329} 333}
330 334
331static int 335static __be32
332nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 336nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
333 void *resp) 337 void *resp)
334{ 338{
@@ -336,7 +340,7 @@ nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
336 return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock); 340 return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
337} 341}
338 342
339static int 343static __be32
340nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp, 344nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
341 void *resp) 345 void *resp)
342{ 346{
@@ -347,7 +351,7 @@ nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
347/* 351/*
348 * SHARE: create a DOS share or alter existing share. 352 * SHARE: create a DOS share or alter existing share.
349 */ 353 */
350static int 354static __be32
351nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, 355nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
352 struct nlm_res *resp) 356 struct nlm_res *resp)
353{ 357{
@@ -366,7 +370,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
366 370
367 /* Obtain client and file */ 371 /* Obtain client and file */
368 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 372 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
369 	return rpc_success; 373 	return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;
370 374
371 /* Now try to create the share */ 375 /* Now try to create the share */
372 resp->status = cast_status(nlmsvc_share_file(host, file, argp)); 376 resp->status = cast_status(nlmsvc_share_file(host, file, argp));
@@ -380,7 +384,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
380/* 384/*
381 * UNSHARE: Release a DOS share. 385 * UNSHARE: Release a DOS share.
382 */ 386 */
383static int 387static __be32
384nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, 388nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
385 struct nlm_res *resp) 389 struct nlm_res *resp)
386{ 390{
@@ -399,7 +403,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
399 403
400 /* Obtain client and file */ 404 /* Obtain client and file */
401 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 405 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
402 return rpc_success; 406 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
403 407
404 /* Now try to unshare the file */ 408 /* Now try to unshare the file */
405 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp)); 409 resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
@@ -413,7 +417,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
413/* 417/*
414 * NM_LOCK: Create an unmonitored lock 418 * NM_LOCK: Create an unmonitored lock
415 */ 419 */
416static int 420static __be32
417nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp, 421nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
418 struct nlm_res *resp) 422 struct nlm_res *resp)
419{ 423{
@@ -426,7 +430,7 @@ nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
426/* 430/*
427 * FREE_ALL: Release all locks and shares held by client 431 * FREE_ALL: Release all locks and shares held by client
428 */ 432 */
429static int 433static __be32
430nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp, 434nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
431 void *resp) 435 void *resp)
432{ 436{
@@ -444,14 +448,11 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
444/* 448/*
445 * SM_NOTIFY: private callback from statd (not part of official NLM proto) 449 * SM_NOTIFY: private callback from statd (not part of official NLM proto)
446 */ 450 */
447static int 451static __be32
448nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 452nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
449 void *resp) 453 void *resp)
450{ 454{
451 struct sockaddr_in saddr = rqstp->rq_addr; 455 struct sockaddr_in saddr = rqstp->rq_addr;
452 int vers = argp->vers;
453 int prot = argp->proto >> 1;
454 struct nlm_host *host;
455 456
456 dprintk("lockd: SM_NOTIFY called\n"); 457 dprintk("lockd: SM_NOTIFY called\n");
457 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 458 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -466,19 +467,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
466 /* Obtain the host pointer for this NFS server and try to 467 /* Obtain the host pointer for this NFS server and try to
467 * reclaim all locks we hold on this server. 468 * reclaim all locks we hold on this server.
468 */ 469 */
470 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_addr.s_addr = argp->addr; 471 saddr.sin_addr.s_addr = argp->addr;
470 if ((argp->proto & 1)==0) { 472 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
471 if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
472 nlmclnt_recovery(host, argp->state);
473 nlm_release_host(host);
474 }
475 } else {
476 /* If we run on an NFS server, delete all locks held by the client */
477 if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
478 nlmsvc_free_host_resources(host);
479 nlm_release_host(host);
480 }
481 }
482 473
483 return rpc_success; 474 return rpc_success;
484} 475}
@@ -486,7 +477,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
486/* 477/*
487 * client sent a GRANTED_RES, let's remove the associated block 478 * client sent a GRANTED_RES, let's remove the associated block
488 */ 479 */
489static int 480static __be32
490nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, 481nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
491 void *resp) 482 void *resp)
492{ 483{
@@ -495,7 +486,7 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
495 486
496 dprintk("lockd: GRANTED_RES called\n"); 487 dprintk("lockd: GRANTED_RES called\n");
497 488
498 nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); 489 nlmsvc_grant_reply(&argp->cookie, argp->status);
499 return rpc_success; 490 return rpc_success;
500} 491}
501 492
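
The recurring change in svcproc.c above is that every NLM service procedure now returns __be32 and, when nlmsvc_retrieve_args yields nlm_drop_reply, answers with rpc_drop_reply instead of rpc_success, so no RPC reply goes out while the argument lookup is still pending and the client simply retransmits. A minimal userspace sketch of that dispatch pattern follows; every name and status value in it is an illustrative stand-in, not the kernel's.

    /* Sketch of the "drop the RPC reply" pattern; illustrative names only. */
    #include <stdio.h>

    enum rpc_disposition { RPC_SUCCESS, RPC_DROP_REPLY };

    #define NLM_OK          0
    #define NLM_DROP_REPLY  99      /* stand-in sentinel: lookup still pending */

    static int retrieve_args(int lookup_pending)
    {
            /* Stand-in for nlmsvc_retrieve_args, which can report
             * nlm_drop_reply when the file lookup has not completed;
             * the caller must not answer the call at all. */
            return lookup_pending ? NLM_DROP_REPLY : NLM_OK;
    }

    static enum rpc_disposition proc_lock(int lookup_pending, int *status)
    {
            if ((*status = retrieve_args(lookup_pending)) != NLM_OK)
                    return *status == NLM_DROP_REPLY ? RPC_DROP_REPLY
                                                     : RPC_SUCCESS;
            /* ... perform the lock and set *status accordingly ... */
            return RPC_SUCCESS;
    }

    int main(void)
    {
            int status;

            if (proc_lock(1, &status) == RPC_DROP_REPLY)
                    printf("no reply sent; client will retransmit\n");
            return 0;
    }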
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index 27288c83da96..6220dc2a3f2c 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -23,7 +23,7 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh)
23 && !memcmp(share->s_owner.data, oh->data, oh->len); 23 && !memcmp(share->s_owner.data, oh->data, oh->len);
24} 24}
25 25
26u32 26__be32
27nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file, 27nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
28 struct nlm_args *argp) 28 struct nlm_args *argp)
29{ 29{
@@ -64,7 +64,7 @@ update:
64/* 64/*
65 * Delete a share. 65 * Delete a share.
66 */ 66 */
67u32 67__be32
68nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, 68nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
69 struct nlm_args *argp) 69 struct nlm_args *argp)
70{ 70{
@@ -85,24 +85,20 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
85} 85}
86 86
87/* 87/*
88 * Traverse all shares for a given file (and host). 88 * Traverse all shares for a given file, and delete
89 * NLM_ACT_CHECK is handled by nlmsvc_inspect_file. 89 * those owned by the given (type of) host
90 */ 90 */
91void 91void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file,
92nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action) 92 nlm_host_match_fn_t match)
93{ 93{
94 struct nlm_share *share, **shpp; 94 struct nlm_share *share, **shpp;
95 95
96 shpp = &file->f_shares; 96 shpp = &file->f_shares;
97 while ((share = *shpp) != NULL) { 97 while ((share = *shpp) != NULL) {
98 if (action == NLM_ACT_MARK) 98 if (match(share->s_host, host)) {
99 share->s_host->h_inuse = 1; 99 *shpp = share->s_next;
100 else if (action == NLM_ACT_UNLOCK) { 100 kfree(share);
101 if (host == NULL || host == share->s_host) { 101 continue;
102 *shpp = share->s_next;
103 kfree(share);
104 continue;
105 }
106 } 102 }
107 shpp = &share->s_next; 103 shpp = &share->s_next;
108 } 104 }
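
nlmsvc_traverse_shares now takes an nlm_host_match_fn_t predicate instead of the old NLM_ACT_* action codes, and unlinks matching entries with the classic pointer-to-pointer list walk. A self-contained sketch of the same idiom; the struct and function names are stand-ins, not lockd's.

    /* Predicate-driven unlink, the *shpp = share->s_next pattern above. */
    #include <stdio.h>
    #include <stdlib.h>

    struct share {
            int owner;
            struct share *next;
    };

    typedef int (*match_fn_t)(const struct share *, int key);

    static int same_owner(const struct share *s, int key)
    {
            return s->owner == key;
    }

    /* Walk with a pointer-to-pointer so a match can be unlinked in place
     * without tracking a separate "previous" node. */
    static void traverse_shares(struct share **head, int key, match_fn_t match)
    {
            struct share *s, **spp = head;

            while ((s = *spp) != NULL) {
                    if (match(s, key)) {
                            *spp = s->next;
                            free(s);
                            continue;
                    }
                    spp = &s->next;
            }
    }

    int main(void)
    {
            struct share *head = NULL;

            for (int i = 0; i < 4; i++) {
                    struct share *s = malloc(sizeof(*s));
                    s->owner = i % 2;
                    s->next = head;
                    head = s;
            }
            traverse_shares(&head, 1, same_owner);  /* drop owner 1's shares */
            for (struct share *s = head; s; s = s->next)
                    printf("owner %d survived\n", s->owner);
            return 0;
    }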
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a92dd98f8401..e83024e16042 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -25,9 +25,9 @@
25/* 25/*
26 * Global file hash table 26 * Global file hash table
27 */ 27 */
28#define FILE_HASH_BITS 5 28#define FILE_HASH_BITS 7
29#define FILE_NRHASH (1<<FILE_HASH_BITS) 29#define FILE_NRHASH (1<<FILE_HASH_BITS)
30static struct nlm_file * nlm_files[FILE_NRHASH]; 30static struct hlist_head nlm_files[FILE_NRHASH];
31static DEFINE_MUTEX(nlm_file_mutex); 31static DEFINE_MUTEX(nlm_file_mutex);
32 32
33#ifdef NFSD_DEBUG 33#ifdef NFSD_DEBUG
@@ -78,13 +78,14 @@ static inline unsigned int file_hash(struct nfs_fh *f)
78 * This is not quite right, but for now, we assume the client performs 78 * This is not quite right, but for now, we assume the client performs
79 * the proper R/W checking. 79 * the proper R/W checking.
80 */ 80 */
81u32 81__be32
82nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, 82nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
83 struct nfs_fh *f) 83 struct nfs_fh *f)
84{ 84{
85 struct hlist_node *pos;
85 struct nlm_file *file; 86 struct nlm_file *file;
86 unsigned int hash; 87 unsigned int hash;
87 u32 nfserr; 88 __be32 nfserr;
88 89
89 nlm_debug_print_fh("nlm_file_lookup", f); 90 nlm_debug_print_fh("nlm_file_lookup", f);
90 91
@@ -93,7 +94,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
93 /* Lock file table */ 94 /* Lock file table */
94 mutex_lock(&nlm_file_mutex); 95 mutex_lock(&nlm_file_mutex);
95 96
96 for (file = nlm_files[hash]; file; file = file->f_next) 97 hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
97 if (!nfs_compare_fh(&file->f_handle, f)) 98 if (!nfs_compare_fh(&file->f_handle, f))
98 goto found; 99 goto found;
99 100
@@ -105,8 +106,9 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
105 goto out_unlock; 106 goto out_unlock;
106 107
107 memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); 108 memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
108 file->f_hash = hash; 109 mutex_init(&file->f_mutex);
109 init_MUTEX(&file->f_sema); 110 INIT_HLIST_NODE(&file->f_list);
111 INIT_LIST_HEAD(&file->f_blocks);
110 112
111 /* Open the file. Note that this must not sleep for too long, else 113 /* Open the file. Note that this must not sleep for too long, else
112 * we would lock up lockd:-) So no NFS re-exports, folks. 114 * we would lock up lockd:-) So no NFS re-exports, folks.
@@ -115,12 +117,11 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
115 * the file. 117 * the file.
116 */ 118 */
117 if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { 119 if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
118 dprintk("lockd: open failed (nfserr %d)\n", ntohl(nfserr)); 120 dprintk("lockd: open failed (error %d)\n", nfserr);
119 goto out_free; 121 goto out_free;
120 } 122 }
121 123
122 file->f_next = nlm_files[hash]; 124 hlist_add_head(&file->f_list, &nlm_files[hash]);
123 nlm_files[hash] = file;
124 125
125found: 126found:
126 dprintk("lockd: found file %p (count %d)\n", file, file->f_count); 127 dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
@@ -134,12 +135,6 @@ out_unlock:
134 135
135out_free: 136out_free:
136 kfree(file); 137 kfree(file);
137#ifdef CONFIG_LOCKD_V4
138 if (nfserr == 1)
139 nfserr = nlm4_stale_fh;
140 else
141#endif
142 nfserr = nlm_lck_denied;
143 goto out_unlock; 138 goto out_unlock;
144} 139}
145 140
@@ -149,22 +144,14 @@ out_free:
149static inline void 144static inline void
150nlm_delete_file(struct nlm_file *file) 145nlm_delete_file(struct nlm_file *file)
151{ 146{
152 struct nlm_file **fp, *f;
153
154 nlm_debug_print_file("closing file", file); 147 nlm_debug_print_file("closing file", file);
155 148 if (!hlist_unhashed(&file->f_list)) {
156 fp = nlm_files + file->f_hash; 149 hlist_del(&file->f_list);
157 while ((f = *fp) != NULL) { 150 nlmsvc_ops->fclose(file->f_file);
158 if (f == file) { 151 kfree(file);
159 *fp = file->f_next; 152 } else {
160 nlmsvc_ops->fclose(file->f_file); 153 printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
161 kfree(file);
162 return;
163 }
164 fp = &f->f_next;
165 } 154 }
166
167 printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
168} 155}
169 156
170/* 157/*
@@ -172,7 +159,8 @@ nlm_delete_file(struct nlm_file *file)
172 * action. 159 * action.
173 */ 160 */
174static int 161static int
175nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action) 162nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
163 nlm_host_match_fn_t match)
176{ 164{
177 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
178 struct file_lock *fl; 166 struct file_lock *fl;
@@ -186,17 +174,11 @@ again:
186 174
187 /* update current lock count */ 175 /* update current lock count */
188 file->f_locks++; 176 file->f_locks++;
177
189 lockhost = (struct nlm_host *) fl->fl_owner; 178 lockhost = (struct nlm_host *) fl->fl_owner;
190 if (action == NLM_ACT_MARK) 179 if (match(lockhost, host)) {
191 lockhost->h_inuse = 1;
192 else if (action == NLM_ACT_CHECK)
193 return 1;
194 else if (action == NLM_ACT_UNLOCK) {
195 struct file_lock lock = *fl; 180 struct file_lock lock = *fl;
196 181
197 if (host && lockhost != host)
198 continue;
199
200 lock.fl_type = F_UNLCK; 182 lock.fl_type = F_UNLCK;
201 lock.fl_start = 0; 183 lock.fl_start = 0;
202 lock.fl_end = OFFSET_MAX; 184 lock.fl_end = OFFSET_MAX;
@@ -213,53 +195,66 @@ again:
213} 195}
214 196
215/* 197/*
216 * Operate on a single file 198 * Inspect a single file
199 */
200static inline int
201nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
202{
203 nlmsvc_traverse_blocks(host, file, match);
204 nlmsvc_traverse_shares(host, file, match);
205 return nlm_traverse_locks(host, file, match);
206}
207
208/*
209 * Quick check whether there are still any locks, blocks or
210 * shares on a given file.
217 */ 211 */
218static inline int 212static inline int
219nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action) 213nlm_file_inuse(struct nlm_file *file)
220{ 214{
221 if (action == NLM_ACT_CHECK) { 215 struct inode *inode = nlmsvc_file_inode(file);
222 /* Fast path for mark and sweep garbage collection */ 216 struct file_lock *fl;
223 if (file->f_count || file->f_blocks || file->f_shares) 217
218 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
219 return 1;
220
221 for (fl = inode->i_flock; fl; fl = fl->fl_next) {
222 if (fl->fl_lmops == &nlmsvc_lock_operations)
224 return 1; 223 return 1;
225 } else {
226 nlmsvc_traverse_blocks(host, file, action);
227 nlmsvc_traverse_shares(host, file, action);
228 } 224 }
229 return nlm_traverse_locks(host, file, action); 225 file->f_locks = 0;
226 return 0;
230} 227}
231 228
232/* 229/*
233 * Loop over all files in the file table. 230 * Loop over all files in the file table.
234 */ 231 */
235static int 232static int
236nlm_traverse_files(struct nlm_host *host, int action) 233nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match)
237{ 234{
238 struct nlm_file *file, **fp; 235 struct hlist_node *pos, *next;
236 struct nlm_file *file;
239 int i, ret = 0; 237 int i, ret = 0;
240 238
241 mutex_lock(&nlm_file_mutex); 239 mutex_lock(&nlm_file_mutex);
242 for (i = 0; i < FILE_NRHASH; i++) { 240 for (i = 0; i < FILE_NRHASH; i++) {
243 fp = nlm_files + i; 241 hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
244 while ((file = *fp) != NULL) {
245 file->f_count++; 242 file->f_count++;
246 mutex_unlock(&nlm_file_mutex); 243 mutex_unlock(&nlm_file_mutex);
247 244
248 /* Traverse locks, blocks and shares of this file 245 /* Traverse locks, blocks and shares of this file
249 * and update file->f_locks count */ 246 * and update file->f_locks count */
250 if (nlm_inspect_file(host, file, action)) 247 if (nlm_inspect_file(host, file, match))
251 ret = 1; 248 ret = 1;
252 249
253 mutex_lock(&nlm_file_mutex); 250 mutex_lock(&nlm_file_mutex);
254 file->f_count--; 251 file->f_count--;
255 /* No more references to this file. Let go of it. */ 252 /* No more references to this file. Let go of it. */
256 if (!file->f_blocks && !file->f_locks 253 if (list_empty(&file->f_blocks) && !file->f_locks
257 && !file->f_shares && !file->f_count) { 254 && !file->f_shares && !file->f_count) {
258 *fp = file->f_next; 255 hlist_del(&file->f_list);
259 nlmsvc_ops->fclose(file->f_file); 256 nlmsvc_ops->fclose(file->f_file);
260 kfree(file); 257 kfree(file);
261 } else {
262 fp = &file->f_next;
263 } 258 }
264 } 259 }
265 } 260 }
@@ -286,23 +281,63 @@ nlm_release_file(struct nlm_file *file)
286 mutex_lock(&nlm_file_mutex); 281 mutex_lock(&nlm_file_mutex);
287 282
288 /* If there are no more locks etc, delete the file */ 283 /* If there are no more locks etc, delete the file */
289 if(--file->f_count == 0) { 284 if (--file->f_count == 0 && !nlm_file_inuse(file))
290 if(!nlm_inspect_file(NULL, file, NLM_ACT_CHECK)) 285 nlm_delete_file(file);
291 nlm_delete_file(file);
292 }
293 286
294 mutex_unlock(&nlm_file_mutex); 287 mutex_unlock(&nlm_file_mutex);
295} 288}
296 289
297/* 290/*
291 * Helper functions for resource traversal
292 *
293 * nlmsvc_mark_host:
294 * used by the garbage collector; simply sets h_inuse.
295 * Always returns 0.
296 *
297 * nlmsvc_same_host:
298 * returns 1 iff the two hosts match. Used to release
299 * all resources bound to a specific host.
300 *
301 * nlmsvc_is_client:
302 * returns 1 iff the host is a client.
303 *	Used by nlmsvc_invalidate_all.
304 */
305static int
306nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy)
307{
308 host->h_inuse = 1;
309 return 0;
310}
311
312static int
313nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other)
314{
315 return host == other;
316}
317
318static int
319nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy)
320{
321 if (host->h_server) {
322 /* we are destroying locks even though the client
323		 * hasn't asked us to, so don't unmonitor the
324 * client
325 */
326 if (host->h_nsmhandle)
327 host->h_nsmhandle->sm_sticky = 1;
328 return 1;
329 } else
330 return 0;
331}
332
333/*
298 * Mark all hosts that still hold resources 334 * Mark all hosts that still hold resources
299 */ 335 */
300void 336void
301nlmsvc_mark_resources(void) 337nlmsvc_mark_resources(void)
302{ 338{
303 dprintk("lockd: nlmsvc_mark_resources\n"); 339 dprintk("lockd: nlmsvc_mark_resources\n");
304 340 nlm_traverse_files(NULL, nlmsvc_mark_host);
305 nlm_traverse_files(NULL, NLM_ACT_MARK);
306} 341}
307 342
308/* 343/*
@@ -313,23 +348,25 @@ nlmsvc_free_host_resources(struct nlm_host *host)
313{ 348{
314 dprintk("lockd: nlmsvc_free_host_resources\n"); 349 dprintk("lockd: nlmsvc_free_host_resources\n");
315 350
316 if (nlm_traverse_files(host, NLM_ACT_UNLOCK)) 351 if (nlm_traverse_files(host, nlmsvc_same_host)) {
317 printk(KERN_WARNING 352 printk(KERN_WARNING
318 "lockd: couldn't remove all locks held by %s", 353 "lockd: couldn't remove all locks held by %s\n",
319 host->h_name); 354 host->h_name);
355 BUG();
356 }
320} 357}
321 358
322/* 359/*
323 * delete all hosts structs for clients 360 * Remove all locks held for clients
324 */ 361 */
325void 362void
326nlmsvc_invalidate_all(void) 363nlmsvc_invalidate_all(void)
327{ 364{
328 struct nlm_host *host; 365 /* Release all locks held by NFS clients.
329 while ((host = nlm_find_client()) != NULL) { 366 * Previously, the code would call
330 nlmsvc_free_host_resources(host); 367 * nlmsvc_free_host_resources for each client in
331 host->h_expires = 0; 368 * turn, which is about as inefficient as it gets.
332 host->h_killed = 1; 369 * Now we just do it once in nlm_traverse_files.
333 nlm_release_host(host); 370 */
334 } 371 nlm_traverse_files(NULL, nlmsvc_is_client);
335} 372}
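
The svcsubs.c hunks convert nlm_files from hand-rolled singly linked chains (f_hash plus f_next) to hlist buckets, which is why nlm_delete_file no longer has to rescan its chain: an hlist node carries a pprev back-pointer, so unlinking is O(1). A standalone re-implementation of just that idea, not the kernel's <linux/list.h>:

    /* Minimal hlist: pprev holds the address of whatever points at us. */
    #include <stdio.h>
    #include <stddef.h>

    struct hnode {
            struct hnode *next;
            struct hnode **pprev;
    };

    static void hlist_add_head(struct hnode *n, struct hnode **head)
    {
            n->next = *head;
            if (*head)
                    (*head)->pprev = &n->next;
            *head = n;
            n->pprev = head;
    }

    static void hlist_del(struct hnode *n)
    {
            *n->pprev = n->next;            /* O(1), no walk from the head */
            if (n->next)
                    n->next->pprev = n->pprev;
    }

    int main(void)
    {
            struct hnode a = {0}, b = {0};
            struct hnode *bucket = NULL;

            hlist_add_head(&a, &bucket);
            hlist_add_head(&b, &bucket);    /* bucket -> b -> a */
            hlist_del(&b);                  /* constant-time unlink */
            printf("head is %s\n", bucket == &a ? "a" : "b");
            return 0;
    }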
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 61c46facf257..b7c949256e5a 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -43,7 +43,7 @@ loff_t_to_s32(loff_t offset)
43/* 43/*
44 * XDR functions for basic NLM types 44 * XDR functions for basic NLM types
45 */ 45 */
46static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c) 46static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
47{ 47{
48 unsigned int len; 48 unsigned int len;
49 49
@@ -69,8 +69,8 @@ static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
69 return p; 69 return p;
70} 70}
71 71
72static inline u32 * 72static inline __be32 *
73nlm_encode_cookie(u32 *p, struct nlm_cookie *c) 73nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
74{ 74{
75 *p++ = htonl(c->len); 75 *p++ = htonl(c->len);
76 memcpy(p, c->data, c->len); 76 memcpy(p, c->data, c->len);
@@ -78,8 +78,8 @@ nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
78 return p; 78 return p;
79} 79}
80 80
81static u32 * 81static __be32 *
82nlm_decode_fh(u32 *p, struct nfs_fh *f) 82nlm_decode_fh(__be32 *p, struct nfs_fh *f)
83{ 83{
84 unsigned int len; 84 unsigned int len;
85 85
@@ -95,8 +95,8 @@ nlm_decode_fh(u32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline u32 * 98static inline __be32 *
99nlm_encode_fh(u32 *p, struct nfs_fh *f) 99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{ 100{
101 *p++ = htonl(NFS2_FHSIZE); 101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE); 102 memcpy(p, f->data, NFS2_FHSIZE);
@@ -106,20 +106,20 @@ nlm_encode_fh(u32 *p, struct nfs_fh *f)
106/* 106/*
107 * Encode and decode owner handle 107 * Encode and decode owner handle
108 */ 108 */
109static inline u32 * 109static inline __be32 *
110nlm_decode_oh(u32 *p, struct xdr_netobj *oh) 110nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
111{ 111{
112 return xdr_decode_netobj(p, oh); 112 return xdr_decode_netobj(p, oh);
113} 113}
114 114
115static inline u32 * 115static inline __be32 *
116nlm_encode_oh(u32 *p, struct xdr_netobj *oh) 116nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
117{ 117{
118 return xdr_encode_netobj(p, oh); 118 return xdr_encode_netobj(p, oh);
119} 119}
120 120
121static u32 * 121static __be32 *
122nlm_decode_lock(u32 *p, struct nlm_lock *lock) 122nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
123{ 123{
124 struct file_lock *fl = &lock->fl; 124 struct file_lock *fl = &lock->fl;
125 s32 start, len, end; 125 s32 start, len, end;
@@ -153,8 +153,8 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock)
153/* 153/*
154 * Encode a lock as part of an NLM call 154 * Encode a lock as part of an NLM call
155 */ 155 */
156static u32 * 156static __be32 *
157nlm_encode_lock(u32 *p, struct nlm_lock *lock) 157nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
158{ 158{
159 struct file_lock *fl = &lock->fl; 159 struct file_lock *fl = &lock->fl;
160 __s32 start, len; 160 __s32 start, len;
@@ -184,8 +184,8 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock)
184/* 184/*
185 * Encode result of a TEST/TEST_MSG call 185 * Encode result of a TEST/TEST_MSG call
186 */ 186 */
187static u32 * 187static __be32 *
188nlm_encode_testres(u32 *p, struct nlm_res *resp) 188nlm_encode_testres(__be32 *p, struct nlm_res *resp)
189{ 189{
190 s32 start, len; 190 s32 start, len;
191 191
@@ -221,7 +221,7 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp)
221 * First, the server side XDR functions 221 * First, the server side XDR functions
222 */ 222 */
223int 223int
224nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 224nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
225{ 225{
226 u32 exclusive; 226 u32 exclusive;
227 227
@@ -238,7 +238,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
238} 238}
239 239
240int 240int
241nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 241nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
242{ 242{
243 if (!(p = nlm_encode_testres(p, resp))) 243 if (!(p = nlm_encode_testres(p, resp)))
244 return 0; 244 return 0;
@@ -246,7 +246,7 @@ nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
246} 246}
247 247
248int 248int
249nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 249nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
250{ 250{
251 u32 exclusive; 251 u32 exclusive;
252 252
@@ -266,7 +266,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
266} 266}
267 267
268int 268int
269nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 269nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
270{ 270{
271 u32 exclusive; 271 u32 exclusive;
272 272
@@ -282,7 +282,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
282} 282}
283 283
284int 284int
285nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 285nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
286{ 286{
287 if (!(p = nlm_decode_cookie(p, &argp->cookie)) 287 if (!(p = nlm_decode_cookie(p, &argp->cookie))
288 || !(p = nlm_decode_lock(p, &argp->lock))) 288 || !(p = nlm_decode_lock(p, &argp->lock)))
@@ -292,7 +292,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
292} 292}
293 293
294int 294int
295nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 295nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
296{ 296{
297 struct nlm_lock *lock = &argp->lock; 297 struct nlm_lock *lock = &argp->lock;
298 298
@@ -313,7 +313,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
313} 313}
314 314
315int 315int
316nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 316nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
317{ 317{
318 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 318 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
319 return 0; 319 return 0;
@@ -323,7 +323,7 @@ nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
323} 323}
324 324
325int 325int
326nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 326nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
327{ 327{
328 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 328 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
329 return 0; 329 return 0;
@@ -332,7 +332,7 @@ nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
332} 332}
333 333
334int 334int
335nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp) 335nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
336{ 336{
337 struct nlm_lock *lock = &argp->lock; 337 struct nlm_lock *lock = &argp->lock;
338 338
@@ -344,7 +344,7 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
344} 344}
345 345
346int 346int
347nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) 347nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
348{ 348{
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 350 return 0;
@@ -357,7 +357,7 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
357} 357}
358 358
359int 359int
360nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 360nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
361{ 361{
362 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 362 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
363 return 0; 363 return 0;
@@ -366,13 +366,13 @@ nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
366} 366}
367 367
368int 368int
369nlmsvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 369nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
370{ 370{
371 return xdr_argsize_check(rqstp, p); 371 return xdr_argsize_check(rqstp, p);
372} 372}
373 373
374int 374int
375nlmsvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 375nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
376{ 376{
377 return xdr_ressize_check(rqstp, p); 377 return xdr_ressize_check(rqstp, p);
378} 378}
@@ -389,7 +389,7 @@ nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
389#endif 389#endif
390 390
391static int 391static int
392nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 392nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
393{ 393{
394 struct nlm_lock *lock = &argp->lock; 394 struct nlm_lock *lock = &argp->lock;
395 395
@@ -403,7 +403,7 @@ nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
403} 403}
404 404
405static int 405static int
406nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 406nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
407{ 407{
408 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 408 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
409 return -EIO; 409 return -EIO;
@@ -438,7 +438,7 @@ nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
438 438
439 439
440static int 440static int
441nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 441nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
442{ 442{
443 struct nlm_lock *lock = &argp->lock; 443 struct nlm_lock *lock = &argp->lock;
444 444
@@ -455,7 +455,7 @@ nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
455} 455}
456 456
457static int 457static int
458nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 458nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
459{ 459{
460 struct nlm_lock *lock = &argp->lock; 460 struct nlm_lock *lock = &argp->lock;
461 461
@@ -470,7 +470,7 @@ nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
470} 470}
471 471
472static int 472static int
473nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 473nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
474{ 474{
475 struct nlm_lock *lock = &argp->lock; 475 struct nlm_lock *lock = &argp->lock;
476 476
@@ -483,7 +483,7 @@ nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
483} 483}
484 484
485static int 485static int
486nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 486nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
487{ 487{
488 if (!(p = nlm_encode_cookie(p, &resp->cookie))) 488 if (!(p = nlm_encode_cookie(p, &resp->cookie)))
489 return -EIO; 489 return -EIO;
@@ -493,7 +493,7 @@ nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
493} 493}
494 494
495static int 495static int
496nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 496nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
497{ 497{
498 if (!(p = nlm_encode_testres(p, resp))) 498 if (!(p = nlm_encode_testres(p, resp)))
499 return -EIO; 499 return -EIO;
@@ -502,7 +502,7 @@ nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
502} 502}
503 503
504static int 504static int
505nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 505nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
506{ 506{
507 if (!(p = nlm_decode_cookie(p, &resp->cookie))) 507 if (!(p = nlm_decode_cookie(p, &resp->cookie)))
508 return -EIO; 508 return -EIO;
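
Nearly every xdr.c hunk retypes on-the-wire words from u32 to __be32, sparse's "bitwise" big-endian type: under make C=1, mixing a host-order value into a wire word becomes a static warning, while ordinary compilers see a plain integer. A hedged standalone sketch of the annotation follows; note the kernel's htonl() already returns __be32, and kernel code uses __force where a raw cast is genuinely intended, so the cast below is only needed in this userspace stand-in.

    /* What u32 -> __be32 buys: a distinct type for network-order words. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #ifdef __CHECKER__
    #define __bitwise __attribute__((bitwise))
    #else
    #define __bitwise
    #endif

    typedef uint32_t __bitwise be32;

    static be32 encode_len(uint32_t host_len)
    {
            return (be32)htonl(host_len);   /* explicit host -> wire step */
    }

    int main(void)
    {
            be32 wire = encode_len(24);
            /* "wire = 24;" would be flagged by sparse: host value in a be32 */
            printf("wire word: 0x%08x\n", (unsigned)wire);
            return 0;
    }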
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 36eb175ec335..f4c0b2b9f75a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -44,8 +44,8 @@ loff_t_to_s64(loff_t offset)
44/* 44/*
45 * XDR functions for basic NLM types 45 * XDR functions for basic NLM types
46 */ 46 */
47static u32 * 47static __be32 *
48nlm4_decode_cookie(u32 *p, struct nlm_cookie *c) 48nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
49{ 49{
50 unsigned int len; 50 unsigned int len;
51 51
@@ -71,8 +71,8 @@ nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
71 return p; 71 return p;
72} 72}
73 73
74static u32 * 74static __be32 *
75nlm4_encode_cookie(u32 *p, struct nlm_cookie *c) 75nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
76{ 76{
77 *p++ = htonl(c->len); 77 *p++ = htonl(c->len);
78 memcpy(p, c->data, c->len); 78 memcpy(p, c->data, c->len);
@@ -80,8 +80,8 @@ nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
80 return p; 80 return p;
81} 81}
82 82
83static u32 * 83static __be32 *
84nlm4_decode_fh(u32 *p, struct nfs_fh *f) 84nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
85{ 85{
86 memset(f->data, 0, sizeof(f->data)); 86 memset(f->data, 0, sizeof(f->data));
87 f->size = ntohl(*p++); 87 f->size = ntohl(*p++);
@@ -95,8 +95,8 @@ nlm4_decode_fh(u32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(f->size); 95 return p + XDR_QUADLEN(f->size);
96} 96}
97 97
98static u32 * 98static __be32 *
99nlm4_encode_fh(u32 *p, struct nfs_fh *f) 99nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
100{ 100{
101 *p++ = htonl(f->size); 101 *p++ = htonl(f->size);
102 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */ 102 if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
@@ -107,20 +107,20 @@ nlm4_encode_fh(u32 *p, struct nfs_fh *f)
107/* 107/*
108 * Encode and decode owner handle 108 * Encode and decode owner handle
109 */ 109 */
110static u32 * 110static __be32 *
111nlm4_decode_oh(u32 *p, struct xdr_netobj *oh) 111nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
112{ 112{
113 return xdr_decode_netobj(p, oh); 113 return xdr_decode_netobj(p, oh);
114} 114}
115 115
116static u32 * 116static __be32 *
117nlm4_encode_oh(u32 *p, struct xdr_netobj *oh) 117nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
118{ 118{
119 return xdr_encode_netobj(p, oh); 119 return xdr_encode_netobj(p, oh);
120} 120}
121 121
122static u32 * 122static __be32 *
123nlm4_decode_lock(u32 *p, struct nlm_lock *lock) 123nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
124{ 124{
125 struct file_lock *fl = &lock->fl; 125 struct file_lock *fl = &lock->fl;
126 __s64 len, start, end; 126 __s64 len, start, end;
@@ -153,8 +153,8 @@ nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
153/* 153/*
154 * Encode a lock as part of an NLM call 154 * Encode a lock as part of an NLM call
155 */ 155 */
156static u32 * 156static __be32 *
157nlm4_encode_lock(u32 *p, struct nlm_lock *lock) 157nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
158{ 158{
159 struct file_lock *fl = &lock->fl; 159 struct file_lock *fl = &lock->fl;
160 __s64 start, len; 160 __s64 start, len;
@@ -185,8 +185,8 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
185/* 185/*
186 * Encode result of a TEST/TEST_MSG call 186 * Encode result of a TEST/TEST_MSG call
187 */ 187 */
188static u32 * 188static __be32 *
189nlm4_encode_testres(u32 *p, struct nlm_res *resp) 189nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
190{ 190{
191 s64 start, len; 191 s64 start, len;
192 192
@@ -227,7 +227,7 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
227 * First, the server side XDR functions 227 * First, the server side XDR functions
228 */ 228 */
229int 229int
230nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 230nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
231{ 231{
232 u32 exclusive; 232 u32 exclusive;
233 233
@@ -244,7 +244,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
244} 244}
245 245
246int 246int
247nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 247nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
248{ 248{
249 if (!(p = nlm4_encode_testres(p, resp))) 249 if (!(p = nlm4_encode_testres(p, resp)))
250 return 0; 250 return 0;
@@ -252,7 +252,7 @@ nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
252} 252}
253 253
254int 254int
255nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 255nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
256{ 256{
257 u32 exclusive; 257 u32 exclusive;
258 258
@@ -272,7 +272,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
272} 272}
273 273
274int 274int
275nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 275nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
276{ 276{
277 u32 exclusive; 277 u32 exclusive;
278 278
@@ -288,7 +288,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
288} 288}
289 289
290int 290int
291nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 291nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
292{ 292{
293 if (!(p = nlm4_decode_cookie(p, &argp->cookie)) 293 if (!(p = nlm4_decode_cookie(p, &argp->cookie))
294 || !(p = nlm4_decode_lock(p, &argp->lock))) 294 || !(p = nlm4_decode_lock(p, &argp->lock)))
@@ -298,7 +298,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
298} 298}
299 299
300int 300int
301nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp) 301nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
302{ 302{
303 struct nlm_lock *lock = &argp->lock; 303 struct nlm_lock *lock = &argp->lock;
304 304
@@ -319,7 +319,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
319} 319}
320 320
321int 321int
322nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 322nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
323{ 323{
324 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 324 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
325 return 0; 325 return 0;
@@ -329,7 +329,7 @@ nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
329} 329}
330 330
331int 331int
332nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 332nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
333{ 333{
334 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 334 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
335 return 0; 335 return 0;
@@ -338,7 +338,7 @@ nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
338} 338}
339 339
340int 340int
341nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp) 341nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
342{ 342{
343 struct nlm_lock *lock = &argp->lock; 343 struct nlm_lock *lock = &argp->lock;
344 344
@@ -350,7 +350,7 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
350} 350}
351 351
352int 352int
353nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) 353nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
354{ 354{
355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
356 return 0; 356 return 0;
@@ -363,7 +363,7 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
363} 363}
364 364
365int 365int
366nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp) 366nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
367{ 367{
368 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 368 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
369 return 0; 369 return 0;
@@ -372,13 +372,13 @@ nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
372} 372}
373 373
374int 374int
375nlm4svc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 375nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
376{ 376{
377 return xdr_argsize_check(rqstp, p); 377 return xdr_argsize_check(rqstp, p);
378} 378}
379 379
380int 380int
381nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy) 381nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
382{ 382{
383 return xdr_ressize_check(rqstp, p); 383 return xdr_ressize_check(rqstp, p);
384} 384}
@@ -388,14 +388,14 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
388 */ 388 */
389#ifdef NLMCLNT_SUPPORT_SHARES 389#ifdef NLMCLNT_SUPPORT_SHARES
390static int 390static int
391nlm4clt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr) 391nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
392{ 392{
393 return 0; 393 return 0;
394} 394}
395#endif 395#endif
396 396
397static int 397static int
398nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 398nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
399{ 399{
400 struct nlm_lock *lock = &argp->lock; 400 struct nlm_lock *lock = &argp->lock;
401 401
@@ -409,7 +409,7 @@ nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
409} 409}
410 410
411static int 411static int
412nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 412nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
413{ 413{
414 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 414 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
415 return -EIO; 415 return -EIO;
@@ -444,7 +444,7 @@ nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
444 444
445 445
446static int 446static int
447nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 447nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
448{ 448{
449 struct nlm_lock *lock = &argp->lock; 449 struct nlm_lock *lock = &argp->lock;
450 450
@@ -461,7 +461,7 @@ nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
461} 461}
462 462
463static int 463static int
464nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 464nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
465{ 465{
466 struct nlm_lock *lock = &argp->lock; 466 struct nlm_lock *lock = &argp->lock;
467 467
@@ -476,7 +476,7 @@ nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
476} 476}
477 477
478static int 478static int
479nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp) 479nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
480{ 480{
481 struct nlm_lock *lock = &argp->lock; 481 struct nlm_lock *lock = &argp->lock;
482 482
@@ -489,7 +489,7 @@ nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
489} 489}
490 490
491static int 491static int
492nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 492nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
493{ 493{
494 if (!(p = nlm4_encode_cookie(p, &resp->cookie))) 494 if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
495 return -EIO; 495 return -EIO;
@@ -499,7 +499,7 @@ nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
499} 499}
500 500
501static int 501static int
502nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 502nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
503{ 503{
504 if (!(p = nlm4_encode_testres(p, resp))) 504 if (!(p = nlm4_encode_testres(p, resp)))
505 return -EIO; 505 return -EIO;
@@ -508,7 +508,7 @@ nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
508} 508}
509 509
510static int 510static int
511nlm4clt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) 511nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
512{ 512{
513 if (!(p = nlm4_decode_cookie(p, &resp->cookie))) 513 if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
514 return -EIO; 514 return -EIO;
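
xdr4.c mirrors the same __be32 retyping for NLM version 4, whose locks carry s64 offsets rather than NLMv1's s32. On the wire a 64-bit XDR "hyper" is two big-endian 32-bit words, most significant word first; a small sketch, with an illustrative helper name:

    /* Encoding a 64-bit XDR hyper as two big-endian words. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    static uint32_t *xdr_encode_hyper_sketch(uint32_t *p, uint64_t val)
    {
            *p++ = htonl((uint32_t)(val >> 32));        /* high word first */
            *p++ = htonl((uint32_t)(val & 0xffffffff));
            return p;
    }

    int main(void)
    {
            uint32_t buf[2];
            uint64_t offset = (1ULL << 40) | 0x1234;    /* needs NLMv4 */

            xdr_encode_hyper_sketch(buf, offset);
            printf("wire: %08x %08x\n", (unsigned)ntohl(buf[0]),
                   (unsigned)ntohl(buf[1]));
            return 0;
    }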
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index c11a4b9fb863..1e36bae4d0eb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -149,12 +149,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
149 return -ENOMEM; 149 return -ENOMEM;
150 s->s_fs_info = sbi; 150 s->s_fs_info = sbi;
151 151
152 	/* N.B. These should be compile-time tests. 152 	BUILD_BUG_ON(32 != sizeof(struct minix_inode));
153 Unfortunately that is impossible. */ 153 BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
154 if (32 != sizeof (struct minix_inode))
155 panic("bad V1 i-node size");
156 if (64 != sizeof(struct minix2_inode))
157 panic("bad V2 i-node size");
158 154
159 if (!sb_set_blocksize(s, BLOCK_SIZE)) 155 if (!sb_set_blocksize(s, BLOCK_SIZE))
160 goto out_bad_hblock; 156 goto out_bad_hblock;
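
The minix hunk replaces two mount-time panic() size checks with BUILD_BUG_ON, turning a wrong on-disk inode layout into a build failure instead of a runtime crash. The trick behind BUILD_BUG_ON of this vintage is a conditionally negative array size; a standalone sketch, with a stand-in struct:

    /* A true condition yields char[-1], so compilation fails. */
    #include <stdio.h>

    #define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

    struct minix_inode_sketch { char raw[32]; };    /* stand-in layout */

    int main(void)
    {
            BUILD_BUG_ON(sizeof(struct minix_inode_sketch) != 32);  /* ok */
            /* BUILD_BUG_ON(sizeof(struct minix_inode_sketch) != 64);
             * would not compile: the array size becomes -1. */
            printf("layout checks passed at build time\n");
            return 0;
    }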
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index a89ac84a8241..589d1eac55c1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -726,7 +726,7 @@ outrel:
726 struct compat_ncp_privatedata_ioctl user32; 726 struct compat_ncp_privatedata_ioctl user32;
727 user32.len = user.len; 727 user32.len = user.len;
728 user32.data = (unsigned long) user.data; 728 user32.data = (unsigned long) user.data;
729 if (copy_to_user(&user32, argp, sizeof(user32))) 729 if (copy_to_user(argp, &user32, sizeof(user32)))
730 return -EFAULT; 730 return -EFAULT;
731 } else 731 } else
732#endif 732#endif
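
The ncpfs one-liner is a direction fix: copy_to_user() takes the user-space destination first and the kernel source second, memcpy-style, and the old code had them swapped, copying into the kernel struct. A userspace stand-in showing the convention; the names below are illustrative.

    /* copy_to_user semantics: dst first, returns bytes NOT copied. */
    #include <stdio.h>
    #include <string.h>

    struct compat_ioctl_sketch { unsigned int len; unsigned long data; };

    static unsigned long fake_copy_to_user(void *user_dst, const void *kern_src,
                                           unsigned long n)
    {
            memcpy(user_dst, kern_src, n);  /* kernel -> user direction */
            return 0;
    }

    int main(void)
    {
            struct compat_ioctl_sketch kern = { .len = 16, .data = 0xdeadbeef };
            struct compat_ioctl_sketch user = { 0 };

            if (fake_copy_to_user(&user, &kern, sizeof(user)))  /* dst, src */
                    return -1;
            printf("user copy: len=%u\n", user.len);
            return 0;
    }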
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5676163d26e8..db3d7919c601 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -31,10 +31,10 @@ struct cb_compound_hdr_arg {
31}; 31};
32 32
33struct cb_compound_hdr_res { 33struct cb_compound_hdr_res {
34 uint32_t *status; 34 __be32 *status;
35 int taglen; 35 int taglen;
36 const char *tag; 36 const char *tag;
37 uint32_t *nops; 37 __be32 *nops;
38}; 38};
39 39
40struct cb_getattrargs { 40struct cb_getattrargs {
@@ -44,7 +44,7 @@ struct cb_getattrargs {
44}; 44};
45 45
46struct cb_getattrres { 46struct cb_getattrres {
47 uint32_t status; 47 __be32 status;
48 uint32_t bitmap[2]; 48 uint32_t bitmap[2];
49 uint64_t size; 49 uint64_t size;
50 uint64_t change_attr; 50 uint64_t change_attr;
@@ -59,8 +59,8 @@ struct cb_recallargs {
59 uint32_t truncate; 59 uint32_t truncate;
60}; 60};
61 61
62extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 62extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
63extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 63extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
64 64
65#ifdef CONFIG_NFS_V4 65#ifdef CONFIG_NFS_V4
66extern int nfs_callback_up(void); 66extern int nfs_callback_up(void);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 97cf8f71451f..72e55d83756d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,7 +14,7 @@
14 14
15#define NFSDBG_FACILITY NFSDBG_CALLBACK 15#define NFSDBG_FACILITY NFSDBG_CALLBACK
16 16
17unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 17__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
18{ 18{
19 struct nfs_client *clp; 19 struct nfs_client *clp;
20 struct nfs_delegation *delegation; 20 struct nfs_delegation *delegation;
@@ -55,11 +55,11 @@ out:
55 return res->status; 55 return res->status;
56} 56}
57 57
58unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 58__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
59{ 59{
60 struct nfs_client *clp; 60 struct nfs_client *clp;
61 struct inode *inode; 61 struct inode *inode;
62 unsigned res; 62 __be32 res;
63 63
64 res = htonl(NFS4ERR_BADHANDLE); 64 res = htonl(NFS4ERR_BADHANDLE);
65 clp = nfs_find_client(args->addr, 4); 65 clp = nfs_find_client(args->addr, 4);
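
The callback results above were already built with htonl(NFS4ERR_...); retyping them from unsigned to __be32 records that fact in the type system. One nuance worth noting: only the zero test (NFS4_OK) is byte-order independent, so non-zero comparisons should stay in wire order. A tiny sketch with a stand-in constant; NFS4ERR_BADHANDLE is 10001 in RFC 3530, reused here purely for illustration.

    /* NFS4 callback status words live in network order. */
    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define NFS4_OK_SK           0
    #define NFS4ERR_BADHANDLE_SK 10001   /* stand-in numeric value */

    int main(void)
    {
            uint32_t res = htonl(NFS4ERR_BADHANDLE_SK);

            if (res != htonl(NFS4_OK_SK))   /* compare like with like */
                    printf("recall failed: err %u\n", (unsigned)ntohl(res));
            return 0;
    }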
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 29f932192054..f8ea1f51f590 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -22,9 +22,9 @@
22 22
23#define NFSDBG_FACILITY NFSDBG_CALLBACK 23#define NFSDBG_FACILITY NFSDBG_CALLBACK
24 24
25typedef unsigned (*callback_process_op_t)(void *, void *); 25typedef __be32 (*callback_process_op_t)(void *, void *);
26typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); 26typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
27typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); 27typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
28 28
29 29
30struct callback_op { 30struct callback_op {
@@ -36,24 +36,24 @@ struct callback_op {
36 36
37static struct callback_op callback_ops[]; 37static struct callback_op callback_ops[];
38 38
39static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) 39static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
40{ 40{
41 return htonl(NFS4_OK); 41 return htonl(NFS4_OK);
42} 42}
43 43
44static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) 44static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
45{ 45{
46 return xdr_argsize_check(rqstp, p); 46 return xdr_argsize_check(rqstp, p);
47} 47}
48 48
49static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) 49static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
50{ 50{
51 return xdr_ressize_check(rqstp, p); 51 return xdr_ressize_check(rqstp, p);
52} 52}
53 53
54static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes) 54static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
55{ 55{
56 uint32_t *p; 56 __be32 *p;
57 57
58 p = xdr_inline_decode(xdr, nbytes); 58 p = xdr_inline_decode(xdr, nbytes);
59 if (unlikely(p == NULL)) 59 if (unlikely(p == NULL))
@@ -61,9 +61,9 @@ static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
61 return p; 61 return p;
62} 62}
63 63
64static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) 64static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
65{ 65{
66 uint32_t *p; 66 __be32 *p;
67 67
68 p = read_buf(xdr, 4); 68 p = read_buf(xdr, 4);
69 if (unlikely(p == NULL)) 69 if (unlikely(p == NULL))
@@ -81,9 +81,9 @@ static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const c
81 return 0; 81 return 0;
82} 82}
83 83
84static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) 84static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
85{ 85{
86 uint32_t *p; 86 __be32 *p;
87 87
88 p = read_buf(xdr, 4); 88 p = read_buf(xdr, 4);
89 if (unlikely(p == NULL)) 89 if (unlikely(p == NULL))
@@ -99,9 +99,9 @@ static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
99 return 0; 99 return 0;
100} 100}
101 101
102static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 102static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
103{ 103{
104 uint32_t *p; 104 __be32 *p;
105 unsigned int attrlen; 105 unsigned int attrlen;
106 106
107 p = read_buf(xdr, 4); 107 p = read_buf(xdr, 4);
@@ -118,9 +118,9 @@ static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
118 return 0; 118 return 0;
119} 119}
120 120
121static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) 121static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
122{ 122{
123 uint32_t *p; 123 __be32 *p;
124 124
125 p = read_buf(xdr, 16); 125 p = read_buf(xdr, 16);
126 if (unlikely(p == NULL)) 126 if (unlikely(p == NULL))
@@ -129,11 +129,11 @@ static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
129 return 0; 129 return 0;
130} 130}
131 131
132static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) 132static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
133{ 133{
134 uint32_t *p; 134 __be32 *p;
135 unsigned int minor_version; 135 unsigned int minor_version;
136 unsigned status; 136 __be32 status;
137 137
138 status = decode_string(xdr, &hdr->taglen, &hdr->tag); 138 status = decode_string(xdr, &hdr->taglen, &hdr->tag);
139 if (unlikely(status != 0)) 139 if (unlikely(status != 0))
@@ -159,9 +159,9 @@ static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compou
159 return 0; 159 return 0;
160} 160}
161 161
162static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) 162static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
163{ 163{
164 uint32_t *p; 164 __be32 *p;
165 p = read_buf(xdr, 4); 165 p = read_buf(xdr, 4);
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
@@ -169,9 +169,9 @@ static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
169 return 0; 169 return 0;
170} 170}
171 171
172static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) 172static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
173{ 173{
174 unsigned status; 174 __be32 status;
175 175
176 status = decode_fh(xdr, &args->fh); 176 status = decode_fh(xdr, &args->fh);
177 if (unlikely(status != 0)) 177 if (unlikely(status != 0))
@@ -183,10 +183,10 @@ out:
183 return status; 183 return status;
184} 184}
185 185
186static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) 186static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
187{ 187{
188 uint32_t *p; 188 __be32 *p;
189 unsigned status; 189 __be32 status;
190 190
191 args->addr = &rqstp->rq_addr; 191 args->addr = &rqstp->rq_addr;
192 status = decode_stateid(xdr, &args->stateid); 192 status = decode_stateid(xdr, &args->stateid);
@@ -204,9 +204,9 @@ out:
204 return status; 204 return status;
205} 205}
206 206
207static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 207static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
208{ 208{
209 uint32_t *p; 209 __be32 *p;
210 210
211 p = xdr_reserve_space(xdr, 4 + len); 211 p = xdr_reserve_space(xdr, 4 + len);
212 if (unlikely(p == NULL)) 212 if (unlikely(p == NULL))
@@ -217,10 +217,10 @@ static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const ch
217 217
218#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) 218#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
219#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) 219#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
220static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep) 220static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
221{ 221{
222 uint32_t bm[2]; 222 __be32 bm[2];
223 uint32_t *p; 223 __be32 *p;
224 224
225 bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); 225 bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
226 bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); 226 bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
@@ -247,9 +247,9 @@ static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitma
247 return 0; 247 return 0;
248} 248}
249 249
250static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) 250static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
251{ 251{
252 uint32_t *p; 252 __be32 *p;
253 253
254 if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) 254 if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
255 return 0; 255 return 0;
@@ -260,9 +260,9 @@ static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitma
260 return 0; 260 return 0;
261} 261}
262 262
263static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) 263static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
264{ 264{
265 uint32_t *p; 265 __be32 *p;
266 266
267 if (!(bitmap[0] & FATTR4_WORD0_SIZE)) 267 if (!(bitmap[0] & FATTR4_WORD0_SIZE))
268 return 0; 268 return 0;
@@ -273,9 +273,9 @@ static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap,
273 return 0; 273 return 0;
274} 274}
275 275
276static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) 276static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
277{ 277{
278 uint32_t *p; 278 __be32 *p;
279 279
280 p = xdr_reserve_space(xdr, 12); 280 p = xdr_reserve_space(xdr, 12);
281 if (unlikely(p == 0)) 281 if (unlikely(p == 0))
@@ -285,23 +285,23 @@ static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *
285 return 0; 285 return 0;
286} 286}
287 287
288static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) 288static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
289{ 289{
290 if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) 290 if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
291 return 0; 291 return 0;
292 return encode_attr_time(xdr,time); 292 return encode_attr_time(xdr,time);
293} 293}
294 294
295static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) 295static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
296{ 296{
297 if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) 297 if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
298 return 0; 298 return 0;
299 return encode_attr_time(xdr,time); 299 return encode_attr_time(xdr,time);
300} 300}
301 301
302static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) 302static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
303{ 303{
304 unsigned status; 304 __be32 status;
305 305
306 hdr->status = xdr_reserve_space(xdr, 4); 306 hdr->status = xdr_reserve_space(xdr, 4);
307 if (unlikely(hdr->status == NULL)) 307 if (unlikely(hdr->status == NULL))
@@ -315,9 +315,9 @@ static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compou
315 return 0; 315 return 0;
316} 316}
317 317
318static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res) 318static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
319{ 319{
320 uint32_t *p; 320 __be32 *p;
321 321
322 p = xdr_reserve_space(xdr, 8); 322 p = xdr_reserve_space(xdr, 8);
323 if (unlikely(p == NULL)) 323 if (unlikely(p == NULL))
@@ -327,10 +327,10 @@ static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
327 return 0; 327 return 0;
328} 328}
329 329
330static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) 330static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
331{ 331{
332 uint32_t *savep = NULL; 332 __be32 *savep = NULL;
333 unsigned status = res->status; 333 __be32 status = res->status;
334 334
335 if (unlikely(status != 0)) 335 if (unlikely(status != 0))
336 goto out; 336 goto out;
@@ -353,15 +353,15 @@ out:
353 return status; 353 return status;
354} 354}
355 355
356static unsigned process_op(struct svc_rqst *rqstp, 356static __be32 process_op(struct svc_rqst *rqstp,
357 struct xdr_stream *xdr_in, void *argp, 357 struct xdr_stream *xdr_in, void *argp,
358 struct xdr_stream *xdr_out, void *resp) 358 struct xdr_stream *xdr_out, void *resp)
359{ 359{
360 struct callback_op *op = &callback_ops[0]; 360 struct callback_op *op = &callback_ops[0];
361 unsigned int op_nr = OP_CB_ILLEGAL; 361 unsigned int op_nr = OP_CB_ILLEGAL;
362 unsigned int status = 0; 362 __be32 status = 0;
363 long maxlen; 363 long maxlen;
364 unsigned res; 364 __be32 res;
365 365
366 dprintk("%s: start\n", __FUNCTION__); 366 dprintk("%s: start\n", __FUNCTION__);
367 status = decode_op_hdr(xdr_in, &op_nr); 367 status = decode_op_hdr(xdr_in, &op_nr);
@@ -399,20 +399,20 @@ static unsigned process_op(struct svc_rqst *rqstp,
399/* 399/*
400 * Decode, process and encode a COMPOUND 400 * Decode, process and encode a COMPOUND
401 */ 401 */
402static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) 402static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
403{ 403{
404 struct cb_compound_hdr_arg hdr_arg; 404 struct cb_compound_hdr_arg hdr_arg;
405 struct cb_compound_hdr_res hdr_res; 405 struct cb_compound_hdr_res hdr_res;
406 struct xdr_stream xdr_in, xdr_out; 406 struct xdr_stream xdr_in, xdr_out;
407 uint32_t *p; 407 __be32 *p;
408 unsigned int status; 408 __be32 status;
409 unsigned int nops = 1; 409 unsigned int nops = 1;
410 410
411 dprintk("%s: start\n", __FUNCTION__); 411 dprintk("%s: start\n", __FUNCTION__);
412 412
413 xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); 413 xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
414 414
415 p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); 415 p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
416 xdr_init_encode(&xdr_out, &rqstp->rq_res, p); 416 xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
417 417
418 decode_compound_hdr_arg(&xdr_in, &hdr_arg); 418 decode_compound_hdr_arg(&xdr_in, &hdr_arg);
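The callback_xdr.c hunks above are one mechanical change: XDR status values and buffer pointers move from unsigned/uint32_t to __be32, the sparse-annotated big-endian type. The values were already in wire order (they are produced by htonl() and only ever compared against 0, which is byte-order invariant), so the generated code is identical; the gain is that sparse run with endian checking can now flag any place that mixes host-order and wire-order words. A minimal sketch of the idiom, assuming the usual kernel headers (decode_example is a made-up name, not part of the patch):

#include <linux/types.h>	/* __be32 */
#include <linux/nfs4.h>		/* NFS4ERR_RESOURCE */

/* A decoder in the same style as the ones above: the wire word is held
 * as __be32 and converted exactly once with ntohl(); the status travels
 * back as __be32, built with htonl(). */
static __be32 decode_example(const __be32 *p, unsigned int *op)
{
	if (p == NULL)
		return htonl(NFS4ERR_RESOURCE);	/* stays in wire order */
	*op = ntohl(*p);			/* the one host conversion */
	return 0;				/* 0 is order-invariant */
}

With plain unsigned, nothing distinguishes NFS4ERR_RESOURCE from htonl(NFS4ERR_RESOURCE); with __be32 the unconverted variant becomes a sparse warning instead of a silently byte-swapped wire value.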
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8106f3b29e4a..5fea638743e4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12 12
13#include <linux/config.h>
14#include <linux/module.h> 13#include <linux/module.h>
15#include <linux/init.h> 14#include <linux/init.h>
16 15
@@ -233,11 +232,15 @@ void nfs_put_client(struct nfs_client *clp)
233 * Find a client by address 232 * Find a client by address
234 * - caller must hold nfs_client_lock 233 * - caller must hold nfs_client_lock
235 */ 234 */
236static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion) 235static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
237{ 236{
238 struct nfs_client *clp; 237 struct nfs_client *clp;
239 238
240 list_for_each_entry(clp, &nfs_client_list, cl_share_link) { 239 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
240 /* Don't match clients that failed to initialise properly */
241 if (clp->cl_cons_state < 0)
242 continue;
243
241 /* Different NFS versions cannot share the same nfs_client */ 244 /* Different NFS versions cannot share the same nfs_client */
242 if (clp->cl_nfsversion != nfsversion) 245 if (clp->cl_nfsversion != nfsversion)
243 continue; 246 continue;
@@ -246,7 +249,7 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int
246 sizeof(clp->cl_addr.sin_addr)) != 0) 249 sizeof(clp->cl_addr.sin_addr)) != 0)
247 continue; 250 continue;
248 251
249 if (clp->cl_addr.sin_port == addr->sin_port) 252 if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
250 goto found; 253 goto found;
251 } 254 }
252 255
@@ -266,11 +269,12 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
266 struct nfs_client *clp; 269 struct nfs_client *clp;
267 270
268 spin_lock(&nfs_client_lock); 271 spin_lock(&nfs_client_lock);
269 clp = __nfs_find_client(addr, nfsversion); 272 clp = __nfs_find_client(addr, nfsversion, 0);
270 spin_unlock(&nfs_client_lock); 273 spin_unlock(&nfs_client_lock);
271 274 if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
272 BUG_ON(clp && clp->cl_cons_state == 0); 275 nfs_put_client(clp);
273 276 clp = NULL;
277 }
274 return clp; 278 return clp;
275} 279}
276 280
@@ -293,7 +297,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
293 do { 297 do {
294 spin_lock(&nfs_client_lock); 298 spin_lock(&nfs_client_lock);
295 299
296 clp = __nfs_find_client(addr, nfsversion); 300 clp = __nfs_find_client(addr, nfsversion, 1);
297 if (clp) 301 if (clp)
298 goto found_client; 302 goto found_client;
299 if (new) 303 if (new)
@@ -323,25 +327,11 @@ found_client:
323 if (new) 327 if (new)
324 nfs_free_client(new); 328 nfs_free_client(new);
325 329
326 if (clp->cl_cons_state == NFS_CS_INITING) { 330 error = wait_event_interruptible(nfs_client_active_wq,
327 DECLARE_WAITQUEUE(myself, current); 331 clp->cl_cons_state != NFS_CS_INITING);
328 332 if (error < 0) {
329 add_wait_queue(&nfs_client_active_wq, &myself); 333 nfs_put_client(clp);
330 334 return ERR_PTR(-ERESTARTSYS);
331 for (;;) {
332 set_current_state(TASK_INTERRUPTIBLE);
333 if (signal_pending(current) ||
334 clp->cl_cons_state > NFS_CS_READY)
335 break;
336 schedule();
337 }
338
339 remove_wait_queue(&nfs_client_active_wq, &myself);
340
341 if (signal_pending(current)) {
342 nfs_put_client(clp);
343 return ERR_PTR(-ERESTARTSYS);
344 }
345 } 335 }
346 336
347 if (clp->cl_cons_state < NFS_CS_READY) { 337 if (clp->cl_cons_state < NFS_CS_READY) {
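The nfs_get_client() hunk swaps an open-coded wait loop for wait_event_interruptible(), which packages the same prepare/check/schedule cycle and reports -ERESTARTSYS itself when a signal arrives. Roughly what the call expands to (a simplified sketch; the real macro in <linux/wait.h> also deals with memory barriers and exclusive waiters):

/* Approximate shape of
 *	error = wait_event_interruptible(nfs_client_active_wq,
 *			clp->cl_cons_state != NFS_CS_INITING);
 */
long error = 0;
DEFINE_WAIT(wait);

for (;;) {
	prepare_to_wait(&nfs_client_active_wq, &wait, TASK_INTERRUPTIBLE);
	if (clp->cl_cons_state != NFS_CS_INITING)	/* the condition */
		break;
	if (signal_pending(current)) {
		error = -ERESTARTSYS;
		break;
	}
	schedule();
}
finish_wait(&nfs_client_active_wq, &wait);

Note that the wait condition is now "no longer initialising"; clients whose initialisation failed (negative cl_cons_state) are sorted out by the existing cl_cons_state < NFS_CS_READY check that follows the wait.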
@@ -864,6 +854,7 @@ error:
864 */ 854 */
865static int nfs4_init_client(struct nfs_client *clp, 855static int nfs4_init_client(struct nfs_client *clp,
866 int proto, int timeo, int retrans, 856 int proto, int timeo, int retrans,
857 const char *ip_addr,
867 rpc_authflavor_t authflavour) 858 rpc_authflavor_t authflavour)
868{ 859{
869 int error; 860 int error;
@@ -880,6 +871,7 @@ static int nfs4_init_client(struct nfs_client *clp,
880 error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour); 871 error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
881 if (error < 0) 872 if (error < 0)
882 goto error; 873 goto error;
874 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
883 875
884 error = nfs_idmap_new(clp); 876 error = nfs_idmap_new(clp);
885 if (error < 0) { 877 if (error < 0) {
@@ -903,6 +895,7 @@ error:
903 */ 895 */
904static int nfs4_set_client(struct nfs_server *server, 896static int nfs4_set_client(struct nfs_server *server,
905 const char *hostname, const struct sockaddr_in *addr, 897 const char *hostname, const struct sockaddr_in *addr,
898 const char *ip_addr,
906 rpc_authflavor_t authflavour, 899 rpc_authflavor_t authflavour,
907 int proto, int timeo, int retrans) 900 int proto, int timeo, int retrans)
908{ 901{
@@ -917,7 +910,7 @@ static int nfs4_set_client(struct nfs_server *server,
917 error = PTR_ERR(clp); 910 error = PTR_ERR(clp);
918 goto error; 911 goto error;
919 } 912 }
920 error = nfs4_init_client(clp, proto, timeo, retrans, authflavour); 913 error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
921 if (error < 0) 914 if (error < 0)
922 goto error_put; 915 goto error_put;
923 916
@@ -986,7 +979,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
986 return ERR_PTR(-ENOMEM); 979 return ERR_PTR(-ENOMEM);
987 980
988 /* Get a client record */ 981 /* Get a client record */
989 error = nfs4_set_client(server, hostname, addr, authflavour, 982 error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
990 data->proto, data->timeo, data->retrans); 983 data->proto, data->timeo, data->retrans);
991 if (error < 0) 984 if (error < 0)
992 goto error; 985 goto error;
@@ -1056,6 +1049,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1056 /* Get a client representation. 1049 /* Get a client representation.
1057 * Note: NFSv4 always uses TCP, */ 1050 * Note: NFSv4 always uses TCP, */
1058 error = nfs4_set_client(server, data->hostname, data->addr, 1051 error = nfs4_set_client(server, data->hostname, data->addr,
1052 parent_client->cl_ipaddr,
1059 data->authflavor, 1053 data->authflavor,
1060 parent_server->client->cl_xprt->prot, 1054 parent_server->client->cl_xprt->prot,
1061 parent_client->retrans_timeo, 1055 parent_client->retrans_timeo,
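The remaining client.c hunks thread a new ip_addr argument from the mount paths into nfs4_init_client(), which copies it into clp->cl_ipaddr for later use as the client's own callback address. A referral mount passes parent_client->cl_ipaddr down, so a submount advertises the same address as the parent it was spawned from. One detail worth noting: the memcpy() copies a full sizeof(clp->cl_ipaddr) bytes, which relies on the source buffer being at least that large; that clearly holds for parent_client->cl_ipaddr, which is the same array in another nfs_client.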
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 481f8892a919..4133ef5264e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -142,12 +142,12 @@ nfs_opendir(struct inode *inode, struct file *filp)
142 return res; 142 return res;
143} 143}
144 144
145typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); 145typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
146typedef struct { 146typedef struct {
147 struct file *file; 147 struct file *file;
148 struct page *page; 148 struct page *page;
149 unsigned long page_index; 149 unsigned long page_index;
150 u32 *ptr; 150 __be32 *ptr;
151 u64 *dir_cookie; 151 u64 *dir_cookie;
152 loff_t current_index; 152 loff_t current_index;
153 struct nfs_entry *entry; 153 struct nfs_entry *entry;
@@ -203,8 +203,10 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
203 * Note: assumes we have exclusive access to this mapping either 203 * Note: assumes we have exclusive access to this mapping either
204 * through inode->i_mutex or some other mechanism. 204 * through inode->i_mutex or some other mechanism.
205 */ 205 */
206 if (page->index == 0) 206 if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
207 invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1); 207 /* Should never happen */
208 nfs_zap_mapping(inode, inode->i_mapping);
209 }
208 unlock_page(page); 210 unlock_page(page);
209 return 0; 211 return 0;
210 error: 212 error:
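nfs_readdir_filler() now checks the result of invalidate_inode_pages2_range() when filling page 0 forces the stale readdir pages behind it to be dropped. The failure is not expected (the surrounding comment notes that the caller has exclusive access to the mapping), but if it ever happens the fallback is the new nfs_zap_mapping() helper from the inode.c hunk further down: the whole mapping is flagged invalid, so the stale pages are purged on the next revalidation instead of being silently served.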
@@ -218,7 +220,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
218static inline 220static inline
219int dir_decode(nfs_readdir_descriptor_t *desc) 221int dir_decode(nfs_readdir_descriptor_t *desc)
220{ 222{
221 u32 *p = desc->ptr; 223 __be32 *p = desc->ptr;
222 p = desc->decode(p, desc->entry, desc->plus); 224 p = desc->decode(p, desc->entry, desc->plus);
223 if (IS_ERR(p)) 225 if (IS_ERR(p))
224 return PTR_ERR(p); 226 return PTR_ERR(p);
@@ -1517,8 +1519,8 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
1517 pagevec_init(&lru_pvec, 0); 1519 pagevec_init(&lru_pvec, 0);
1518 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, 1520 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1519 GFP_KERNEL)) { 1521 GFP_KERNEL)) {
1520 if (!pagevec_add(&lru_pvec, page)) 1522 pagevec_add(&lru_pvec, page);
1521 __pagevec_lru_add(&lru_pvec); 1523 pagevec_lru_add(&lru_pvec);
1522 SetPageUptodate(page); 1524 SetPageUptodate(page);
1523 unlock_page(page); 1525 unlock_page(page);
1524 } else 1526 } else
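In nfs_symlink(), the LRU handling of the freshly filled symlink page changes. pagevec_add() returns the number of slots still free, so the old "if (!pagevec_add(...)) __pagevec_lru_add(...)" idiom only drained a completely full pagevec; with the single page queued here it never fired, leaving the page off the LRU. The replacement drains unconditionally through pagevec_lru_add(), which copes with a partially filled pagevec. The fragment in isolation (page being the symlink page already inserted into the page cache):

#include <linux/pagevec.h>

struct pagevec lru_pvec;

pagevec_init(&lru_pvec, 0);	/* 0: not a batch of cache-hot pages */
/*
 * The batching idiom
 *	if (!pagevec_add(&lru_pvec, page))
 *		__pagevec_lru_add(&lru_pvec);
 * only drains once PAGEVEC_SIZE pages are queued, so a lone page
 * never reached the LRU this way.  Queue it and drain explicitly:
 */
pagevec_add(&lru_pvec, page);
pagevec_lru_add(&lru_pvec);	/* flushes the pagevec even when partial */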
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9f7f8b9ea1e2..bdfabf854a51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -497,6 +497,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
497 if (dreq->commit_data != NULL) 497 if (dreq->commit_data != NULL)
498 nfs_commit_free(dreq->commit_data); 498 nfs_commit_free(dreq->commit_data);
499 nfs_direct_free_writedata(dreq); 499 nfs_direct_free_writedata(dreq);
500 nfs_zap_mapping(inode, inode->i_mapping);
500 nfs_direct_complete(dreq); 501 nfs_direct_complete(dreq);
501 } 502 }
502} 503}
@@ -517,6 +518,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
517{ 518{
518 nfs_end_data_update(inode); 519 nfs_end_data_update(inode);
519 nfs_direct_free_writedata(dreq); 520 nfs_direct_free_writedata(dreq);
521 nfs_zap_mapping(inode, inode->i_mapping);
520 nfs_direct_complete(dreq); 522 nfs_direct_complete(dreq);
521} 523}
522#endif 524#endif
@@ -532,10 +534,12 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
532 534
533 spin_lock(&dreq->lock); 535 spin_lock(&dreq->lock);
534 536
535 if (likely(status >= 0)) 537 if (unlikely(status < 0)) {
536 dreq->count += data->res.count; 538 dreq->error = status;
537 else 539 goto out_unlock;
538 dreq->error = task->tk_status; 540 }
541
542 dreq->count += data->res.count;
539 543
540 if (data->res.verf->committed != NFS_FILE_SYNC) { 544 if (data->res.verf->committed != NFS_FILE_SYNC) {
541 switch (dreq->flags) { 545 switch (dreq->flags) {
@@ -550,7 +554,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
550 } 554 }
551 } 555 }
552 } 556 }
553 557out_unlock:
554 spin_unlock(&dreq->lock); 558 spin_unlock(&dreq->lock);
555} 559}
556 560
@@ -828,17 +832,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
828 832
829 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos); 833 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
830 834
831 /*
832 * XXX: nfs_end_data_update() already ensures this file's
833 * cached data is subsequently invalidated. Do we really
834 * need to call invalidate_inode_pages2() again here?
835 *
836 * For aio writes, this invalidation will almost certainly
837 * occur before the writes complete. Kind of racey.
838 */
839 if (mapping->nrpages)
840 invalidate_inode_pages2(mapping);
841
842 if (retval > 0) 835 if (retval > 0)
843 iocb->ki_pos = pos + retval; 836 iocb->ki_pos = pos + retval;
844 837
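Taken together, the direct.c hunks move page-cache invalidation for O_DIRECT writes from submission to completion. The block removed from nfs_file_direct_write() ran invalidate_inode_pages2() while aio writes could still be in flight, exactly the race the deleted XXX comment complains about. Instead, both variants of nfs_direct_write_complete() now call nfs_zap_mapping(), so the mapping is flagged stale only once the data has reached the server and is purged lazily on the next revalidation. The reshuffle in nfs_direct_write_result() serves the same cleanup: an RPC error records dreq->error and jumps past the verifier bookkeeping via the new out_unlock label instead of falling through it.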
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 76b08ae9ed82..20c6f39ea38a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -9,7 +9,6 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/config.h>
13#include <linux/module.h> 12#include <linux/module.h>
14#include <linux/init.h> 13#include <linux/init.h>
15 14
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bc9376ca86cd..08cc4c5919ab 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -131,6 +131,15 @@ void nfs_zap_caches(struct inode *inode)
131 spin_unlock(&inode->i_lock); 131 spin_unlock(&inode->i_lock);
132} 132}
133 133
134void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
135{
136 if (mapping->nrpages != 0) {
137 spin_lock(&inode->i_lock);
138 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
139 spin_unlock(&inode->i_lock);
140 }
141}
142
134static void nfs_zap_acl_cache(struct inode *inode) 143static void nfs_zap_acl_cache(struct inode *inode)
135{ 144{
136 void (*clear_acl_cache)(struct inode *); 145 void (*clear_acl_cache)(struct inode *);
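nfs_zap_mapping() is deliberately small: take i_lock, set NFS_INO_INVALID_DATA, return. The actual page-cache purge is deferred to the next nfs_revalidate_mapping() call, which is what makes the helper safe in the direct-I/O completion and readdir paths above, where calling invalidate_inode_pages2() directly would either race with in-flight I/O or have nowhere to report failure. A usage sketch, assuming the structures from this file:

/* Producer: a completion path that knows the cached pages are stale
 * but must not purge them itself. */
nfs_zap_mapping(inode, inode->i_mapping);

/* Consumer (condensed from nfs_revalidate_mapping() below): the flag
 * is only cleared once the invalidation really succeeded. */
if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
	ret = invalidate_inode_pages2(inode->i_mapping);
	if (ret == 0) {
		spin_lock(&inode->i_lock);
		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
		spin_unlock(&inode->i_lock);
	}
}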
@@ -574,7 +583,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
574 583
575 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 584 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
576 lock_kernel(); 585 lock_kernel();
577 if (!inode || is_bad_inode(inode)) 586 if (is_bad_inode(inode))
578 goto out_nowait; 587 goto out_nowait;
579 if (NFS_STALE(inode)) 588 if (NFS_STALE(inode))
580 goto out_nowait; 589 goto out_nowait;
@@ -671,13 +680,20 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
671 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) 680 if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
672 || nfs_attribute_timeout(inode)) 681 || nfs_attribute_timeout(inode))
673 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 682 ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
683 if (ret < 0)
684 goto out;
674 685
675 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { 686 if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
676 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 687 if (mapping->nrpages != 0) {
677 if (S_ISREG(inode->i_mode)) 688 if (S_ISREG(inode->i_mode)) {
678 nfs_sync_mapping(mapping); 689 ret = nfs_sync_mapping(mapping);
679 invalidate_inode_pages2(mapping); 690 if (ret < 0)
680 691 goto out;
692 }
693 ret = invalidate_inode_pages2(mapping);
694 if (ret < 0)
695 goto out;
696 }
681 spin_lock(&inode->i_lock); 697 spin_lock(&inode->i_lock);
682 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 698 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
683 if (S_ISDIR(inode->i_mode)) { 699 if (S_ISDIR(inode->i_mode)) {
@@ -687,10 +703,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
687 } 703 }
688 spin_unlock(&inode->i_lock); 704 spin_unlock(&inode->i_lock);
689 705
706 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
690 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 707 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
691 inode->i_sb->s_id, 708 inode->i_sb->s_id,
692 (long long)NFS_FILEID(inode)); 709 (long long)NFS_FILEID(inode));
693 } 710 }
711out:
694 return ret; 712 return ret;
695} 713}
696 714
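The nfs_revalidate_mapping() hunks turn a best-effort routine into one that reports failure: errors from __nfs_revalidate_inode(), nfs_sync_mapping() and invalidate_inode_pages2() now short-circuit to the new out label, and NFS_INO_INVALID_DATA is cleared only after a successful purge, so a failed pass is simply retried on the next call. Condensed, the resulting control flow looks like this (a sketch; needs_reval stands in for the REVAL_PAGECACHE/attribute-timeout test):

int ret = 0;

if (needs_reval)
	ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (ret < 0)
	goto out;

if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
	if (mapping->nrpages != 0) {
		if (S_ISREG(inode->i_mode)) {
			ret = nfs_sync_mapping(mapping);	/* flush first */
			if (ret < 0)
				goto out;
		}
		ret = invalidate_inode_pages2(mapping);		/* then drop */
		if (ret < 0)
			goto out;
	}
	/* clear NFS_INO_INVALID_DATA under i_lock only at this point */
}
out:
	return ret;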
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bea0b016bd70..d205466233f6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -93,15 +93,15 @@ extern void nfs_destroy_directcache(void);
93/* nfs2xdr.c */ 93/* nfs2xdr.c */
94extern int nfs_stat_to_errno(int); 94extern int nfs_stat_to_errno(int);
95extern struct rpc_procinfo nfs_procedures[]; 95extern struct rpc_procinfo nfs_procedures[];
96extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); 96extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
97 97
98/* nfs3xdr.c */ 98/* nfs3xdr.c */
99extern struct rpc_procinfo nfs3_procedures[]; 99extern struct rpc_procinfo nfs3_procedures[];
100extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); 100extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
101 101
102/* nfs4xdr.c */ 102/* nfs4xdr.c */
103#ifdef CONFIG_NFS_V4 103#ifdef CONFIG_NFS_V4
104extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); 104extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
105#endif 105#endif
106 106
107/* nfs4proc.c */ 107/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d507b021207f..f75fe72b4160 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -95,7 +95,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
95 * XDR encode/decode functions for MOUNT 95 * XDR encode/decode functions for MOUNT
96 */ 96 */
97static int 97static int
98xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path) 98xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path)
99{ 99{
100 p = xdr_encode_string(p, path); 100 p = xdr_encode_string(p, path);
101 101
@@ -104,7 +104,7 @@ xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
104} 104}
105 105
106static int 106static int
107xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res) 107xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
108{ 108{
109 struct nfs_fh *fh = res->fh; 109 struct nfs_fh *fh = res->fh;
110 110
@@ -116,7 +116,7 @@ xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
116} 116}
117 117
118static int 118static int
119xdr_decode_fhstatus3(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res) 119xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
120{ 120{
121 struct nfs_fh *fh = res->fh; 121 struct nfs_fh *fh = res->fh;
122 122
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 60408646176b..ec1114b33d89 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -7,8 +7,6 @@
7 * NFS namespace 7 * NFS namespace
8 */ 8 */
9 9
10#include <linux/config.h>
11
12#include <linux/dcache.h> 10#include <linux/dcache.h>
13#include <linux/mount.h> 11#include <linux/mount.h>
14#include <linux/namei.h> 12#include <linux/namei.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index b49501fc0a79..3be4e72a0227 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -66,15 +66,15 @@
66/* 66/*
67 * Common NFS XDR functions as inlines 67 * Common NFS XDR functions as inlines
68 */ 68 */
69static inline u32 * 69static inline __be32 *
70xdr_encode_fhandle(u32 *p, struct nfs_fh *fhandle) 70xdr_encode_fhandle(__be32 *p, struct nfs_fh *fhandle)
71{ 71{
72 memcpy(p, fhandle->data, NFS2_FHSIZE); 72 memcpy(p, fhandle->data, NFS2_FHSIZE);
73 return p + XDR_QUADLEN(NFS2_FHSIZE); 73 return p + XDR_QUADLEN(NFS2_FHSIZE);
74} 74}
75 75
76static inline u32 * 76static inline __be32 *
77xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle) 77xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
78{ 78{
79 /* NFSv2 handles have a fixed length */ 79 /* NFSv2 handles have a fixed length */
80 fhandle->size = NFS2_FHSIZE; 80 fhandle->size = NFS2_FHSIZE;
@@ -82,8 +82,8 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
82 return p + XDR_QUADLEN(NFS2_FHSIZE); 82 return p + XDR_QUADLEN(NFS2_FHSIZE);
83} 83}
84 84
85static inline u32* 85static inline __be32*
86xdr_encode_time(u32 *p, struct timespec *timep) 86xdr_encode_time(__be32 *p, struct timespec *timep)
87{ 87{
88 *p++ = htonl(timep->tv_sec); 88 *p++ = htonl(timep->tv_sec);
89 /* Convert nanoseconds into microseconds */ 89 /* Convert nanoseconds into microseconds */
@@ -91,8 +91,8 @@ xdr_encode_time(u32 *p, struct timespec *timep)
91 return p; 91 return p;
92} 92}
93 93
94static inline u32* 94static inline __be32*
95xdr_encode_current_server_time(u32 *p, struct timespec *timep) 95xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
96{ 96{
97 /* 97 /*
98 * Passing the invalid value useconds=1000000 is a 98 * Passing the invalid value useconds=1000000 is a
@@ -108,8 +108,8 @@ xdr_encode_current_server_time(u32 *p, struct timespec *timep)
108 return p; 108 return p;
109} 109}
110 110
111static inline u32* 111static inline __be32*
112xdr_decode_time(u32 *p, struct timespec *timep) 112xdr_decode_time(__be32 *p, struct timespec *timep)
113{ 113{
114 timep->tv_sec = ntohl(*p++); 114 timep->tv_sec = ntohl(*p++);
115 /* Convert microseconds into nanoseconds */ 115 /* Convert microseconds into nanoseconds */
@@ -117,8 +117,8 @@ xdr_decode_time(u32 *p, struct timespec *timep)
117 return p; 117 return p;
118} 118}
119 119
120static u32 * 120static __be32 *
121xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) 121xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
122{ 122{
123 u32 rdev; 123 u32 rdev;
124 fattr->type = (enum nfs_ftype) ntohl(*p++); 124 fattr->type = (enum nfs_ftype) ntohl(*p++);
@@ -146,10 +146,10 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
146 return p; 146 return p;
147} 147}
148 148
149static inline u32 * 149static inline __be32 *
150xdr_encode_sattr(u32 *p, struct iattr *attr) 150xdr_encode_sattr(__be32 *p, struct iattr *attr)
151{ 151{
152 const u32 not_set = __constant_htonl(0xFFFFFFFF); 152 const __be32 not_set = __constant_htonl(0xFFFFFFFF);
153 153
154 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; 154 *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
155 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; 155 *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
@@ -184,7 +184,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
184 * GETATTR, READLINK, STATFS 184 * GETATTR, READLINK, STATFS
185 */ 185 */
186static int 186static int
187nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh) 187nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
188{ 188{
189 p = xdr_encode_fhandle(p, fh); 189 p = xdr_encode_fhandle(p, fh);
190 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 190 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -195,7 +195,7 @@ nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
195 * Encode SETATTR arguments 195 * Encode SETATTR arguments
196 */ 196 */
197static int 197static int
198nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args) 198nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
199{ 199{
200 p = xdr_encode_fhandle(p, args->fh); 200 p = xdr_encode_fhandle(p, args->fh);
201 p = xdr_encode_sattr(p, args->sattr); 201 p = xdr_encode_sattr(p, args->sattr);
@@ -208,7 +208,7 @@ nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
208 * LOOKUP, REMOVE, RMDIR 208 * LOOKUP, REMOVE, RMDIR
209 */ 209 */
210static int 210static int
211nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args) 211nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
212{ 212{
213 p = xdr_encode_fhandle(p, args->fh); 213 p = xdr_encode_fhandle(p, args->fh);
214 p = xdr_encode_array(p, args->name, args->len); 214 p = xdr_encode_array(p, args->name, args->len);
@@ -222,7 +222,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
222 * exactly to the page we want to fetch. 222 * exactly to the page we want to fetch.
223 */ 223 */
224static int 224static int
225nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args) 225nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
226{ 226{
227 struct rpc_auth *auth = req->rq_task->tk_auth; 227 struct rpc_auth *auth = req->rq_task->tk_auth;
228 unsigned int replen; 228 unsigned int replen;
@@ -246,7 +246,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
246 * Decode READ reply 246 * Decode READ reply
247 */ 247 */
248static int 248static int
249nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) 249nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
250{ 250{
251 struct kvec *iov = req->rq_rcv_buf.head; 251 struct kvec *iov = req->rq_rcv_buf.head;
252 int status, count, recvd, hdrlen; 252 int status, count, recvd, hdrlen;
@@ -286,7 +286,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
286 * Write arguments. Splice the buffer to be written into the iovec. 286 * Write arguments. Splice the buffer to be written into the iovec.
287 */ 287 */
288static int 288static int
289nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 289nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
290{ 290{
291 struct xdr_buf *sndbuf = &req->rq_snd_buf; 291 struct xdr_buf *sndbuf = &req->rq_snd_buf;
292 u32 offset = (u32)args->offset; 292 u32 offset = (u32)args->offset;
@@ -309,7 +309,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
309 * CREATE, MKDIR 309 * CREATE, MKDIR
310 */ 310 */
311static int 311static int
312nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args) 312nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
313{ 313{
314 p = xdr_encode_fhandle(p, args->fh); 314 p = xdr_encode_fhandle(p, args->fh);
315 p = xdr_encode_array(p, args->name, args->len); 315 p = xdr_encode_array(p, args->name, args->len);
@@ -322,7 +322,7 @@ nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
322 * Encode RENAME arguments 322 * Encode RENAME arguments
323 */ 323 */
324static int 324static int
325nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args) 325nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
326{ 326{
327 p = xdr_encode_fhandle(p, args->fromfh); 327 p = xdr_encode_fhandle(p, args->fromfh);
328 p = xdr_encode_array(p, args->fromname, args->fromlen); 328 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -336,7 +336,7 @@ nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
336 * Encode LINK arguments 336 * Encode LINK arguments
337 */ 337 */
338static int 338static int
339nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args) 339nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
340{ 340{
341 p = xdr_encode_fhandle(p, args->fromfh); 341 p = xdr_encode_fhandle(p, args->fromfh);
342 p = xdr_encode_fhandle(p, args->tofh); 342 p = xdr_encode_fhandle(p, args->tofh);
@@ -349,7 +349,7 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
349 * Encode SYMLINK arguments 349 * Encode SYMLINK arguments
350 */ 350 */
351static int 351static int
352nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args) 352nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
353{ 353{
354 struct xdr_buf *sndbuf = &req->rq_snd_buf; 354 struct xdr_buf *sndbuf = &req->rq_snd_buf;
355 size_t pad; 355 size_t pad;
@@ -378,7 +378,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
378 * Encode arguments to readdir call 378 * Encode arguments to readdir call
379 */ 379 */
380static int 380static int
381nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args) 381nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
382{ 382{
383 struct rpc_task *task = req->rq_task; 383 struct rpc_task *task = req->rq_task;
384 struct rpc_auth *auth = task->tk_auth; 384 struct rpc_auth *auth = task->tk_auth;
@@ -404,7 +404,7 @@ nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
404 * from nfs_readdir for each entry. 404 * from nfs_readdir for each entry.
405 */ 405 */
406static int 406static int
407nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy) 407nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
408{ 408{
409 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 409 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
410 struct kvec *iov = rcvbuf->head; 410 struct kvec *iov = rcvbuf->head;
@@ -412,7 +412,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
412 int hdrlen, recvd; 412 int hdrlen, recvd;
413 int status, nr; 413 int status, nr;
414 unsigned int len, pglen; 414 unsigned int len, pglen;
415 u32 *end, *entry, *kaddr; 415 __be32 *end, *entry, *kaddr;
416 416
417 if ((status = ntohl(*p++))) 417 if ((status = ntohl(*p++)))
418 return -nfs_stat_to_errno(status); 418 return -nfs_stat_to_errno(status);
@@ -432,8 +432,8 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
432 if (pglen > recvd) 432 if (pglen > recvd)
433 pglen = recvd; 433 pglen = recvd;
434 page = rcvbuf->pages; 434 page = rcvbuf->pages;
435 kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0); 435 kaddr = p = kmap_atomic(*page, KM_USER0);
436 end = (u32 *)((char *)p + pglen); 436 end = (__be32 *)((char *)p + pglen);
437 entry = p; 437 entry = p;
438 for (nr = 0; *p++; nr++) { 438 for (nr = 0; *p++; nr++) {
439 if (p + 2 > end) 439 if (p + 2 > end)
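A small point in the readdir result decoders here and again in nfs3xdr.c: kmap_atomic() returns void *, so once p is declared __be32 * the old (u32 *) cast on the assignment simply disappears, and only end keeps a cast because of the byte-granular pointer arithmetic used to compute it.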
@@ -468,8 +468,8 @@ err_unmap:
468 goto out; 468 goto out;
469} 469}
470 470
471u32 * 471__be32 *
472nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) 472nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
473{ 473{
474 if (!*p++) { 474 if (!*p++) {
475 if (!*p) 475 if (!*p)
@@ -496,7 +496,7 @@ nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
496 * Decode simple status reply 496 * Decode simple status reply
497 */ 497 */
498static int 498static int
499nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy) 499nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
500{ 500{
501 int status; 501 int status;
502 502
@@ -510,7 +510,7 @@ nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
510 * GETATTR, SETATTR, WRITE 510 * GETATTR, SETATTR, WRITE
511 */ 511 */
512static int 512static int
513nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 513nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
514{ 514{
515 int status; 515 int status;
516 516
@@ -525,7 +525,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
525 * LOOKUP, CREATE, MKDIR 525 * LOOKUP, CREATE, MKDIR
526 */ 526 */
527static int 527static int
528nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res) 528nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
529{ 529{
530 int status; 530 int status;
531 531
@@ -540,7 +540,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
540 * Encode READLINK args 540 * Encode READLINK args
541 */ 541 */
542static int 542static int
543nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args) 543nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
544{ 544{
545 struct rpc_auth *auth = req->rq_task->tk_auth; 545 struct rpc_auth *auth = req->rq_task->tk_auth;
546 unsigned int replen; 546 unsigned int replen;
@@ -558,7 +558,7 @@ nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args
558 * Decode READLINK reply 558 * Decode READLINK reply
559 */ 559 */
560static int 560static int
561nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy) 561nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
562{ 562{
563 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 563 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
564 struct kvec *iov = rcvbuf->head; 564 struct kvec *iov = rcvbuf->head;
@@ -601,7 +601,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
601 * Decode WRITE reply 601 * Decode WRITE reply
602 */ 602 */
603static int 603static int
604nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 604nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
605{ 605{
606 res->verf->committed = NFS_FILE_SYNC; 606 res->verf->committed = NFS_FILE_SYNC;
607 return nfs_xdr_attrstat(req, p, res->fattr); 607 return nfs_xdr_attrstat(req, p, res->fattr);
@@ -611,7 +611,7 @@ nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
611 * Decode STATFS reply 611 * Decode STATFS reply
612 */ 612 */
613static int 613static int
614nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_fsstat *res) 614nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
615{ 615{
616 int status; 616 int status;
617 617
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3b234d4601e7..e5f128ffc32d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -668,7 +668,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
668{ 668{
669 struct inode *dir = dentry->d_inode; 669 struct inode *dir = dentry->d_inode;
670 struct nfs_fattr dir_attr; 670 struct nfs_fattr dir_attr;
671 u32 *verf = NFS_COOKIEVERF(dir); 671 __be32 *verf = NFS_COOKIEVERF(dir);
672 struct nfs3_readdirargs arg = { 672 struct nfs3_readdirargs arg = {
673 .fh = NFS_FH(dir), 673 .fh = NFS_FH(dir),
674 .cookie = cookie, 674 .cookie = cookie,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 16556fa4effb..0ace092d126f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -105,14 +105,14 @@ static struct {
105/* 105/*
106 * Common NFS XDR functions as inlines 106 * Common NFS XDR functions as inlines
107 */ 107 */
108static inline u32 * 108static inline __be32 *
109xdr_encode_fhandle(u32 *p, struct nfs_fh *fh) 109xdr_encode_fhandle(__be32 *p, struct nfs_fh *fh)
110{ 110{
111 return xdr_encode_array(p, fh->data, fh->size); 111 return xdr_encode_array(p, fh->data, fh->size);
112} 112}
113 113
114static inline u32 * 114static inline __be32 *
115xdr_decode_fhandle(u32 *p, struct nfs_fh *fh) 115xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
116{ 116{
117 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) { 117 if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
118 memcpy(fh->data, p, fh->size); 118 memcpy(fh->data, p, fh->size);
@@ -124,24 +124,24 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
124/* 124/*
125 * Encode/decode time. 125 * Encode/decode time.
126 */ 126 */
127static inline u32 * 127static inline __be32 *
128xdr_encode_time3(u32 *p, struct timespec *timep) 128xdr_encode_time3(__be32 *p, struct timespec *timep)
129{ 129{
130 *p++ = htonl(timep->tv_sec); 130 *p++ = htonl(timep->tv_sec);
131 *p++ = htonl(timep->tv_nsec); 131 *p++ = htonl(timep->tv_nsec);
132 return p; 132 return p;
133} 133}
134 134
135static inline u32 * 135static inline __be32 *
136xdr_decode_time3(u32 *p, struct timespec *timep) 136xdr_decode_time3(__be32 *p, struct timespec *timep)
137{ 137{
138 timep->tv_sec = ntohl(*p++); 138 timep->tv_sec = ntohl(*p++);
139 timep->tv_nsec = ntohl(*p++); 139 timep->tv_nsec = ntohl(*p++);
140 return p; 140 return p;
141} 141}
142 142
143static u32 * 143static __be32 *
144xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) 144xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
145{ 145{
146 unsigned int type, major, minor; 146 unsigned int type, major, minor;
147 int fmode; 147 int fmode;
@@ -177,8 +177,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
177 return p; 177 return p;
178} 178}
179 179
180static inline u32 * 180static inline __be32 *
181xdr_encode_sattr(u32 *p, struct iattr *attr) 181xdr_encode_sattr(__be32 *p, struct iattr *attr)
182{ 182{
183 if (attr->ia_valid & ATTR_MODE) { 183 if (attr->ia_valid & ATTR_MODE) {
184 *p++ = xdr_one; 184 *p++ = xdr_one;
@@ -223,8 +223,8 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
223 return p; 223 return p;
224} 224}
225 225
226static inline u32 * 226static inline __be32 *
227xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr) 227xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
228{ 228{
229 p = xdr_decode_hyper(p, &fattr->pre_size); 229 p = xdr_decode_hyper(p, &fattr->pre_size);
230 p = xdr_decode_time3(p, &fattr->pre_mtime); 230 p = xdr_decode_time3(p, &fattr->pre_mtime);
@@ -233,16 +233,16 @@ xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
233 return p; 233 return p;
234} 234}
235 235
236static inline u32 * 236static inline __be32 *
237xdr_decode_post_op_attr(u32 *p, struct nfs_fattr *fattr) 237xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
238{ 238{
239 if (*p++) 239 if (*p++)
240 p = xdr_decode_fattr(p, fattr); 240 p = xdr_decode_fattr(p, fattr);
241 return p; 241 return p;
242} 242}
243 243
244static inline u32 * 244static inline __be32 *
245xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr) 245xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
246{ 246{
247 if (*p++) 247 if (*p++)
248 return xdr_decode_wcc_attr(p, fattr); 248 return xdr_decode_wcc_attr(p, fattr);
@@ -250,8 +250,8 @@ xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
250} 250}
251 251
252 252
253static inline u32 * 253static inline __be32 *
254xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr) 254xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
255{ 255{
256 p = xdr_decode_pre_op_attr(p, fattr); 256 p = xdr_decode_pre_op_attr(p, fattr);
257 return xdr_decode_post_op_attr(p, fattr); 257 return xdr_decode_post_op_attr(p, fattr);
@@ -265,7 +265,7 @@ xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
265 * Encode file handle argument 265 * Encode file handle argument
266 */ 266 */
267static int 267static int
268nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh) 268nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
269{ 269{
270 p = xdr_encode_fhandle(p, fh); 270 p = xdr_encode_fhandle(p, fh);
271 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 271 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -276,7 +276,7 @@ nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
276 * Encode SETATTR arguments 276 * Encode SETATTR arguments
277 */ 277 */
278static int 278static int
279nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args) 279nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
280{ 280{
281 p = xdr_encode_fhandle(p, args->fh); 281 p = xdr_encode_fhandle(p, args->fh);
282 p = xdr_encode_sattr(p, args->sattr); 282 p = xdr_encode_sattr(p, args->sattr);
@@ -291,7 +291,7 @@ nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
291 * Encode directory ops argument 291 * Encode directory ops argument
292 */ 292 */
293static int 293static int
294nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args) 294nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
295{ 295{
296 p = xdr_encode_fhandle(p, args->fh); 296 p = xdr_encode_fhandle(p, args->fh);
297 p = xdr_encode_array(p, args->name, args->len); 297 p = xdr_encode_array(p, args->name, args->len);
@@ -303,7 +303,7 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
303 * Encode access() argument 303 * Encode access() argument
304 */ 304 */
305static int 305static int
306nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args) 306nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
307{ 307{
308 p = xdr_encode_fhandle(p, args->fh); 308 p = xdr_encode_fhandle(p, args->fh);
309 *p++ = htonl(args->access); 309 *p++ = htonl(args->access);
@@ -317,7 +317,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
317 * exactly to the page we want to fetch. 317 * exactly to the page we want to fetch.
318 */ 318 */
319static int 319static int
320nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args) 320nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
321{ 321{
322 struct rpc_auth *auth = req->rq_task->tk_auth; 322 struct rpc_auth *auth = req->rq_task->tk_auth;
323 unsigned int replen; 323 unsigned int replen;
@@ -339,7 +339,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
339 * Write arguments. Splice the buffer to be written into the iovec. 339 * Write arguments. Splice the buffer to be written into the iovec.
340 */ 340 */
341static int 341static int
342nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 342nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
343{ 343{
344 struct xdr_buf *sndbuf = &req->rq_snd_buf; 344 struct xdr_buf *sndbuf = &req->rq_snd_buf;
345 u32 count = args->count; 345 u32 count = args->count;
@@ -360,7 +360,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
360 * Encode CREATE arguments 360 * Encode CREATE arguments
361 */ 361 */
362static int 362static int
363nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args) 363nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
364{ 364{
365 p = xdr_encode_fhandle(p, args->fh); 365 p = xdr_encode_fhandle(p, args->fh);
366 p = xdr_encode_array(p, args->name, args->len); 366 p = xdr_encode_array(p, args->name, args->len);
@@ -380,7 +380,7 @@ nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
380 * Encode MKDIR arguments 380 * Encode MKDIR arguments
381 */ 381 */
382static int 382static int
383nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args) 383nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
384{ 384{
385 p = xdr_encode_fhandle(p, args->fh); 385 p = xdr_encode_fhandle(p, args->fh);
386 p = xdr_encode_array(p, args->name, args->len); 386 p = xdr_encode_array(p, args->name, args->len);
@@ -393,7 +393,7 @@ nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
393 * Encode SYMLINK arguments 393 * Encode SYMLINK arguments
394 */ 394 */
395static int 395static int
396nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args) 396nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
397{ 397{
398 p = xdr_encode_fhandle(p, args->fromfh); 398 p = xdr_encode_fhandle(p, args->fromfh);
399 p = xdr_encode_array(p, args->fromname, args->fromlen); 399 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -410,7 +410,7 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
410 * Encode MKNOD arguments 410 * Encode MKNOD arguments
411 */ 411 */
412static int 412static int
413nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args) 413nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
414{ 414{
415 p = xdr_encode_fhandle(p, args->fh); 415 p = xdr_encode_fhandle(p, args->fh);
416 p = xdr_encode_array(p, args->name, args->len); 416 p = xdr_encode_array(p, args->name, args->len);
@@ -429,7 +429,7 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
429 * Encode RENAME arguments 429 * Encode RENAME arguments
430 */ 430 */
431static int 431static int
432nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args) 432nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
433{ 433{
434 p = xdr_encode_fhandle(p, args->fromfh); 434 p = xdr_encode_fhandle(p, args->fromfh);
435 p = xdr_encode_array(p, args->fromname, args->fromlen); 435 p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -443,7 +443,7 @@ nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
443 * Encode LINK arguments 443 * Encode LINK arguments
444 */ 444 */
445static int 445static int
446nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args) 446nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
447{ 447{
448 p = xdr_encode_fhandle(p, args->fromfh); 448 p = xdr_encode_fhandle(p, args->fromfh);
449 p = xdr_encode_fhandle(p, args->tofh); 449 p = xdr_encode_fhandle(p, args->tofh);
@@ -456,7 +456,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
456 * Encode arguments to readdir call 456 * Encode arguments to readdir call
457 */ 457 */
458static int 458static int
459nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args) 459nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
460{ 460{
461 struct rpc_auth *auth = req->rq_task->tk_auth; 461 struct rpc_auth *auth = req->rq_task->tk_auth;
462 unsigned int replen; 462 unsigned int replen;
@@ -485,7 +485,7 @@ nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args
485 * We just check for syntactical correctness. 485 * We just check for syntactical correctness.
486 */ 486 */
487static int 487static int
488nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res) 488nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
489{ 489{
490 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 490 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
491 struct kvec *iov = rcvbuf->head; 491 struct kvec *iov = rcvbuf->head;
@@ -493,7 +493,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
493 int hdrlen, recvd; 493 int hdrlen, recvd;
494 int status, nr; 494 int status, nr;
495 unsigned int len, pglen; 495 unsigned int len, pglen;
496 u32 *entry, *end, *kaddr; 496 __be32 *entry, *end, *kaddr;
497 497
498 status = ntohl(*p++); 498 status = ntohl(*p++);
499 /* Decode post_op_attrs */ 499 /* Decode post_op_attrs */
@@ -523,8 +523,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
523 if (pglen > recvd) 523 if (pglen > recvd)
524 pglen = recvd; 524 pglen = recvd;
525 page = rcvbuf->pages; 525 page = rcvbuf->pages;
526 kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0); 526 kaddr = p = kmap_atomic(*page, KM_USER0);
527 end = (u32 *)((char *)p + pglen); 527 end = (__be32 *)((char *)p + pglen);
528 entry = p; 528 entry = p;
529 for (nr = 0; *p++; nr++) { 529 for (nr = 0; *p++; nr++) {
530 if (p + 3 > end) 530 if (p + 3 > end)
@@ -583,8 +583,8 @@ err_unmap:
583 goto out; 583 goto out;
584} 584}
585 585
586u32 * 586__be32 *
587nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) 587nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
588{ 588{
589 struct nfs_entry old = *entry; 589 struct nfs_entry old = *entry;
590 590
@@ -626,7 +626,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
626 * Encode COMMIT arguments 626 * Encode COMMIT arguments
627 */ 627 */
628static int 628static int
629nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) 629nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
630{ 630{
631 p = xdr_encode_fhandle(p, args->fh); 631 p = xdr_encode_fhandle(p, args->fh);
632 p = xdr_encode_hyper(p, args->offset); 632 p = xdr_encode_hyper(p, args->offset);
@@ -640,7 +640,7 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
640 * Encode GETACL arguments 640 * Encode GETACL arguments
641 */ 641 */
642static int 642static int
643nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p, 643nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
644 struct nfs3_getaclargs *args) 644 struct nfs3_getaclargs *args)
645{ 645{
646 struct rpc_auth *auth = req->rq_task->tk_auth; 646 struct rpc_auth *auth = req->rq_task->tk_auth;
@@ -664,7 +664,7 @@ nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
664 * Encode SETACL arguments 664 * Encode SETACL arguments
665 */ 665 */
666static int 666static int
667nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p, 667nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
668 struct nfs3_setaclargs *args) 668 struct nfs3_setaclargs *args)
669{ 669{
670 struct xdr_buf *buf = &req->rq_snd_buf; 670 struct xdr_buf *buf = &req->rq_snd_buf;
@@ -711,7 +711,7 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
711 * Decode attrstat reply. 711 * Decode attrstat reply.
712 */ 712 */
713static int 713static int
714nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 714nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
715{ 715{
716 int status; 716 int status;
717 717
@@ -726,7 +726,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
726 * SATTR, REMOVE, RMDIR 726 * SATTR, REMOVE, RMDIR
727 */ 727 */
728static int 728static int
729nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 729nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
730{ 730{
731 int status; 731 int status;
732 732
@@ -740,7 +740,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
740 * Decode LOOKUP reply 740 * Decode LOOKUP reply
741 */ 741 */
742static int 742static int
743nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res) 743nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
744{ 744{
745 int status; 745 int status;
746 746
@@ -759,7 +759,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
759 * Decode ACCESS reply 759 * Decode ACCESS reply
760 */ 760 */
761static int 761static int
762nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res) 762nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
763{ 763{
764 int status = ntohl(*p++); 764 int status = ntohl(*p++);
765 765
@@ -771,7 +771,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
771} 771}
772 772
773static int 773static int
774nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *args) 774nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
775{ 775{
776 struct rpc_auth *auth = req->rq_task->tk_auth; 776 struct rpc_auth *auth = req->rq_task->tk_auth;
777 unsigned int replen; 777 unsigned int replen;
@@ -789,7 +789,7 @@ nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *ar
789 * Decode READLINK reply 789 * Decode READLINK reply
790 */ 790 */
791static int 791static int
792nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 792nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
793{ 793{
794 struct xdr_buf *rcvbuf = &req->rq_rcv_buf; 794 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
795 struct kvec *iov = rcvbuf->head; 795 struct kvec *iov = rcvbuf->head;
@@ -837,7 +837,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
837 * Decode READ reply 837 * Decode READ reply
838 */ 838 */
839static int 839static int
840nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res) 840nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
841{ 841{
842 struct kvec *iov = req->rq_rcv_buf.head; 842 struct kvec *iov = req->rq_rcv_buf.head;
843 int status, count, ocount, recvd, hdrlen; 843 int status, count, ocount, recvd, hdrlen;
@@ -888,7 +888,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
888 * Decode WRITE response 888 * Decode WRITE response
889 */ 889 */
890static int 890static int
891nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 891nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
892{ 892{
893 int status; 893 int status;
894 894
@@ -910,7 +910,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
910 * Decode a CREATE response 910 * Decode a CREATE response
911 */ 911 */
912static int 912static int
913nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res) 913nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
914{ 914{
915 int status; 915 int status;
916 916
@@ -937,7 +937,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
937 * Decode RENAME reply 937 * Decode RENAME reply
938 */ 938 */
939static int 939static int
940nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res) 940nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
941{ 941{
942 int status; 942 int status;
943 943
@@ -952,7 +952,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
952 * Decode LINK reply 952 * Decode LINK reply
953 */ 953 */
954static int 954static int
955nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res) 955nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
956{ 956{
957 int status; 957 int status;
958 958
@@ -967,7 +967,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
967 * Decode FSSTAT reply 967 * Decode FSSTAT reply
968 */ 968 */
969static int 969static int
970nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res) 970nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
971{ 971{
972 int status; 972 int status;
973 973
@@ -992,7 +992,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
992 * Decode FSINFO reply 992 * Decode FSINFO reply
993 */ 993 */
994static int 994static int
995nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) 995nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
996{ 996{
997 int status; 997 int status;
998 998
@@ -1020,7 +1020,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
1020 * Decode PATHCONF reply 1020 * Decode PATHCONF reply
1021 */ 1021 */
1022static int 1022static int
1023nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res) 1023nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
1024{ 1024{
1025 int status; 1025 int status;
1026 1026
@@ -1040,7 +1040,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
1040 * Decode COMMIT reply 1040 * Decode COMMIT reply
1041 */ 1041 */
1042static int 1042static int
1043nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) 1043nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
1044{ 1044{
1045 int status; 1045 int status;
1046 1046
@@ -1059,7 +1059,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
1059 * Decode GETACL reply 1059 * Decode GETACL reply
1060 */ 1060 */
1061static int 1061static int
1062nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p, 1062nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
1063 struct nfs3_getaclres *res) 1063 struct nfs3_getaclres *res)
1064{ 1064{
1065 struct xdr_buf *buf = &req->rq_rcv_buf; 1065 struct xdr_buf *buf = &req->rq_rcv_buf;
@@ -1091,7 +1091,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
1091 * Decode setacl reply. 1091 * Decode setacl reply.
1092 */ 1092 */
1093static int 1093static int
1094nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) 1094nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
1095{ 1095{
1096 int status = ntohl(*p++); 1096 int status = ntohl(*p++);
1097 1097
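
Everything in the fs/nfs/nfs3xdr.c hunks above is one mechanical sweep: each pointer that walks raw XDR words moves from u32 to __be32, the big-endian type that sparse can check. Under __CHECKER__, linux/types.h declares __be32 as a __bitwise restricted type, so mixing it with host-order integers without ntohl()/htonl() (or an explicit __force cast) draws a warning from sparse (make C=1), while plain gcc compiles the code exactly as before. A minimal sketch of what the annotation buys; the function names here are illustrative, not from the patch:

#include <linux/types.h>	/* u32, __be32 (__bitwise under __CHECKER__) */
#include <asm/byteorder.h>	/* ntohl(), htonl() */

static u32 good_count(__be32 *p)
{
	return ntohl(*p);	/* ntohl() takes __be32 and returns host order */
}

static u32 bad_count(__be32 *p)
{
	return *p;		/* sparse: restricted __be32 used as plain integer */
}

Since the two types are identical once __CHECKER__ is off, a tree-wide annotation patch like this one changes no generated code.
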
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61095fe4b5ca..6f346677332d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -212,7 +212,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
212extern const nfs4_stateid zero_stateid; 212extern const nfs4_stateid zero_stateid;
213 213
214/* nfs4xdr.c */ 214/* nfs4xdr.c */
215extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus); 215extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
216extern struct rpc_procinfo nfs4_procedures[]; 216extern struct rpc_procinfo nfs4_procedures[];
217 217
218struct nfs4_mount_data; 218struct nfs4_mount_data;
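
The one-line prototype change above only builds because the definition in fs/nfs/nfs4xdr.c, and the function-pointer slot it is stored in, flip in the same commit; gcc rejects a definition whose parameter types disagree with a visible declaration. A sketch of the shared slot, assuming the struct nfs_rpc_ops layout of this era (other members elided):

struct nfs_rpc_ops {
	/* ... */
	__be32 *(*decode_dirent)(__be32 *, struct nfs_entry *, int plus);
	/* ... */
};

Because v2, v3 and v4 all populate the same slot, the header and every nfs*xdr.c file have to convert in lockstep, which is why this commit touches them together.
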
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24e47f3bbd17..b872779d7cd5 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -7,8 +7,6 @@
7 * NFSv4 namespace 7 * NFSv4 namespace
8 */ 8 */
9 9
10#include <linux/config.h>
11
12#include <linux/dcache.h> 10#include <linux/dcache.h>
13#include <linux/mount.h> 11#include <linux/mount.h>
14#include <linux/namei.h> 12#include <linux/namei.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47c7e6e3910d..8118036cc449 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -138,10 +138,10 @@ const u32 nfs4_fs_locations_bitmap[2] = {
138 | FATTR4_WORD1_MOUNTED_ON_FILEID 138 | FATTR4_WORD1_MOUNTED_ON_FILEID
139}; 139};
140 140
141static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, 141static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
142 struct nfs4_readdir_arg *readdir) 142 struct nfs4_readdir_arg *readdir)
143{ 143{
144 u32 *start, *p; 144 __be32 *start, *p;
145 145
146 BUG_ON(readdir->count < 80); 146 BUG_ON(readdir->count < 80);
147 if (cookie > 2) { 147 if (cookie > 2) {
@@ -162,7 +162,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
162 * when talking to the server, we always send cookie 0 162 * when talking to the server, we always send cookie 0
163 * instead of 1 or 2. 163 * instead of 1 or 2.
164 */ 164 */
165 start = p = (u32 *)kmap_atomic(*readdir->pages, KM_USER0); 165 start = p = kmap_atomic(*readdir->pages, KM_USER0);
166 166
167 if (cookie == 0) { 167 if (cookie == 0) {
168 *p++ = xdr_one; /* next */ 168 *p++ = xdr_one; /* next */
@@ -1314,11 +1314,9 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1314 case -EROFS: 1314 case -EROFS:
1315 lookup_instantiate_filp(nd, (struct dentry *)state, NULL); 1315 lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
1316 return 1; 1316 return 1;
1317 case -ENOENT: 1317 default:
1318 if (dentry->d_inode == NULL) 1318 goto out_drop;
1319 return 1;
1320 } 1319 }
1321 goto out_drop;
1322 } 1320 }
1323 if (state->inode == dentry->d_inode) { 1321 if (state->inode == dentry->d_inode) {
1324 nfs4_intent_set_file(nd, dentry, state); 1322 nfs4_intent_set_file(nd, dentry, state);
@@ -2917,11 +2915,11 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
2917 .rpc_resp = clp, 2915 .rpc_resp = clp,
2918 .rpc_cred = cred, 2916 .rpc_cred = cred,
2919 }; 2917 };
2920 u32 *p; 2918 __be32 *p;
2921 int loop = 0; 2919 int loop = 0;
2922 int status; 2920 int status;
2923 2921
2924 p = (u32*)sc_verifier.data; 2922 p = (__be32*)sc_verifier.data;
2925 *p++ = htonl((u32)clp->cl_boot_time.tv_sec); 2923 *p++ = htonl((u32)clp->cl_boot_time.tv_sec);
2926 *p = htonl((u32)clp->cl_boot_time.tv_nsec); 2924 *p = htonl((u32)clp->cl_boot_time.tv_nsec);
2927 2925
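
Two details in the fs/nfs/nfs4proc.c hunks above deserve a note. First, kmap_atomic() returns void *, which converts implicitly to any object pointer in C, so once start and p are __be32 * the old (u32 *) cast is simply dropped rather than rewritten. Second, the SETCLIENTID verifier is built by storing two big-endian words into an opaque 8-byte buffer, and htonl() returns __be32, so the cursor has to be __be32 * for sparse to stay quiet. A sketch of that pattern, mirroring the hunk above (the helper name is illustrative):

static void fill_boot_verifier(nfs4_verifier *verf, const struct timespec *boot)
{
	__be32 *p = (__be32 *)verf->data;

	*p++ = htonl((u32)boot->tv_sec);	/* bytes 0-3, big endian */
	*p = htonl((u32)boot->tv_nsec);		/* bytes 4-7, big endian */
}
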
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3dd413f52da1..0cf3fa312a33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -471,7 +471,7 @@ struct compound_hdr {
471 471
472static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 472static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
473{ 473{
474 uint32_t *p; 474 __be32 *p;
475 475
476 p = xdr_reserve_space(xdr, 4 + len); 476 p = xdr_reserve_space(xdr, 4 + len);
477 BUG_ON(p == NULL); 477 BUG_ON(p == NULL);
@@ -480,7 +480,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
480 480
481static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 481static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
482{ 482{
483 uint32_t *p; 483 __be32 *p;
484 484
485 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 485 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
486 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 486 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
@@ -494,7 +494,7 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
494 494
495static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 495static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
496{ 496{
497 uint32_t *p; 497 __be32 *p;
498 498
499 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); 499 p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
500 BUG_ON(p == NULL); 500 BUG_ON(p == NULL);
@@ -507,8 +507,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
507 char owner_group[IDMAP_NAMESZ]; 507 char owner_group[IDMAP_NAMESZ];
508 int owner_namelen = 0; 508 int owner_namelen = 0;
509 int owner_grouplen = 0; 509 int owner_grouplen = 0;
510 uint32_t *p; 510 __be32 *p;
511 uint32_t *q; 511 __be32 *q;
512 int len; 512 int len;
513 uint32_t bmval0 = 0; 513 uint32_t bmval0 = 0;
514 uint32_t bmval1 = 0; 514 uint32_t bmval1 = 0;
@@ -630,7 +630,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
630 630
631static int encode_access(struct xdr_stream *xdr, u32 access) 631static int encode_access(struct xdr_stream *xdr, u32 access)
632{ 632{
633 uint32_t *p; 633 __be32 *p;
634 634
635 RESERVE_SPACE(8); 635 RESERVE_SPACE(8);
636 WRITE32(OP_ACCESS); 636 WRITE32(OP_ACCESS);
@@ -641,7 +641,7 @@ static int encode_access(struct xdr_stream *xdr, u32 access)
641 641
642static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 642static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
643{ 643{
644 uint32_t *p; 644 __be32 *p;
645 645
646 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 646 RESERVE_SPACE(8+sizeof(arg->stateid->data));
647 WRITE32(OP_CLOSE); 647 WRITE32(OP_CLOSE);
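
encode_access() above and every encode_* helper below lean on the same two macros, which is why each needs its cursor retyped: RESERVE_SPACE() points p at freshly reserved words in the xdr_stream, and WRITE32() stores through it after byte-swapping. Their definitions in fs/nfs/nfs4xdr.c of this era are approximately:

#define RESERVE_SPACE(nbytes)	do {			\
	p = xdr_reserve_space(xdr, nbytes);		\
	BUG_ON(!p);					\
} while (0)
#define WRITE32(n)	*p++ = htonl(n)

htonl() yields __be32, so the store in WRITE32() only type-checks under sparse when the local p is declared __be32 *; that is the entire content of the hunks that follow.
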
@@ -653,7 +653,7 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
653 653
654static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 654static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args)
655{ 655{
656 uint32_t *p; 656 __be32 *p;
657 657
658 RESERVE_SPACE(16); 658 RESERVE_SPACE(16);
659 WRITE32(OP_COMMIT); 659 WRITE32(OP_COMMIT);
@@ -665,7 +665,7 @@ static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *arg
665 665
666static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 666static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create)
667{ 667{
668 uint32_t *p; 668 __be32 *p;
669 669
670 RESERVE_SPACE(8); 670 RESERVE_SPACE(8);
671 WRITE32(OP_CREATE); 671 WRITE32(OP_CREATE);
@@ -697,7 +697,7 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
697 697
698static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 698static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
699{ 699{
700 uint32_t *p; 700 __be32 *p;
701 701
702 RESERVE_SPACE(12); 702 RESERVE_SPACE(12);
703 WRITE32(OP_GETATTR); 703 WRITE32(OP_GETATTR);
@@ -708,7 +708,7 @@ static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
708 708
709static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 709static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1)
710{ 710{
711 uint32_t *p; 711 __be32 *p;
712 712
713 RESERVE_SPACE(16); 713 RESERVE_SPACE(16);
714 WRITE32(OP_GETATTR); 714 WRITE32(OP_GETATTR);
@@ -740,7 +740,7 @@ static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
740 740
741static int encode_getfh(struct xdr_stream *xdr) 741static int encode_getfh(struct xdr_stream *xdr)
742{ 742{
743 uint32_t *p; 743 __be32 *p;
744 744
745 RESERVE_SPACE(4); 745 RESERVE_SPACE(4);
746 WRITE32(OP_GETFH); 746 WRITE32(OP_GETFH);
@@ -750,7 +750,7 @@ static int encode_getfh(struct xdr_stream *xdr)
750 750
751static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 751static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
752{ 752{
753 uint32_t *p; 753 __be32 *p;
754 754
755 RESERVE_SPACE(8 + name->len); 755 RESERVE_SPACE(8 + name->len);
756 WRITE32(OP_LINK); 756 WRITE32(OP_LINK);
@@ -780,7 +780,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
780 */ 780 */
781static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 781static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
782{ 782{
783 uint32_t *p; 783 __be32 *p;
784 784
785 RESERVE_SPACE(32); 785 RESERVE_SPACE(32);
786 WRITE32(OP_LOCK); 786 WRITE32(OP_LOCK);
@@ -809,7 +809,7 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
809 809
810static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 810static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
811{ 811{
812 uint32_t *p; 812 __be32 *p;
813 813
814 RESERVE_SPACE(40); 814 RESERVE_SPACE(40);
815 WRITE32(OP_LOCKT); 815 WRITE32(OP_LOCKT);
@@ -825,7 +825,7 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
825 825
826static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 826static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
827{ 827{
828 uint32_t *p; 828 __be32 *p;
829 829
830 RESERVE_SPACE(44); 830 RESERVE_SPACE(44);
831 WRITE32(OP_LOCKU); 831 WRITE32(OP_LOCKU);
@@ -841,7 +841,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
841static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 841static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
842{ 842{
843 int len = name->len; 843 int len = name->len;
844 uint32_t *p; 844 __be32 *p;
845 845
846 RESERVE_SPACE(8 + len); 846 RESERVE_SPACE(8 + len);
847 WRITE32(OP_LOOKUP); 847 WRITE32(OP_LOOKUP);
@@ -853,7 +853,7 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
853 853
854static void encode_share_access(struct xdr_stream *xdr, int open_flags) 854static void encode_share_access(struct xdr_stream *xdr, int open_flags)
855{ 855{
856 uint32_t *p; 856 __be32 *p;
857 857
858 RESERVE_SPACE(8); 858 RESERVE_SPACE(8);
859 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 859 switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
@@ -874,7 +874,7 @@ static void encode_share_access(struct xdr_stream *xdr, int open_flags)
874 874
875static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) 875static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
876{ 876{
877 uint32_t *p; 877 __be32 *p;
878 /* 878 /*
879 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 879 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
880 * owner 4 = 32 880 * owner 4 = 32
@@ -891,7 +891,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
891 891
892static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 892static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
893{ 893{
894 uint32_t *p; 894 __be32 *p;
895 895
896 RESERVE_SPACE(4); 896 RESERVE_SPACE(4);
897 switch(arg->open_flags & O_EXCL) { 897 switch(arg->open_flags & O_EXCL) {
@@ -907,7 +907,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
907 907
908static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) 908static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
909{ 909{
910 uint32_t *p; 910 __be32 *p;
911 911
912 RESERVE_SPACE(4); 912 RESERVE_SPACE(4);
913 switch (arg->open_flags & O_CREAT) { 913 switch (arg->open_flags & O_CREAT) {
@@ -923,7 +923,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
923 923
924static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 924static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
925{ 925{
926 uint32_t *p; 926 __be32 *p;
927 927
928 RESERVE_SPACE(4); 928 RESERVE_SPACE(4);
929 switch (delegation_type) { 929 switch (delegation_type) {
@@ -943,7 +943,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation
943 943
944static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) 944static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
945{ 945{
946 uint32_t *p; 946 __be32 *p;
947 947
948 RESERVE_SPACE(4); 948 RESERVE_SPACE(4);
949 WRITE32(NFS4_OPEN_CLAIM_NULL); 949 WRITE32(NFS4_OPEN_CLAIM_NULL);
@@ -952,7 +952,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
952 952
953static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 953static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
954{ 954{
955 uint32_t *p; 955 __be32 *p;
956 956
957 RESERVE_SPACE(4); 957 RESERVE_SPACE(4);
958 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); 958 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
@@ -961,7 +961,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
961 961
962static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) 962static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
963{ 963{
964 uint32_t *p; 964 __be32 *p;
965 965
966 RESERVE_SPACE(4+sizeof(stateid->data)); 966 RESERVE_SPACE(4+sizeof(stateid->data));
967 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 967 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
@@ -991,7 +991,7 @@ static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
991 991
992static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 992static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
993{ 993{
994 uint32_t *p; 994 __be32 *p;
995 995
996 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 996 RESERVE_SPACE(8+sizeof(arg->stateid->data));
997 WRITE32(OP_OPEN_CONFIRM); 997 WRITE32(OP_OPEN_CONFIRM);
@@ -1003,7 +1003,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1003 1003
1004static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1004static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
1005{ 1005{
1006 uint32_t *p; 1006 __be32 *p;
1007 1007
1008 RESERVE_SPACE(8+sizeof(arg->stateid->data)); 1008 RESERVE_SPACE(8+sizeof(arg->stateid->data));
1009 WRITE32(OP_OPEN_DOWNGRADE); 1009 WRITE32(OP_OPEN_DOWNGRADE);
@@ -1017,7 +1017,7 @@ static int
1017encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1017encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1018{ 1018{
1019 int len = fh->size; 1019 int len = fh->size;
1020 uint32_t *p; 1020 __be32 *p;
1021 1021
1022 RESERVE_SPACE(8 + len); 1022 RESERVE_SPACE(8 + len);
1023 WRITE32(OP_PUTFH); 1023 WRITE32(OP_PUTFH);
@@ -1029,7 +1029,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1029 1029
1030static int encode_putrootfh(struct xdr_stream *xdr) 1030static int encode_putrootfh(struct xdr_stream *xdr)
1031{ 1031{
1032 uint32_t *p; 1032 __be32 *p;
1033 1033
1034 RESERVE_SPACE(4); 1034 RESERVE_SPACE(4);
1035 WRITE32(OP_PUTROOTFH); 1035 WRITE32(OP_PUTROOTFH);
@@ -1040,7 +1040,7 @@ static int encode_putrootfh(struct xdr_stream *xdr)
1040static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1040static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
1041{ 1041{
1042 nfs4_stateid stateid; 1042 nfs4_stateid stateid;
1043 uint32_t *p; 1043 __be32 *p;
1044 1044
1045 RESERVE_SPACE(16); 1045 RESERVE_SPACE(16);
1046 if (ctx->state != NULL) { 1046 if (ctx->state != NULL) {
@@ -1052,7 +1052,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1052 1052
1053static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1053static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1054{ 1054{
1055 uint32_t *p; 1055 __be32 *p;
1056 1056
1057 RESERVE_SPACE(4); 1057 RESERVE_SPACE(4);
1058 WRITE32(OP_READ); 1058 WRITE32(OP_READ);
@@ -1074,7 +1074,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1074 FATTR4_WORD1_MOUNTED_ON_FILEID, 1074 FATTR4_WORD1_MOUNTED_ON_FILEID,
1075 }; 1075 };
1076 int replen; 1076 int replen;
1077 uint32_t *p; 1077 __be32 *p;
1078 1078
1079 RESERVE_SPACE(32+sizeof(nfs4_verifier)); 1079 RESERVE_SPACE(32+sizeof(nfs4_verifier));
1080 WRITE32(OP_READDIR); 1080 WRITE32(OP_READDIR);
@@ -1116,7 +1116,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
1116{ 1116{
1117 struct rpc_auth *auth = req->rq_task->tk_auth; 1117 struct rpc_auth *auth = req->rq_task->tk_auth;
1118 unsigned int replen; 1118 unsigned int replen;
1119 uint32_t *p; 1119 __be32 *p;
1120 1120
1121 RESERVE_SPACE(4); 1121 RESERVE_SPACE(4);
1122 WRITE32(OP_READLINK); 1122 WRITE32(OP_READLINK);
@@ -1134,7 +1134,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
1134 1134
1135static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1135static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1136{ 1136{
1137 uint32_t *p; 1137 __be32 *p;
1138 1138
1139 RESERVE_SPACE(8 + name->len); 1139 RESERVE_SPACE(8 + name->len);
1140 WRITE32(OP_REMOVE); 1140 WRITE32(OP_REMOVE);
@@ -1146,7 +1146,7 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1146 1146
1147static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1147static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname)
1148{ 1148{
1149 uint32_t *p; 1149 __be32 *p;
1150 1150
1151 RESERVE_SPACE(8 + oldname->len); 1151 RESERVE_SPACE(8 + oldname->len);
1152 WRITE32(OP_RENAME); 1152 WRITE32(OP_RENAME);
@@ -1162,7 +1162,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1162 1162
1163static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1163static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
1164{ 1164{
1165 uint32_t *p; 1165 __be32 *p;
1166 1166
1167 RESERVE_SPACE(12); 1167 RESERVE_SPACE(12);
1168 WRITE32(OP_RENEW); 1168 WRITE32(OP_RENEW);
@@ -1174,7 +1174,7 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_
1174static int 1174static int
1175encode_restorefh(struct xdr_stream *xdr) 1175encode_restorefh(struct xdr_stream *xdr)
1176{ 1176{
1177 uint32_t *p; 1177 __be32 *p;
1178 1178
1179 RESERVE_SPACE(4); 1179 RESERVE_SPACE(4);
1180 WRITE32(OP_RESTOREFH); 1180 WRITE32(OP_RESTOREFH);
@@ -1185,7 +1185,7 @@ encode_restorefh(struct xdr_stream *xdr)
1185static int 1185static int
1186encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1186encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1187{ 1187{
1188 uint32_t *p; 1188 __be32 *p;
1189 1189
1190 RESERVE_SPACE(4+sizeof(zero_stateid.data)); 1190 RESERVE_SPACE(4+sizeof(zero_stateid.data));
1191 WRITE32(OP_SETATTR); 1191 WRITE32(OP_SETATTR);
@@ -1204,7 +1204,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1204static int 1204static int
1205encode_savefh(struct xdr_stream *xdr) 1205encode_savefh(struct xdr_stream *xdr)
1206{ 1206{
1207 uint32_t *p; 1207 __be32 *p;
1208 1208
1209 RESERVE_SPACE(4); 1209 RESERVE_SPACE(4);
1210 WRITE32(OP_SAVEFH); 1210 WRITE32(OP_SAVEFH);
@@ -1215,7 +1215,7 @@ encode_savefh(struct xdr_stream *xdr)
1215static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1215static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server)
1216{ 1216{
1217 int status; 1217 int status;
1218 uint32_t *p; 1218 __be32 *p;
1219 1219
1220 RESERVE_SPACE(4+sizeof(arg->stateid.data)); 1220 RESERVE_SPACE(4+sizeof(arg->stateid.data));
1221 WRITE32(OP_SETATTR); 1221 WRITE32(OP_SETATTR);
@@ -1229,7 +1229,7 @@ static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *
1229 1229
1230static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1230static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
1231{ 1231{
1232 uint32_t *p; 1232 __be32 *p;
1233 1233
1234 RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data)); 1234 RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
1235 WRITE32(OP_SETCLIENTID); 1235 WRITE32(OP_SETCLIENTID);
@@ -1248,7 +1248,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1248 1248
1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
1250{ 1250{
1251 uint32_t *p; 1251 __be32 *p;
1252 1252
1253 RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data)); 1253 RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data));
1254 WRITE32(OP_SETCLIENTID_CONFIRM); 1254 WRITE32(OP_SETCLIENTID_CONFIRM);
@@ -1260,7 +1260,7 @@ static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_c
1260 1260
1261static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1261static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args)
1262{ 1262{
1263 uint32_t *p; 1263 __be32 *p;
1264 1264
1265 RESERVE_SPACE(4); 1265 RESERVE_SPACE(4);
1266 WRITE32(OP_WRITE); 1266 WRITE32(OP_WRITE);
@@ -1279,7 +1279,7 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1279 1279
1280static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1280static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
1281{ 1281{
1282 uint32_t *p; 1282 __be32 *p;
1283 1283
1284 RESERVE_SPACE(20); 1284 RESERVE_SPACE(20);
1285 1285
@@ -1295,7 +1295,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1295/* 1295/*
1296 * Encode an ACCESS request 1296 * Encode an ACCESS request
1297 */ 1297 */
1298static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct nfs4_accessargs *args) 1298static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
1299{ 1299{
1300 struct xdr_stream xdr; 1300 struct xdr_stream xdr;
1301 struct compound_hdr hdr = { 1301 struct compound_hdr hdr = {
@@ -1313,7 +1313,7 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct n
1313/* 1313/*
1314 * Encode LOOKUP request 1314 * Encode LOOKUP request
1315 */ 1315 */
1316static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_arg *args) 1316static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
1317{ 1317{
1318 struct xdr_stream xdr; 1318 struct xdr_stream xdr;
1319 struct compound_hdr hdr = { 1319 struct compound_hdr hdr = {
@@ -1337,7 +1337,7 @@ out:
1337/* 1337/*
1338 * Encode LOOKUP_ROOT request 1338 * Encode LOOKUP_ROOT request
1339 */ 1339 */
1340static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_root_arg *args) 1340static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
1341{ 1341{
1342 struct xdr_stream xdr; 1342 struct xdr_stream xdr;
1343 struct compound_hdr hdr = { 1343 struct compound_hdr hdr = {
@@ -1358,7 +1358,7 @@ out:
1358/* 1358/*
1359 * Encode REMOVE request 1359 * Encode REMOVE request
1360 */ 1360 */
1361static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct nfs4_remove_arg *args) 1361static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs4_remove_arg *args)
1362{ 1362{
1363 struct xdr_stream xdr; 1363 struct xdr_stream xdr;
1364 struct compound_hdr hdr = { 1364 struct compound_hdr hdr = {
@@ -1380,7 +1380,7 @@ out:
1380/* 1380/*
1381 * Encode RENAME request 1381 * Encode RENAME request
1382 */ 1382 */
1383static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct nfs4_rename_arg *args) 1383static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
1384{ 1384{
1385 struct xdr_stream xdr; 1385 struct xdr_stream xdr;
1386 struct compound_hdr hdr = { 1386 struct compound_hdr hdr = {
@@ -1410,7 +1410,7 @@ out:
1410/* 1410/*
1411 * Encode LINK request 1411 * Encode LINK request
1412 */ 1412 */
1413static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs4_link_arg *args) 1413static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
1414{ 1414{
1415 struct xdr_stream xdr; 1415 struct xdr_stream xdr;
1416 struct compound_hdr hdr = { 1416 struct compound_hdr hdr = {
@@ -1440,7 +1440,7 @@ out:
1440/* 1440/*
1441 * Encode CREATE request 1441 * Encode CREATE request
1442 */ 1442 */
1443static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args) 1443static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
1444{ 1444{
1445 struct xdr_stream xdr; 1445 struct xdr_stream xdr;
1446 struct compound_hdr hdr = { 1446 struct compound_hdr hdr = {
@@ -1470,7 +1470,7 @@ out:
1470/* 1470/*
1471 * Encode SYMLINK request 1471 * Encode SYMLINK request
1472 */ 1472 */
1473static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args) 1473static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
1474{ 1474{
1475 return nfs4_xdr_enc_create(req, p, args); 1475 return nfs4_xdr_enc_create(req, p, args);
1476} 1476}
@@ -1478,7 +1478,7 @@ static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct
1478/* 1478/*
1479 * Encode GETATTR request 1479 * Encode GETATTR request
1480 */ 1480 */
1481static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args) 1481static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
1482{ 1482{
1483 struct xdr_stream xdr; 1483 struct xdr_stream xdr;
1484 struct compound_hdr hdr = { 1484 struct compound_hdr hdr = {
@@ -1496,7 +1496,7 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct
1496/* 1496/*
1497 * Encode a CLOSE request 1497 * Encode a CLOSE request
1498 */ 1498 */
1499static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args) 1499static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1500{ 1500{
1501 struct xdr_stream xdr; 1501 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1502 struct compound_hdr hdr = {
@@ -1520,7 +1520,7 @@ out:
1520/* 1520/*
1521 * Encode an OPEN request 1521 * Encode an OPEN request
1522 */ 1522 */
1523static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args) 1523static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
1524{ 1524{
1525 struct xdr_stream xdr; 1525 struct xdr_stream xdr;
1526 struct compound_hdr hdr = { 1526 struct compound_hdr hdr = {
@@ -1556,7 +1556,7 @@ out:
1556/* 1556/*
1557 * Encode an OPEN_CONFIRM request 1557 * Encode an OPEN_CONFIRM request
1558 */ 1558 */
1559static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_open_confirmargs *args) 1559static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
1560{ 1560{
1561 struct xdr_stream xdr; 1561 struct xdr_stream xdr;
1562 struct compound_hdr hdr = { 1562 struct compound_hdr hdr = {
@@ -1577,7 +1577,7 @@ out:
1577/* 1577/*
1578 * Encode an OPEN request with no attributes. 1578 * Encode an OPEN request with no attributes.
1579 */ 1579 */
1580static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args) 1580static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
1581{ 1581{
1582 struct xdr_stream xdr; 1582 struct xdr_stream xdr;
1583 struct compound_hdr hdr = { 1583 struct compound_hdr hdr = {
@@ -1601,7 +1601,7 @@ out:
1601/* 1601/*
1602 * Encode an OPEN_DOWNGRADE request 1602 * Encode an OPEN_DOWNGRADE request
1603 */ 1603 */
1604static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args) 1604static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1605{ 1605{
1606 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1607 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
@@ -1625,7 +1625,7 @@ out:
1625/* 1625/*
1626 * Encode a LOCK request 1626 * Encode a LOCK request
1627 */ 1627 */
1628static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args) 1628static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
1629{ 1629{
1630 struct xdr_stream xdr; 1630 struct xdr_stream xdr;
1631 struct compound_hdr hdr = { 1631 struct compound_hdr hdr = {
@@ -1646,7 +1646,7 @@ out:
1646/* 1646/*
1647 * Encode a LOCKT request 1647 * Encode a LOCKT request
1648 */ 1648 */
1649static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args) 1649static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
1650{ 1650{
1651 struct xdr_stream xdr; 1651 struct xdr_stream xdr;
1652 struct compound_hdr hdr = { 1652 struct compound_hdr hdr = {
@@ -1667,7 +1667,7 @@ out:
1667/* 1667/*
1668 * Encode a LOCKU request 1668 * Encode a LOCKU request
1669 */ 1669 */
1670static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args) 1670static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
1671{ 1671{
1672 struct xdr_stream xdr; 1672 struct xdr_stream xdr;
1673 struct compound_hdr hdr = { 1673 struct compound_hdr hdr = {
@@ -1688,7 +1688,7 @@ out:
1688/* 1688/*
1689 * Encode a READLINK request 1689 * Encode a READLINK request
1690 */ 1690 */
1691static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readlink *args) 1691static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
1692{ 1692{
1693 struct xdr_stream xdr; 1693 struct xdr_stream xdr;
1694 struct compound_hdr hdr = { 1694 struct compound_hdr hdr = {
@@ -1709,7 +1709,7 @@ out:
1709/* 1709/*
1710 * Encode a READDIR request 1710 * Encode a READDIR request
1711 */ 1711 */
1712static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readdir_arg *args) 1712static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
1713{ 1713{
1714 struct xdr_stream xdr; 1714 struct xdr_stream xdr;
1715 struct compound_hdr hdr = { 1715 struct compound_hdr hdr = {
@@ -1730,7 +1730,7 @@ out:
1730/* 1730/*
1731 * Encode a READ request 1731 * Encode a READ request
1732 */ 1732 */
1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, uint32_t *p, struct nfs_readargs *args) 1733static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
1734{ 1734{
1735 struct rpc_auth *auth = req->rq_task->tk_auth; 1735 struct rpc_auth *auth = req->rq_task->tk_auth;
1736 struct xdr_stream xdr; 1736 struct xdr_stream xdr;
@@ -1762,7 +1762,7 @@ out:
1762/* 1762/*
1763 * Encode an SETATTR request 1763 * Encode an SETATTR request
1764 */ 1764 */
1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, uint32_t *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1766 1766
1767{ 1767{
1768 struct xdr_stream xdr; 1768 struct xdr_stream xdr;
@@ -1788,7 +1788,7 @@ out:
1788 * Encode a GETACL request 1788 * Encode a GETACL request
1789 */ 1789 */
1790static int 1790static int
1791nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p, 1791nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1792 struct nfs_getaclargs *args) 1792 struct nfs_getaclargs *args)
1793{ 1793{
1794 struct xdr_stream xdr; 1794 struct xdr_stream xdr;
@@ -1815,7 +1815,7 @@ out:
1815/* 1815/*
1816 * Encode a WRITE request 1816 * Encode a WRITE request
1817 */ 1817 */
1818static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) 1818static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
1819{ 1819{
1820 struct xdr_stream xdr; 1820 struct xdr_stream xdr;
1821 struct compound_hdr hdr = { 1821 struct compound_hdr hdr = {
@@ -1839,7 +1839,7 @@ out:
1839/* 1839/*
1840 * a COMMIT request 1840 * a COMMIT request
1841 */ 1841 */
1842static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) 1842static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
1843{ 1843{
1844 struct xdr_stream xdr; 1844 struct xdr_stream xdr;
1845 struct compound_hdr hdr = { 1845 struct compound_hdr hdr = {
@@ -1863,7 +1863,7 @@ out:
1863/* 1863/*
1864 * FSINFO request 1864 * FSINFO request
1865 */ 1865 */
1866static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fsinfo_arg *args) 1866static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
1867{ 1867{
1868 struct xdr_stream xdr; 1868 struct xdr_stream xdr;
1869 struct compound_hdr hdr = { 1869 struct compound_hdr hdr = {
@@ -1882,7 +1882,7 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs
1882/* 1882/*
1883 * a PATHCONF request 1883 * a PATHCONF request
1884 */ 1884 */
1885static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args) 1885static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
1886{ 1886{
1887 struct xdr_stream xdr; 1887 struct xdr_stream xdr;
1888 struct compound_hdr hdr = { 1888 struct compound_hdr hdr = {
@@ -1902,7 +1902,7 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct
1902/* 1902/*
1903 * a STATFS request 1903 * a STATFS request
1904 */ 1904 */
1905static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args) 1905static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
1906{ 1906{
1907 struct xdr_stream xdr; 1907 struct xdr_stream xdr;
1908 struct compound_hdr hdr = { 1908 struct compound_hdr hdr = {
@@ -1923,7 +1923,7 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct n
1923/* 1923/*
1924 * GETATTR_BITMAP request 1924 * GETATTR_BITMAP request
1925 */ 1925 */
1926static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const struct nfs_fh *fhandle) 1926static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
1927{ 1927{
1928 struct xdr_stream xdr; 1928 struct xdr_stream xdr;
1929 struct compound_hdr hdr = { 1929 struct compound_hdr hdr = {
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
1945/* 1945/*
1946 * a RENEW request 1946 * a RENEW request
1947 */ 1947 */
1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) 1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
1949{ 1949{
1950 struct xdr_stream xdr; 1950 struct xdr_stream xdr;
1951 struct compound_hdr hdr = { 1951 struct compound_hdr hdr = {
@@ -1960,7 +1960,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_clie
1960/* 1960/*
1961 * a SETCLIENTID request 1961 * a SETCLIENTID request
1962 */ 1962 */
1963static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nfs4_setclientid *sc) 1963static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
1964{ 1964{
1965 struct xdr_stream xdr; 1965 struct xdr_stream xdr;
1966 struct compound_hdr hdr = { 1966 struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
1975/* 1975/*
1976 * a SETCLIENTID_CONFIRM request 1976 * a SETCLIENTID_CONFIRM request
1977 */ 1977 */
1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) 1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
1979{ 1979{
1980 struct xdr_stream xdr; 1980 struct xdr_stream xdr;
1981 struct compound_hdr hdr = { 1981 struct compound_hdr hdr = {
@@ -1997,7 +1997,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
1997/* 1997/*
1998 * DELEGRETURN request 1998 * DELEGRETURN request
1999 */ 1999 */
2000static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args) 2000static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
2001{ 2001{
2002 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2003 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
@@ -2021,7 +2021,7 @@ out:
2021/* 2021/*
2022 * Encode FS_LOCATIONS request 2022 * Encode FS_LOCATIONS request
2023 */ 2023 */
2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args) 2024static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
2025{ 2025{
2026 struct xdr_stream xdr; 2026 struct xdr_stream xdr;
2027 struct compound_hdr hdr = { 2027 struct compound_hdr hdr = {
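
The decode helpers, which begin here, mirror the encoders with their own macro pair: READ_BUF() advances p across nbytes of inlined reply data and bails out on a short reply, and READ32() converts one word back to host order. Roughly, with the macro's debug printk elided:

#define READ32(x)	(x) = ntohl(*p++)
#define READ_BUF(nbytes)	do {			\
	p = xdr_inline_decode(xdr, nbytes);		\
	if (unlikely(!p))				\
		return -EIO;	/* hit end of receive buffer */	\
} while (0)

ntohl() consumes a __be32, so each decode_attr_* helper swaps its scratch pointer to __be32 * while the decoded values (bitmaps, lengths, ids) stay plain uint32_t; that split is why declarations like "uint32_t bmlen, *p;" become two lines in the hunks below.
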
@@ -2086,7 +2086,7 @@ out:
2086 2086
2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) 2087static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2088{ 2088{
2089 uint32_t *p; 2089 __be32 *p;
2090 2090
2091 READ_BUF(4); 2091 READ_BUF(4);
2092 READ32(*len); 2092 READ32(*len);
@@ -2097,7 +2097,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
2097 2097
2098static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 2098static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2099{ 2099{
2100 uint32_t *p; 2100 __be32 *p;
2101 2101
2102 READ_BUF(8); 2102 READ_BUF(8);
2103 READ32(hdr->status); 2103 READ32(hdr->status);
@@ -2112,7 +2112,7 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2112 2112
2113static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 2113static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2114{ 2114{
2115 uint32_t *p; 2115 __be32 *p;
2116 uint32_t opnum; 2116 uint32_t opnum;
2117 int32_t nfserr; 2117 int32_t nfserr;
2118 2118
@@ -2134,7 +2134,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2134/* Dummy routine */ 2134/* Dummy routine */
2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) 2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2136{ 2136{
2137 uint32_t *p; 2137 __be32 *p;
2138 unsigned int strlen; 2138 unsigned int strlen;
2139 char *str; 2139 char *str;
2140 2140
@@ -2144,7 +2144,8 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2144 2144
2145static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 2145static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2146{ 2146{
2147 uint32_t bmlen, *p; 2147 uint32_t bmlen;
2148 __be32 *p;
2148 2149
2149 READ_BUF(4); 2150 READ_BUF(4);
2150 READ32(bmlen); 2151 READ32(bmlen);
@@ -2159,9 +2160,9 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2159 return 0; 2160 return 0;
2160} 2161}
2161 2162
2162static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, uint32_t **savep) 2163static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
2163{ 2164{
2164 uint32_t *p; 2165 __be32 *p;
2165 2166
2166 READ_BUF(4); 2167 READ_BUF(4);
2167 READ32(*attrlen); 2168 READ32(*attrlen);
@@ -2182,7 +2183,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
2182 2183
2183static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type) 2184static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
2184{ 2185{
2185 uint32_t *p; 2186 __be32 *p;
2186 2187
2187 *type = 0; 2188 *type = 0;
2188 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2189 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2202,7 +2203,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2202 2203
2203static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2204static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
2204{ 2205{
2205 uint32_t *p; 2206 __be32 *p;
2206 2207
2207 *change = 0; 2208 *change = 0;
2208 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2209 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2219,7 +2220,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2219 2220
2220static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2221static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
2221{ 2222{
2222 uint32_t *p; 2223 __be32 *p;
2223 2224
2224 *size = 0; 2225 *size = 0;
2225 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2226 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2235,7 +2236,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2235 2236
2236static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2237static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2237{ 2238{
2238 uint32_t *p; 2239 __be32 *p;
2239 2240
2240 *res = 0; 2241 *res = 0;
2241 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) 2242 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
@@ -2251,7 +2252,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
2251 2252
2252static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2253static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2253{ 2254{
2254 uint32_t *p; 2255 __be32 *p;
2255 2256
2256 *res = 0; 2257 *res = 0;
2257 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) 2258 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
@@ -2267,7 +2268,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2267 2268
2268static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2269static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
2269{ 2270{
2270 uint32_t *p; 2271 __be32 *p;
2271 2272
2272 fsid->major = 0; 2273 fsid->major = 0;
2273 fsid->minor = 0; 2274 fsid->minor = 0;
@@ -2287,7 +2288,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2287 2288
2288static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2289static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2289{ 2290{
2290 uint32_t *p; 2291 __be32 *p;
2291 2292
2292 *res = 60; 2293 *res = 60;
2293 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) 2294 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
@@ -2303,7 +2304,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
2303 2304
2304static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2305static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
2305{ 2306{
2306 uint32_t *p; 2307 __be32 *p;
2307 2308
2308 *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL; 2309 *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
2309 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) 2310 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
@@ -2319,7 +2320,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2319 2320
2320static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2321static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2321{ 2322{
2322 uint32_t *p; 2323 __be32 *p;
2323 2324
2324 *fileid = 0; 2325 *fileid = 0;
2325 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2326 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2335,7 +2336,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2335 2336
2336static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2337static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
2337{ 2338{
2338 uint32_t *p; 2339 __be32 *p;
2339 2340
2340 *fileid = 0; 2341 *fileid = 0;
2341 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2342 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2351,7 +2352,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2351 2352
2352static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2353static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2353{ 2354{
2354 uint32_t *p; 2355 __be32 *p;
2355 int status = 0; 2356 int status = 0;
2356 2357
2357 *res = 0; 2358 *res = 0;
@@ -2368,7 +2369,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
2368 2369
2369static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2370static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2370{ 2371{
2371 uint32_t *p; 2372 __be32 *p;
2372 int status = 0; 2373 int status = 0;
2373 2374
2374 *res = 0; 2375 *res = 0;
@@ -2385,7 +2386,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
2385 2386
2386static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2387static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2387{ 2388{
2388 uint32_t *p; 2389 __be32 *p;
2389 int status = 0; 2390 int status = 0;
2390 2391
2391 *res = 0; 2392 *res = 0;
@@ -2403,7 +2404,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2403static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) 2404static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2404{ 2405{
2405 int n; 2406 int n;
2406 uint32_t *p; 2407 __be32 *p;
2407 int status = 0; 2408 int status = 0;
2408 2409
2409 READ_BUF(4); 2410 READ_BUF(4);
@@ -2448,7 +2449,7 @@ out_eio:
2448static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) 2449static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
2449{ 2450{
2450 int n; 2451 int n;
2451 uint32_t *p; 2452 __be32 *p;
2452 int status = -EIO; 2453 int status = -EIO;
2453 2454
2454 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U))) 2455 if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
@@ -2512,7 +2513,7 @@ out_eio:
2512 2513
2513static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2514static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
2514{ 2515{
2515 uint32_t *p; 2516 __be32 *p;
2516 int status = 0; 2517 int status = 0;
2517 2518
2518 *res = 0; 2519 *res = 0;
@@ -2529,7 +2530,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
2529 2530
2530static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) 2531static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
2531{ 2532{
2532 uint32_t *p; 2533 __be32 *p;
2533 int status = 0; 2534 int status = 0;
2534 2535
2535 *maxlink = 1; 2536 *maxlink = 1;
@@ -2546,7 +2547,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2546 2547
2547static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) 2548static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
2548{ 2549{
2549 uint32_t *p; 2550 __be32 *p;
 	int status = 0;

 	*maxname = 1024;
@@ -2563,7 +2564,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_

 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 1024;
@@ -2584,7 +2585,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_

 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 1024;
@@ -2605,7 +2606,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32

 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
 {
-	uint32_t *p;
+	__be32 *p;

 	*mode = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
@@ -2622,7 +2623,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *

 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
-	uint32_t *p;
+	__be32 *p;

 	*nlink = 1;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2638,7 +2639,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t

 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;

 	*uid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2662,7 +2664,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf

 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;

 	*gid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2686,7 +2689,8 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf

 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
-	uint32_t major = 0, minor = 0, *p;
+	uint32_t major = 0, minor = 0;
+	__be32 *p;

 	*rdev = MKDEV(0,0);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2708,7 +2712,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde

 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2725,7 +2729,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin

 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2742,7 +2746,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint

 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;

 	*res = 0;
@@ -2759,7 +2763,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin

 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
-	uint32_t *p;
+	__be32 *p;

 	*used = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2776,7 +2780,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint

 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint64_t sec;
 	uint32_t nsec;

@@ -2836,7 +2840,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
 	return status;
 }

-static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t attrlen)
+static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen)
 {
 	unsigned int attrwords = XDR_QUADLEN(attrlen);
 	unsigned int nwords = xdr->p - savep;
@@ -2854,7 +2858,7 @@ static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t att

 static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;

 	READ_BUF(20);
 	READ32(cinfo->atomic);
@@ -2865,7 +2869,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c

 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t supp, acc;
 	int status;

@@ -2882,7 +2886,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)

 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_CLOSE);
@@ -2895,7 +2899,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)

 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_COMMIT);
@@ -2908,7 +2912,7 @@ static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)

 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -2925,7 +2929,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)

 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -2952,7 +2956,7 @@ xdr_error:

 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -2985,7 +2989,7 @@ xdr_error:

 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	int status;
@@ -3010,7 +3014,7 @@ xdr_error:

 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0},
 		 type;
@@ -3079,7 +3083,7 @@ xdr_error:

 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, bitmap[2];
 	int status;

@@ -3111,7 +3115,7 @@ xdr_error:

 static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t len;
 	int status;

@@ -3147,7 +3151,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 {
 	uint64_t offset, length, clientid;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t namelen, type;

 	READ_BUF(32);
@@ -3172,7 +3176,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)

 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_LOCK);
@@ -3195,7 +3199,7 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)

 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_LOCKU);
@@ -3214,7 +3218,7 @@ static int decode_lookup(struct xdr_stream *xdr)
 /* This is too sick! */
 static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t limit_type, nblocks, blocksize;

 	READ_BUF(12);
@@ -3233,7 +3237,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)

 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t delegation_type;

 	READ_BUF(4);
@@ -3259,7 +3263,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)

 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -3287,7 +3291,7 @@ xdr_error:

 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
@@ -3300,7 +3304,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre

 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
@@ -3324,7 +3328,7 @@ static int decode_putrootfh(struct xdr_stream *xdr)
 static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t count, eof, recvd, hdrlen;
 	int status;

@@ -3354,7 +3358,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	struct page *page = *rcvbuf->pages;
 	struct kvec *iov = rcvbuf->head;
 	unsigned int nr, pglen = rcvbuf->page_len;
-	uint32_t *end, *entry, *p, *kaddr;
+	__be32 *end, *entry, *p, *kaddr;
 	uint32_t len, attrlen, xlen;
 	int hdrlen, recvd, status;

@@ -3376,7 +3380,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	xdr_read_pages(xdr, pglen);

 	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0);
+	kaddr = p = kmap_atomic(page, KM_USER0);
 	end = p + ((pglen + readdir->pgbase) >> 2);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
@@ -3428,7 +3432,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
 	int hdrlen, len, recvd;
-	uint32_t *p;
+	__be32 *p;
 	char *kaddr;
 	int status;

@@ -3505,7 +3509,7 @@ decode_restorefh(struct xdr_stream *xdr)
 static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		size_t *acl_len)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	struct kvec *iov = req->rq_rcv_buf.head;
@@ -3551,7 +3555,7 @@ decode_savefh(struct xdr_stream *xdr)

 static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;

@@ -3567,7 +3571,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)

 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t opnum;
 	int32_t nfserr;

@@ -3610,7 +3614,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)

 static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;

 	status = decode_op_hdr(xdr, OP_WRITE);
@@ -3632,7 +3636,7 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3660,7 +3664,7 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3678,7 +3682,7 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3701,7 +3705,7 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3721,7 +3725,7 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_remove_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3742,7 +3746,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3772,7 +3776,7 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3805,7 +3809,7 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3834,7 +3838,7 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	return nfs4_xdr_dec_create(rqstp, p, res);
 }
@@ -3842,7 +3846,7 @@ static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3865,7 +3869,7 @@ out:
  * Encode an SETACL request
  */
 static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
+nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -3886,7 +3890,7 @@ out:
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3908,7 +3912,7 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3930,7 +3934,7 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3960,7 +3964,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3994,7 +3998,7 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4015,7 +4019,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4039,7 +4043,7 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4065,7 +4069,7 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4086,7 +4090,7 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4107,7 +4111,7 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4128,7 +4132,7 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4149,7 +4153,7 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4170,7 +4174,7 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4193,7 +4197,7 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4219,7 +4223,7 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4243,7 +4247,7 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4263,7 +4267,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi
 /*
  * PATHCONF request
 */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4281,7 +4285,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_p
 /*
  * STATFS request
 */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4299,7 +4303,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fss
 /*
  * GETATTR_BITMAP request
 */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, uint32_t *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4318,7 +4322,7 @@ out:
 /*
  * Decode RENEW response
 */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4334,7 +4338,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
 /*
  * a SETCLIENTID request
 */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 		struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
@@ -4353,7 +4357,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
 /*
  * a SETCLIENTID_CONFIRM request
 */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4375,7 +4379,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
 /*
  * DELEGRETURN request
 */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4397,7 +4401,7 @@ out:
 /*
  * FS_LOCATIONS request
 */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4417,7 +4421,7 @@ out:
 	return status;
 }

-uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
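The nfs4xdr.c hunks above are one mechanical change repeated: every pointer that walks the raw XDR reply buffer goes from uint32_t * to __be32 *. XDR words are big-endian on the wire, and __be32 is a sparse "bitwise" type, so once the pointers are annotated, a forgotten be32_to_cpu()/cpu_to_be32() conversion shows up as a warning under `make C=2` instead of as a silent bug on little-endian hosts. A stand-alone sketch of the decode pattern (user space, with ntohl standing in for be32_to_cpu; the buffer contents are invented, not from any real reply):

	#include <arpa/inet.h>	/* ntohl/htonl stand in for be32_to_cpu/cpu_to_be32 */
	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t be32;	/* in the kernel this is __be32, a sparse bitwise type */

	/* Decode one big-endian XDR word and advance the cursor -- the
	 * same shape as the READ_BUF()/READ32() macros in nfs4xdr.c. */
	static uint32_t read32(const be32 **pp)
	{
		uint32_t v = ntohl(**pp);	/* wire (big-endian) -> host order */
		(*pp)++;
		return v;
	}

	int main(void)
	{
		/* Two XDR words as they would appear on the wire. */
		be32 buf[2] = { htonl(0755), htonl(3) };
		const be32 *p = buf;

		uint32_t mode  = read32(&p);
		uint32_t nlink = read32(&p);
		printf("mode=%o nlink=%u\n", mode, nlink);	/* mode=755 nlink=3 */
		return 0;
	}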
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 1d656a645199..8dfefe41a8da 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -69,7 +69,6 @@
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
  */

-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
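Dropping #include <linux/config.h> here (and in super.c below) is part of a tree-wide cleanup: kbuild feeds the CONFIG_* macros to every compilation unit itself, roughly via `gcc -include include/linux/autoconf.h ...`, so the explicit include had become redundant. A minimal user-space illustration of the same command-line mechanism (the file name and flag are an assumption for the demo, not the kernel build line):

	/* config_demo.c -- build with:  gcc -DCONFIG_ROOT_NFS config_demo.c
	 * The macro arrives on the command line, much as kbuild supplies
	 * CONFIG_* symbols, so no header include is needed to see it. */
	#include <stdio.h>

	int main(void)
	{
	#ifdef CONFIG_ROOT_NFS
		puts("CONFIG_ROOT_NFS is set");
	#else
		puts("CONFIG_ROOT_NFS is not set");
	#endif
		return 0;
	}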
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e8d40030cab4..28108c82b887 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -20,7 +20,6 @@
  * of another (see nfs_lookup())
  */

-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>

@@ -835,7 +834,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	}
 	/* RFC3530: The default port for NFS is 2049 */
 	if (addr.sin_port == 0)
-		addr.sin_port = NFS_PORT;
+		addr.sin_port = htons(NFS_PORT);

 	/* Grab the authentication type */
 	authflavour = RPC_AUTH_UNIX;
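The one-liner above fixes a genuine byte-order bug: sin_port in struct sockaddr_in holds the port in network byte order, so assigning the host-order constant NFS_PORT (2049, 0x0801) directly would, on a little-endian machine, aim the mount at port 264 (0x0108) instead. A small demonstration:

	#include <arpa/inet.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct sockaddr_in addr;
		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;

		/* Correct: sin_port is defined to hold network byte order. */
		addr.sin_port = htons(2049);

		/* On a little-endian host, assigning 2049 directly would put
		 * 0x0801 in memory as bytes 01 08, which the network stack
		 * reads back big-endian as port 0x0108 == 264. */
		printf("wire port bytes: %02x %02x\n",
		       ((unsigned char *)&addr.sin_port)[0],
		       ((unsigned char *)&addr.sin_port)[1]);	/* 08 01 */
		return 0;
	}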
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f6675d2c386c..883dd4a1c157 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -57,6 +57,8 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+
 #include <asm/uaccess.h>
 #include <linux/smp_lock.h>

@@ -395,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 out:
 	clear_bit(BDI_write_congested, &bdi->state);
 	wake_up_all(&nfs_write_congestion);
-	writeback_congestion_end();
+	congestion_end(WRITE);
 	return err;
 }

@@ -588,10 +590,10 @@ static void nfs_cancel_commit_list(struct list_head *head)

 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		nfs_list_remove_request(req);
 		nfs_inode_remove_request(req);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		nfs_clear_page_writeback(req);
+		nfs_unlock_request(req);
 	}
 }

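Two things change in nfs_cancel_commit_list(): the superseded writeback_congestion_end() call becomes congestion_end(WRITE) (hence the new backing-dev.h include), and the NR_UNSTABLE_NFS decrement moves ahead of nfs_inode_remove_request(). The likely point of the reorder is that tearing the request down may drop the last reference to req->wb_page, so the page statistic has to be adjusted while the pointer is still known-good. A user-space sketch of that ordering rule (all types are simplified stand-ins, not the kernel API):

	#include <stdio.h>
	#include <stdlib.h>

	struct page { int stat_unstable; };
	struct nfs_req { struct page *wb_page; };

	static void dec_unstable(struct page *pg) { pg->stat_unstable--; }

	static void remove_request(struct nfs_req *req)
	{
		/* Tearing the request down may drop the last reference to
		 * req->wb_page; afterwards the pointer must not be used. */
		req->wb_page = NULL;
		free(req);
	}

	int main(void)
	{
		struct page pg = { .stat_unstable = 1 };
		struct nfs_req *req = malloc(sizeof(*req));
		req->wb_page = &pg;

		/* Correct order, as in the patched nfs_cancel_commit_list():
		 * touch wb_page first, then tear down the request. */
		dec_unstable(req->wb_page);
		remove_request(req);

		printf("unstable pages: %d\n", pg.stat_unstable);	/* 0 */
		return 0;
	}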
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 0c2be8c0307d..c11f5375d7c1 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -46,7 +46,7 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_encode_desc *nfsacl_desc =
 		(struct nfsacl_encode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;

 	struct posix_acl_entry *entry =
 		&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
@@ -127,7 +127,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_decode_desc *nfsacl_desc =
 		(struct nfsacl_decode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;
 	struct posix_acl_entry *entry;

 	if (!nfsacl_desc->acl) {
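A side detail of the nfsacl.c hunks: the cast disappears along with the type change because elem is a void *, and in C a void pointer converts implicitly to any object-pointer type, so `__be32 *p = elem;` is both shorter and lets the compiler complain if elem ever stops being a pointer. A tiny illustration (be32 here is a plain typedef standing in for the kernel's __be32):

	#include <stdint.h>

	typedef uint32_t be32;	/* stand-in for the kernel's __be32 */

	/* In C (unlike C++), a void pointer converts implicitly to any
	 * object pointer type, so a cast on elem would be pure noise. */
	static be32 first_word(void *elem)
	{
		be32 *p = elem;		/* no cast needed */
		return p[0];
	}

	int main(void)
	{
		be32 buf[1] = { 42 };
		return first_word(buf) == 42 ? 0 : 1;
	}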
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cfe141e5d759..f37df46d2eaa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -319,12 +319,25 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)

 static struct cache_head *export_table[EXPORT_HASHMAX];

+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+	int i;
+
+	for (i = 0; i < fsloc->locations_count; i++) {
+		kfree(fsloc->locations[i].path);
+		kfree(fsloc->locations[i].hosts);
+	}
+	kfree(fsloc->locations);
+}
+
 static void svc_export_put(struct kref *ref)
 {
 	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
 	dput(exp->ex_dentry);
 	mntput(exp->ex_mnt);
 	auth_domain_put(exp->ex_client);
+	kfree(exp->ex_path);
+	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp);
 }

@@ -386,6 +399,69 @@ static int check_export(struct inode *inode, int flags)

 }

+#ifdef CONFIG_NFSD_V4
+
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+	int len;
+	int migrated, i, err;
+
+	len = qword_get(mesg, buf, PAGE_SIZE);
+	if (len != 5 || memcmp(buf, "fsloc", 5))
+		return 0;
+
+	/* listsize */
+	err = get_int(mesg, &fsloc->locations_count);
+	if (err)
+		return err;
+	if (fsloc->locations_count > MAX_FS_LOCATIONS)
+		return -EINVAL;
+	if (fsloc->locations_count == 0)
+		return 0;
+
+	fsloc->locations = kzalloc(fsloc->locations_count
+			* sizeof(struct nfsd4_fs_location), GFP_KERNEL);
+	if (!fsloc->locations)
+		return -ENOMEM;
+	for (i=0; i < fsloc->locations_count; i++) {
+		/* colon separated host list */
+		err = -EINVAL;
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].hosts)
+			goto out_free_all;
+		err = -EINVAL;
+		/* slash separated path component list */
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].path)
+			goto out_free_all;
+	}
+	/* migrated */
+	err = get_int(mesg, &migrated);
+	if (err)
+		goto out_free_all;
+	err = -EINVAL;
+	if (migrated < 0 || migrated > 1)
+		goto out_free_all;
+	fsloc->migrated = migrated;
+	return 0;
+out_free_all:
+	nfsd4_fslocs_free(fsloc);
+	return err;
+}
+
+#else /* CONFIG_NFSD_V4 */
+static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+#endif
+
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
 	/* client path expiry [flags anonuid anongid fsid] */
@@ -398,6 +474,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int an_int;

 	nd.dentry = NULL;
+	exp.ex_path = NULL;

 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
@@ -428,6 +505,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_client = dom;
 	exp.ex_mnt = nd.mnt;
 	exp.ex_dentry = nd.dentry;
+	exp.ex_path = kstrdup(buf, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!exp.ex_path)
+		goto out;

 	/* expiry */
 	err = -EINVAL;
@@ -435,6 +516,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if (exp.h.expiry_time == 0)
 		goto out;

+	/* fs locations */
+	exp.ex_fslocs.locations = NULL;
+	exp.ex_fslocs.locations_count = 0;
+	exp.ex_fslocs.migrated = 0;
+
 	/* flags */
 	err = get_int(&mesg, &an_int);
 	if (err == -ENOENT)
@@ -460,6 +546,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)

 		err = check_export(nd.dentry->d_inode, exp.ex_flags);
 		if (err) goto out;
+
+		err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+		if (err)
+			goto out;
 	}

 	expp = svc_export_lookup(&exp);
@@ -473,6 +563,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	else
 		exp_put(expp);
 out:
+	kfree(exp.ex_path);
 	if (nd.dentry)
 		path_release(&nd);
 out_no_path:
@@ -482,7 +573,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	return err;
 }

-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong);
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);

 static int svc_export_show(struct seq_file *m,
 			   struct cache_detail *cd,
@@ -501,8 +593,8 @@ static int svc_export_show(struct seq_file *m,
 	seq_putc(m, '(');
 	if (test_bit(CACHE_VALID, &h->flags) &&
 	    !test_bit(CACHE_NEGATIVE, &h->flags))
 		exp_flags(m, exp->ex_flags, exp->ex_fsid,
-			  exp->ex_anon_uid, exp->ex_anon_gid);
+			  exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
 	seq_puts(m, ")\n");
 	return 0;
 }
@@ -524,6 +616,10 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_client = item->ex_client;
 	new->ex_dentry = dget(item->ex_dentry);
 	new->ex_mnt = mntget(item->ex_mnt);
+	new->ex_path = NULL;
+	new->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = 0;
 }

 static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -535,6 +631,14 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_path = item->ex_path;
+	item->ex_path = NULL;
+	new->ex_fslocs.locations = item->ex_fslocs.locations;
+	item->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+	item->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+	item->ex_fslocs.migrated = 0;
 }

 static struct cache_head *svc_export_alloc(void)
@@ -1044,34 +1148,25 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
  * for a given NFSv4 client. The root is defined to be the
  * export point with fsid==0
  */
-int
+__be32
 exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	       struct cache_req *creq)
 {
-	struct svc_expkey *fsid_key;
 	struct svc_export *exp;
-	int rv;
+	__be32 rv;
 	u32 fsidv[2];

 	mk_fsid_v1(fsidv, 0);

-	fsid_key = exp_find_key(clp, 1, fsidv, creq);
-	if (IS_ERR(fsid_key) && PTR_ERR(fsid_key) == -EAGAIN)
+	exp = exp_find(clp, 1, fsidv, creq);
+	if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN)
 		return nfserr_dropit;
-	if (!fsid_key || IS_ERR(fsid_key))
-		return nfserr_perm;
-
-	exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq);
 	if (exp == NULL)
-		rv = nfserr_perm;
+		return nfserr_perm;
 	else if (IS_ERR(exp))
-		rv = nfserrno(PTR_ERR(exp));
-	else {
-		rv = fh_compose(fhp, exp,
-				fsid_key->ek_dentry, NULL);
-		exp_put(exp);
-	}
-	cache_put(&fsid_key->h, &svc_expkey_cache);
+		return nfserrno(PTR_ERR(exp));
+	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	exp_put(exp);
 	return rv;
 }

@@ -1158,7 +1253,8 @@ static struct flags {
 	{ 0, {"", ""}}
 };

-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong)
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
 {
 	int first = 0;
 	struct flags *flg;
@@ -1174,6 +1270,21 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t
 		seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
 	if (anong != (gid_t)-2 && anong != (0x10000-2))
 		seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+	if (fsloc && fsloc->locations_count > 0) {
+		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+		int i;
+
+		seq_printf(m, "%s%s=", first++?",":"", loctype);
+		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+		seq_putc(m, '@');
+		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+		for (i = 1; i < fsloc->locations_count; i++) {
+			seq_putc(m, ';');
+			seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+			seq_putc(m, '@');
+			seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+		}
+	}
 }

 static int e_show(struct seq_file *m, void *p)
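fsloc_parse() gives the export-cache upcall an optional trailing clause of the form `fsloc <count> (<hosts> <path>)×count <migrated>`, which exp_flags() then prints back as a `refer=` (migrated) or `replicas=` list of path@host entries joined by semicolons. The sketch below re-implements the same grammar in user space to show the token order; strtok_r stands in for the kernel's qword_get (which additionally handles quoting and %-escapes), and error handling is elided:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct fs_location { char *hosts, *path; };

	int main(void)
	{
		char line[] = "fsloc 2 server1:server2 /export/a server3 /export/b 0";
		char *save = NULL, *tok = strtok_r(line, " ", &save);

		if (!tok || strcmp(tok, "fsloc"))
			return 0;			/* no locations given */

		int count = atoi(strtok_r(NULL, " ", &save));
		struct fs_location *locs = calloc(count, sizeof(*locs));

		for (int i = 0; i < count; i++) {
			/* colon separated host list, then the path */
			locs[i].hosts = strdup(strtok_r(NULL, " ", &save));
			locs[i].path  = strdup(strtok_r(NULL, " ", &save));
		}
		int migrated = atoi(strtok_r(NULL, " ", &save));

		printf("%s list, %d location(s):\n",
		       migrated ? "referral" : "replica", count);
		for (int i = 0; i < count; i++)
			printf("  %s@%s\n", locs[i].path, locs[i].hosts);
		return 0;
	}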
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 7b889ff15ae6..11fdaf7721b4 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -25,7 +25,7 @@
 static u32
 nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 {
-	u32 nfserr;
+	__be32 nfserr;
 	struct svc_fh fh;

 	/* must initialize before using! but maxsize doesn't matter */
@@ -39,18 +39,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 	fh_put(&fh);
 	rqstp->rq_client = NULL;
 	exp_readunlock();
-	/* nlm and nfsd don't share error codes.
-	 * we invent: 0 = no error
-	 *            1 = stale file handle
-	 *            2 = other error
+	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
 	 */
 	switch (nfserr) {
 	case nfs_ok:
 		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+#ifdef CONFIG_LOCKD_V4
 	case nfserr_stale:
-		return 1;
+		return nlm4_stale_fh;
+#endif
 	default:
-		return 2;
+		return nlm_lck_denied;
 	}
 }

diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index fe56b38364cc..e3eca0816986 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -21,7 +21,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -30,12 +30,12 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));

@@ -97,12 +97,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));

@@ -128,7 +128,7 @@ static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
 /*
  * Check file attributes
  */
-static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
 		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
 {
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
@@ -140,10 +140,10 @@ static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
 /*
  * Check file access
  */
-static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 		struct nfsd3_accessres *resp)
 {
-	int nfserr;
+	__be32 nfserr;

 	dprintk("nfsd: ACCESS(2acl)   %s 0x%x\n",
 			SVCFH_fmt(&argp->fh),
@@ -158,7 +158,7 @@ static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *
 /*
  * XDR decode functions
  */
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -169,7 +169,7 @@ static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }


-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *argp)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -194,7 +194,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
 	return (n > 0);
 }

-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }

-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -217,7 +217,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
 */

 /* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -241,7 +241,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,

 	rqstp->rq_res.page_len = w;
 	while (w > 0) {
-		if (!svc_take_res_page(rqstp))
+		if (!rqstp->rq_respages[rqstp->rq_resused++])
 			return 0;
 		w -= PAGE_SIZE;
 	}
@@ -259,7 +259,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }

-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_attrstat *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -267,7 +267,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
 }

 /* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessres *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -278,7 +278,7 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -287,7 +287,7 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }

-static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
@@ -333,4 +333,5 @@ struct svc_version nfsd_acl_version2 = {
 		.vs_proc	= nfsd_acl_procedures2,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };
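The new .vs_hidden flag on nfsd_acl_version2 (and on nfsd_acl_version3 below) marks the ACL sideband program as one that is still dispatched when a request arrives but is not announced via the portmapper. A simplified sketch of what such a flag buys, with invented stand-in types rather than the real svc_version machinery:

	#include <stdbool.h>
	#include <stdio.h>

	struct version { const char *name; bool hidden; };

	/* The RPC service loop would still dispatch hidden programs;
	 * only the external registration step is skipped. */
	static void register_with_portmap(const struct version *v)
	{
		if (v->hidden) {
			printf("%s: not registered (hidden)\n", v->name);
			return;
		}
		printf("%s: registered with portmapper\n", v->name);
	}

	int main(void)
	{
		struct version acl2 = { "nfsacl v2", true };
		struct version nfs3 = { "nfs v3", false };
		register_with_portmap(&acl2);
		register_with_portmap(&nfs3);
		return 0;
	}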
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 16e10c170aed..fcad2895ddb0 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -19,7 +19,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -28,12 +28,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	fh = fh_copy(&resp->fh, &argp->fh);
 	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
@@ -93,12 +93,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd3_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;

 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
@@ -122,7 +122,7 @@ static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
 /*
  * XDR decode functions
  */
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *args)
 {
 	if (!(p = nfs3svc_decode_fh(p, &args->fh)))
@@ -133,7 +133,7 @@ static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }


-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *args)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -163,7 +163,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
 */

 /* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -185,7 +185,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,

 	rqstp->rq_res.page_len = w;
 	while (w > 0) {
-		if (!svc_take_res_page(rqstp))
+		if (!rqstp->rq_respages[rqstp->rq_resused++])
 			return 0;
 		w -= PAGE_SIZE;
 	}
@@ -208,7 +208,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 }

 /* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_attrstat *resp)
 {
 	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
@@ -219,7 +219,7 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -263,5 +263,6 @@ struct svc_version nfsd_acl_version3 = {
 		.vs_proc	= nfsd_acl_procedures3,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index f61142afea44..64db601c2bd2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,7 @@ static int nfs3_ftypes[] = {
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -52,11 +52,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int err, nfserr;
+	int err;
+	__be32 nfserr;
 
 	dprintk("nfsd: GETATTR(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -76,11 +77,11 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 /*
  * Set a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: SETATTR(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -94,11 +95,11 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 /*
  * Look up a path name component
  */
-static int
+static __be32
 nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 					  struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LOOKUP(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -118,11 +119,11 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Check file access
  */
-static int
+static __be32
 nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 					  struct nfsd3_accessres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: ACCESS(3) %s 0x%x\n",
 				SVCFH_fmt(&argp->fh),
@@ -137,11 +138,11 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 /*
  * Read a symlink.
  */
-static int
+static __be32
 nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 					   struct nfsd3_readlinkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
 
@@ -155,11 +156,12 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 /*
  * Read a portion of a file.
  */
-static int
+static __be32
 nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 				struct nfsd3_readres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
 				SVCFH_fmt(&argp->fh),
@@ -172,15 +174,15 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	 */
 
 	resp->count = argp->count;
-	if (NFSSVC_MAXBLKSIZE < resp->count)
-		resp->count = NFSSVC_MAXBLKSIZE;
+	if (max_blocksize < resp->count)
+		resp->count = max_blocksize;
 
 	svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
 				  argp->offset,
-				  argp->vec, argp->vlen,
+				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 	if (nfserr == 0) {
 		struct inode *inode = resp->fh.fh_dentry->d_inode;
@@ -194,11 +196,11 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 /*
  * Write data to a file
 */
-static int
+static __be32
 nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 				struct nfsd3_writeres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -210,7 +212,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	resp->committed = argp->stable;
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &resp->committed);
 	resp->count = argp->count;
@@ -222,13 +224,13 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
  * At least in theory; we'll see how it fares in practice when the
  * first reports about SunOS compatibility problems start to pour in...
  */
-static int
+static __be32
 nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 				struct nfsd3_diropres *resp)
 {
 	svc_fh *dirfhp, *newfhp = NULL;
 	struct iattr *attr;
-	u32 nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: CREATE(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -264,11 +266,11 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 /*
  * Make directory. This operation is not idempotent.
  */
-static int
+static __be32
 nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: MKDIR(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -284,11 +286,11 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
 				SVCFH_fmt(&argp->ffh),
@@ -306,11 +308,12 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 /*
  * Make socket/fifo/device.
  */
-static int
+static __be32
 nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 				struct nfsd3_diropres *resp)
 {
-	int nfserr, type;
+	__be32 nfserr;
+	int type;
 	dev_t rdev = 0;
 
 	dprintk("nfsd: MKNOD(3) %s %.*s\n",
@@ -342,11 +345,11 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 /*
  * Remove file/fifo/socket etc.
  */
-static int
+static __be32
 nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 				struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: REMOVE(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -362,11 +365,11 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Remove a directory
  */
-static int
+static __be32
 nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 				struct nfsd3_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: RMDIR(3) %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -378,11 +381,11 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 				struct nfsd3_renameres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: RENAME(3) %s %.*s ->\n",
 				SVCFH_fmt(&argp->ffh),
@@ -400,11 +403,11 @@ nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 				struct nfsd3_linkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LINK(3) %s ->\n",
 				SVCFH_fmt(&argp->ffh));
@@ -423,11 +426,12 @@ nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 /*
  * Read a portion of a directory.
  */
-static int
+static __be32
 nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 				struct nfsd3_readdirres *resp)
 {
-	int nfserr, count;
+	__be32 nfserr;
+	int count;
 
 	dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
 				SVCFH_fmt(&argp->fh),
@@ -458,11 +462,12 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
  * Read a portion of a directory, including file handles and attrs.
  * For now, we choose to ignore the dircount parameter.
  */
-static int
+static __be32
 nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 				struct nfsd3_readdirres *resp)
 {
-	int nfserr, count = 0;
+	__be32 nfserr;
+	int count = 0;
 	loff_t offset;
 	int i;
 	caddr_t page_addr = NULL;
@@ -516,11 +521,11 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 /*
  * Get file system stats
 */
-static int
+static __be32
 nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_fsstatres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: FSSTAT(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -533,20 +538,21 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Get file system info
 */
-static int
+static __be32
 nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					   struct nfsd3_fsinfores *resp)
 {
-	int nfserr;
+	__be32 nfserr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: FSINFO(3) %s\n",
 				SVCFH_fmt(&argp->fh));
 
-	resp->f_rtmax = NFSSVC_MAXBLKSIZE;
-	resp->f_rtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_rtmax = max_blocksize;
+	resp->f_rtpref = max_blocksize;
 	resp->f_rtmult = PAGE_SIZE;
-	resp->f_wtmax = NFSSVC_MAXBLKSIZE;
-	resp->f_wtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_wtmax = max_blocksize;
+	resp->f_wtpref = max_blocksize;
 	resp->f_wtmult = PAGE_SIZE;
 	resp->f_dtpref = PAGE_SIZE;
 	resp->f_maxfilesize = ~(u32) 0;
@@ -574,11 +580,11 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Get pathconf info for the specified file
 */
-static int
+static __be32
 nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					    struct nfsd3_pathconfres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: PATHCONF(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -617,11 +623,11 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 /*
  * Commit a file (range) to stable storage.
 */
-static int
+static __be32
 nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
 				struct nfsd3_commitres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
 				SVCFH_fmt(&argp->fh),
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 243d94b9653a..b4baca3053c3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -42,23 +42,23 @@ static u32 nfs3_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static inline u32 *
-encode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+encode_time3(__be32 *p, struct timespec *time)
 {
 	*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
 	return p;
 }
 
-static inline u32 *
-decode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+decode_time3(__be32 *p, struct timespec *time)
 {
 	time->tv_sec = ntohl(*p++);
 	time->tv_nsec = ntohl(*p++);
 	return p;
 }
 
-static inline u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size;
 	fh_init(fhp, NFS3_FHSIZE);
@@ -72,13 +72,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv3 ACL code */
-u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size = fhp->fh_handle.fh_size;
 	*p++ = htonl(size);
@@ -91,8 +91,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char *name;
 	int i;
@@ -107,8 +107,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr3(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap)
 {
 	u32 tmp;
 
@@ -153,8 +153,8 @@ decode_sattr3(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static inline __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	      struct kstat *stat)
 {
 	struct dentry *dentry = fhp->fh_dentry;
@@ -186,8 +186,8 @@ encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 	return p;
 }
 
-static inline u32 *
-encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct inode *inode = fhp->fh_dentry->d_inode;
 
@@ -224,8 +224,8 @@ encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * The inode may be NULL if the call failed because of a stale file
  * handle. In this case, no attributes are returned.
 */
-static u32 *
-encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
@@ -243,8 +243,8 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper for NFSv3 ACLs */
-u32 *
-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	return encode_post_op_attr(rqstp, p, fhp);
 }
@@ -252,8 +252,8 @@ nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /*
  * Enocde weak cache consistency data
 */
-static u32 *
-encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 
@@ -278,7 +278,7 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * XDR decode functions
 */
 int
-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -286,7 +286,7 @@ nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args
 }
 
 int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -303,7 +303,7 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -314,7 +314,7 @@ nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -325,11 +325,12 @@ nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readargs *args)
 {
 	unsigned int len;
 	int v,pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -337,17 +338,16 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 
 	len = args->count = ntohl(*p++);
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 
 	/* set up the kvec */
 	v=0;
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -355,10 +355,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeargs *args)
 {
 	unsigned int len, v, hdr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -373,26 +374,26 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	    rqstp->rq_arg.len - hdr < len)
 		return 0;
 
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 	v= 0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
 
-	return args->count == args->len && args->vec[0].iov_len > 0;
+	return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -416,7 +417,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }
 int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -428,7 +429,7 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_symlinkargs *args)
 {
 	unsigned int len;
@@ -446,11 +447,11 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 * This page appears in the rq_res.pages list, but as pages_len is always
 * 0, it won't get in the way
 */
-	svc_take_page(rqstp);
 	len = ntohl(*p++);
 	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
 		return 0;
-	args->tname = new = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->tname = new =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 	args->tlen = len;
 	/* first copy and check from the first page */
 	old = (char*)p;
@@ -480,7 +481,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_mknodargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -504,7 +505,7 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -517,19 +518,19 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -541,7 +542,7 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -554,17 +555,18 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	int len, pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -573,13 +575,12 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 	args->dircount = ntohl(*p++);
 	args->count = ntohl(*p++);
 
-	len = (args->count > NFSSVC_MAXBLKSIZE) ? NFSSVC_MAXBLKSIZE :
+	len = (args->count > max_blocksize) ? max_blocksize :
 				  args->count;
 	args->count = len;
 
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
+		pn = rqstp->rq_resused++;
 		if (!args->buffer)
 			args->buffer = page_address(rqstp->rq_respages[pn]);
 		len -= PAGE_SIZE;
@@ -589,7 +590,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -608,14 +609,14 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
 * will work properly.
 */
 int
-nfs3svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 /* GETATTR */
 int
-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
@@ -625,7 +626,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 
 /* SETATTR, REMOVE, RMDIR */
 int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -634,7 +635,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
 
 /* LOOKUP */
 int
-nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -647,7 +648,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 
 /* ACCESS */
 int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -658,7 +659,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 
 /* READLINK */
 int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -668,7 +669,6 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->len;
 	if (resp->len & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -680,7 +680,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READ */
 int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -693,7 +693,6 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->count;
 	if (resp->count & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
@@ -705,7 +704,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 
 /* WRITE */
 int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -720,7 +719,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
 
 /* CREATE, MKDIR, SYMLINK, MKNOD */
 int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -734,7 +733,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
 
 /* RENAME */
 int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->ffh);
@@ -744,7 +743,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
 
 /* LINK */
 int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -754,7 +753,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READDIR */
 int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -768,7 +767,6 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = (resp->count) << 2;
 
 	/* add the 'tail' to the end of the 'head' page - page 0. */
-	rqstp->rq_restailpage = 0;
 	rqstp->rq_res.tail[0].iov_base = p;
 	*p++ = 0;		/* no more entries */
 	*p++ = htonl(resp->common.err == nfserr_eof);
@@ -778,8 +776,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 	return xdr_ressize_check(rqstp, p);
 }
 
-static inline u32 *
-encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
+static inline __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	     int namlen, ino_t ino)
 {
 	*p++ = xdr_one;				 /* mark entry present */
@@ -792,8 +790,8 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
 	return p;
 }
 
-static inline u32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, u32 *p,
+static inline __be32 *
+encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
 	p = encode_post_op_attr(cd->rqstp, p, fhp);
@@ -855,7 +853,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 {
 	struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
 							common);
-	u32 *p = cd->buffer;
+	__be32 *p = cd->buffer;
 	caddr_t curr_page_addr = NULL;
 	int pn;		/* current page number */
 	int slen;	/* string (name) length */
@@ -921,7 +919,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 	} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
 		/* temporarily encode entry into next page, then move back to
 		 * current and next page in rq_respages[] */
-		u32 *p1, *tmp;
+		__be32 *p1, *tmp;
 		int len1, len2;
 
 		/* grab next page for temporary storage of entry */
@@ -1011,7 +1009,7 @@ nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name,
 
 /* FSSTAT */
 int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsstatres *resp)
 {
 	struct kstatfs *s = &resp->stats;
@@ -1033,7 +1031,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
 
 /* FSINFO */
 int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsinfores *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1057,7 +1055,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
 
 /* PATHCONF */
 int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_pathconfres *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1076,7 +1074,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
 
 /* COMMIT */
 int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -1092,7 +1090,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
  * XDR release functions
 */
 int
-nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	fh_put(&resp->fh);
@@ -1100,7 +1098,7 @@ nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_release_fhandle2(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fhandle_pair *resp)
 {
 	fh_put(&resp->fh1);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index edb107e61b91..5d94555cdc83 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -63,6 +63,8 @@
 #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
 		| NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
 
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP)
+
 #define MASK_EQUAL(mask1, mask2) \
 	( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
 
@@ -96,24 +98,26 @@ deny_mask(u32 allow_mask, unsigned int flags)
 /* XXX: modify functions to return NFS errors; they're only ever
  * used by nfs code, after all.... */
 
-static int
-mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic. An optimistic version would be needed to handle DENY's,
+ * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
 {
-	u32 ignore = 0;
+	u32 write_mode = NFS4_WRITE_MODE;
 
-	if (!(flags & NFS4_ACL_DIR))
-		ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
-	perm |= ignore;
+	if (flags & NFS4_ACL_DIR)
+		write_mode |= NFS4_ACE_DELETE_CHILD;
 	*mode = 0;
 	if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
 		*mode |= ACL_READ;
-	if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
+	if ((perm & write_mode) == write_mode)
 		*mode |= ACL_WRITE;
 	if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
 		*mode |= ACL_EXECUTE;
-	if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
-		return -EINVAL;
-	return 0;
 }
 
 struct ace_container {
@@ -338,38 +342,6 @@ sort_pacl(struct posix_acl *pacl)
 	return;
 }
 
-static int
-write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
-		struct posix_acl_entry **pace, short tag, unsigned int flags)
-{
-	struct posix_acl_entry *this = *pace;
-
-	if (*pace == pacl->a_entries + pacl->a_count)
-		return -EINVAL; /* fell off the end */
-	(*pace)++;
-	this->e_tag = tag;
-	if (tag == ACL_USER_OBJ)
-		flags |= NFS4_ACL_OWNER;
-	if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
-		return -EINVAL;
-	this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
-			ace->who : ACL_UNDEFINED_ID);
-	return 0;
-}
-
-static struct nfs4_ace *
-get_next_v4_ace(struct list_head **p, struct list_head *head)
-{
-	struct nfs4_ace *ace;
-
-	*p = (*p)->next;
-	if (*p == head)
-		return NULL;
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-
-	return ace;
-}
-
 int
 nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		struct posix_acl **dpacl, unsigned int flags)
@@ -385,42 +357,23 @@ nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		goto out;
 
 	error = nfs4_acl_split(acl, dacl);
-	if (error < 0)
+	if (error)
 		goto out_acl;
 
-	if (pacl != NULL) {
-		if (acl->naces == 0) {
-			error = -ENODATA;
-			goto try_dpacl;
-		}
-
-		*pacl = _nfsv4_to_posix_one(acl, flags);
-		if (IS_ERR(*pacl)) {
-			error = PTR_ERR(*pacl);
-			*pacl = NULL;
-			goto out_acl;
-		}
+	*pacl = _nfsv4_to_posix_one(acl, flags);
+	if (IS_ERR(*pacl)) {
+		error = PTR_ERR(*pacl);
+		*pacl = NULL;
+		goto out_acl;
 	}
 
-try_dpacl:
-	if (dpacl != NULL) {
-		if (dacl->naces == 0) {
-			if (pacl == NULL || *pacl == NULL)
-				error = -ENODATA;
-			goto out_acl;
-		}
-
-		error = 0;
-		*dpacl = _nfsv4_to_posix_one(dacl, flags);
-		if (IS_ERR(*dpacl)) {
-			error = PTR_ERR(*dpacl);
-			*dpacl = NULL;
-			goto out_acl;
-		}
+	*dpacl = _nfsv4_to_posix_one(dacl, flags);
+	if (IS_ERR(*dpacl)) {
+		error = PTR_ERR(*dpacl);
+		*dpacl = NULL;
 	}
-
 out_acl:
-	if (error && pacl) {
+	if (error) {
 		posix_acl_release(*pacl);
 		*pacl = NULL;
 	}
@@ -429,349 +382,311 @@ out:
429 return error; 382 return error;
430} 383}
431 384
385/*
386 * While processing the NFSv4 ACE, this maintains bitmasks representing
387 * which permission bits have been allowed and which denied to a given
388 * entity: */
389struct posix_ace_state {
390 u32 allow;
391 u32 deny;
392};
393
394struct posix_user_ace_state {
395 uid_t uid;
396 struct posix_ace_state perms;
397};
398
399struct posix_ace_state_array {
400 int n;
401 struct posix_user_ace_state aces[];
402};
403
404/*
405 * While processing the NFSv4 ACE, this maintains the partial permissions
406 * calculated so far: */
407
408struct posix_acl_state {
409 struct posix_ace_state owner;
410 struct posix_ace_state group;
411 struct posix_ace_state other;
412 struct posix_ace_state everyone;
413 struct posix_ace_state mask; /* Deny unused in this case */
414 struct posix_ace_state_array *users;
415 struct posix_ace_state_array *groups;
416};
417
432static int 418static int
433same_who(struct nfs4_ace *a, struct nfs4_ace *b) 419init_state(struct posix_acl_state *state, int cnt)
434{ 420{
435 return a->whotype == b->whotype && 421 int alloc;
436 (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who); 422
423 memset(state, 0, sizeof(struct posix_acl_state));
424 /*
425 * In the worst case, each individual acl could be for a distinct
426 * named user or group, but we don't no which, so we allocate
427 * enough space for either:
428 */
429 alloc = sizeof(struct posix_ace_state_array)
430 + cnt*sizeof(struct posix_ace_state);
431 state->users = kzalloc(alloc, GFP_KERNEL);
432 if (!state->users)
433 return -ENOMEM;
434 state->groups = kzalloc(alloc, GFP_KERNEL);
435 if (!state->groups) {
436 kfree(state->users);
437 return -ENOMEM;
438 }
439 return 0;
437} 440}
438 441
439static int 442static void
440complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny, 443free_state(struct posix_acl_state *state) {
441 unsigned int flags) 444 kfree(state->users);
442{ 445 kfree(state->groups);
443 int ignore = 0;
444 if (!(flags & NFS4_ACL_DIR))
445 ignore |= NFS4_ACE_DELETE_CHILD;
446 return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
447 ignore|deny->access_mask) &&
448 allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
449 deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
450 allow->flag == deny->flag &&
451 same_who(allow, deny);
452} 446}
453 447
454static inline int 448static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
455user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
456 struct posix_acl *pacl, struct posix_acl_entry **pace,
457 unsigned int flags)
458{ 449{
459 int error = -EINVAL; 450 state->mask.allow |= astate->allow;
460 struct nfs4_ace *ace, *ace2;
461
462 ace = get_next_v4_ace(p, &n4acl->ace_head);
463 if (ace == NULL)
464 goto out;
465 if (ace2type(ace) != ACL_USER_OBJ)
466 goto out;
467 error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
468 if (error < 0)
469 goto out;
470 error = -EINVAL;
471 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
472 if (ace2 == NULL)
473 goto out;
474 if (!complementary_ace_pair(ace, ace2, flags))
475 goto out;
476 error = 0;
477out:
478 return error;
479} 451}
480 452
481static inline int 453/*
482users_from_v4(struct nfs4_acl *n4acl, struct list_head **p, 454 * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
483 struct nfs4_ace **mask_ace, 455 * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
484 struct posix_acl *pacl, struct posix_acl_entry **pace, 456 * to traditional read/write/execute permissions.
485 unsigned int flags) 457 *
486{ 458 * It's problematic to reject acls that use certain mode bits, because it
487 int error = -EINVAL; 459 * places the burden on users to learn the rules about which bits one
488 struct nfs4_ace *ace, *ace2; 460 * particular server sets, without giving the user a lot of help--we return an
461 * error that could mean any number of different things. To make matters
462 * worse, the problematic bits might be introduced by some application that's
463 * automatically mapping from some other acl model.
464 *
465 * So wherever possible we accept anything, possibly erring on the side of
466 * denying more permissions than necessary.
467 *
468 * However we do reject *explicit* DENY's of a few bits representing
469 * permissions we could never deny:
470 */
489 471
490 ace = get_next_v4_ace(p, &n4acl->ace_head); 472static inline int check_deny(u32 mask, int isowner)
491 if (ace == NULL) 473{
492 goto out; 474 if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
493 while (ace2type(ace) == ACL_USER) { 475 return -EINVAL;
494 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) 476 if (!isowner)
495 goto out; 477 return 0;
496 if (*mask_ace && 478 if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
497 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) 479 return -EINVAL;
498 goto out; 480 return 0;
499 *mask_ace = ace;
500 ace = get_next_v4_ace(p, &n4acl->ace_head);
501 if (ace == NULL)
502 goto out;
503 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
504 goto out;
505 error = write_pace(ace, pacl, pace, ACL_USER, flags);
506 if (error < 0)
507 goto out;
508 error = -EINVAL;
509 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
510 if (ace2 == NULL)
511 goto out;
512 if (!complementary_ace_pair(ace, ace2, flags))
513 goto out;
514 if ((*mask_ace)->flag != ace2->flag ||
515 !same_who(*mask_ace, ace2))
516 goto out;
517 ace = get_next_v4_ace(p, &n4acl->ace_head);
518 if (ace == NULL)
519 goto out;
520 }
521 error = 0;
522out:
523 return error;
524} 481}
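
check_deny() is the one place the conversion still refuses an ACL outright: an explicit DENY of bits the server could never enforce (reading attributes or the ACL, or -- for the owner -- writing attributes or the ACL) has to fail rather than be silently dropped. A minimal userspace restatement, simplified to one bit per class; the two flag values mirror the usual NFSv4 definitions but should be treated as assumptions here:

#include <stdio.h>

#define NFS4_ACE_READ_ATTRIBUTES 0x00000080
#define NFS4_ACE_WRITE_ACL       0x00010000

static int check_deny(unsigned int mask, int isowner)
{
        if (mask & NFS4_ACE_READ_ATTRIBUTES)
                return -1;      /* stat() can't be denied to anyone */
        if (isowner && (mask & NFS4_ACE_WRITE_ACL))
                return -1;      /* the owner can always set the ACL */
        return 0;
}

int main(void)
{
        printf("%d\n", check_deny(NFS4_ACE_READ_ATTRIBUTES, 0));  /* -1 */
        printf("%d\n", check_deny(NFS4_ACE_WRITE_ACL, 0));        /*  0 */
        printf("%d\n", check_deny(NFS4_ACE_WRITE_ACL, 1));        /* -1 */
        return 0;
}
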
525 482
526static inline int 483static struct posix_acl *
527group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p, 484posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
528 struct nfs4_ace **mask_ace,
529 struct posix_acl *pacl, struct posix_acl_entry **pace,
530 unsigned int flags)
531{ 485{
532 int error = -EINVAL; 486 struct posix_acl_entry *pace;
533 struct nfs4_ace *ace, *ace2; 487 struct posix_acl *pacl;
534 struct ace_container *ac; 488 int nace;
535 struct list_head group_l; 489 int i, error = 0;
536
537 INIT_LIST_HEAD(&group_l);
538 ace = list_entry(*p, struct nfs4_ace, l_ace);
539
540 /* group owner (mask and allow aces) */
541 490
542 if (pacl->a_count != 3) { 491 nace = 4 + state->users->n + state->groups->n;
543 /* then the group owner should be preceded by mask */ 492 pacl = posix_acl_alloc(nace, GFP_KERNEL);
544 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) 493 if (!pacl)
545 goto out; 494 return ERR_PTR(-ENOMEM);
546 if (*mask_ace &&
547 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
548 goto out;
549 *mask_ace = ace;
550 ace = get_next_v4_ace(p, &n4acl->ace_head);
551 if (ace == NULL)
552 goto out;
553 495
554 if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace)) 496 pace = pacl->a_entries;
555 goto out; 497 pace->e_tag = ACL_USER_OBJ;
498 error = check_deny(state->owner.deny, 1);
499 if (error)
500 goto out_err;
501 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
502 pace->e_id = ACL_UNDEFINED_ID;
503
504 for (i=0; i < state->users->n; i++) {
505 pace++;
506 pace->e_tag = ACL_USER;
507 error = check_deny(state->users->aces[i].perms.deny, 0);
508 if (error)
509 goto out_err;
510 low_mode_from_nfs4(state->users->aces[i].perms.allow,
511 &pace->e_perm, flags);
512 pace->e_id = state->users->aces[i].uid;
513 add_to_mask(state, &state->users->aces[i].perms);
556 } 514 }
557 515
558 if (ace2type(ace) != ACL_GROUP_OBJ) 516 pace++;
559 goto out; 517 pace->e_tag = ACL_GROUP_OBJ;
560 518 error = check_deny(state->group.deny, 0);
561 ac = kmalloc(sizeof(*ac), GFP_KERNEL); 519 if (error)
562 error = -ENOMEM; 520 goto out_err;
563 if (ac == NULL) 521 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
564 goto out; 522 pace->e_id = ACL_UNDEFINED_ID;
565 ac->ace = ace; 523 add_to_mask(state, &state->group);
566 list_add_tail(&ac->ace_l, &group_l); 524
567 525 for (i=0; i < state->groups->n; i++) {
568 error = -EINVAL; 526 pace++;
569 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) 527 pace->e_tag = ACL_GROUP;
570 goto out; 528 error = check_deny(state->groups->aces[i].perms.deny, 0);
571 529 if (error)
572 error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags); 530 goto out_err;
573 if (error < 0) 531 low_mode_from_nfs4(state->groups->aces[i].perms.allow,
574 goto out; 532 &pace->e_perm, flags);
575 533 pace->e_id = state->groups->aces[i].uid;
576 error = -EINVAL; 534 add_to_mask(state, &state->groups->aces[i].perms);
577 ace = get_next_v4_ace(p, &n4acl->ace_head); 535 }
578 if (ace == NULL)
579 goto out;
580
581 /* groups (mask and allow aces) */
582
583 while (ace2type(ace) == ACL_GROUP) {
584 if (*mask_ace == NULL)
585 goto out;
586
587 if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
588 !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
589 goto out;
590 *mask_ace = ace;
591 536
592 ace = get_next_v4_ace(p, &n4acl->ace_head); 537 pace++;
593 if (ace == NULL) 538 pace->e_tag = ACL_MASK;
594 goto out; 539 low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
595 ac = kmalloc(sizeof(*ac), GFP_KERNEL); 540 pace->e_id = ACL_UNDEFINED_ID;
596 error = -ENOMEM;
597 if (ac == NULL)
598 goto out;
599 error = -EINVAL;
600 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
601 !same_who(ace, *mask_ace))
602 goto out;
603 541
604 ac->ace = ace; 542 pace++;
605 list_add_tail(&ac->ace_l, &group_l); 543 pace->e_tag = ACL_OTHER;
544 error = check_deny(state->other.deny, 0);
545 if (error)
546 goto out_err;
547 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
548 pace->e_id = ACL_UNDEFINED_ID;
606 549
607 error = write_pace(ace, pacl, pace, ACL_GROUP, flags); 550 return pacl;
608 if (error < 0) 551out_err:
609 goto out; 552 posix_acl_release(pacl);
610 error = -EINVAL; 553 return ERR_PTR(error);
611 ace = get_next_v4_ace(p, &n4acl->ace_head); 554}
612 if (ace == NULL)
613 goto out;
614 }
615 555
616 /* group owner (deny ace) */ 556static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
557{
558 /* Allow all bits in the mask not already denied: */
559 astate->allow |= mask & ~astate->deny;
560}
617 561
618 if (ace2type(ace) != ACL_GROUP_OBJ) 562static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
619 goto out; 563{
620 ac = list_entry(group_l.next, struct ace_container, ace_l); 564 /* Deny all bits in the mask not already allowed: */
621 ace2 = ac->ace; 565 astate->deny |= mask & ~astate->allow;
622 if (!complementary_ace_pair(ace2, ace, flags)) 566}
623 goto out;
624 list_del(group_l.next);
625 kfree(ac);
626 567
627 /* groups (deny aces) */ 568static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
569{
570 int i;
628 571
629 while (!list_empty(&group_l)) { 572 for (i = 0; i < a->n; i++)
630 ace = get_next_v4_ace(p, &n4acl->ace_head); 573 if (a->aces[i].uid == uid)
631 if (ace == NULL) 574 return i;
632 goto out; 575 /* Not found: */
633 if (ace2type(ace) != ACL_GROUP) 576 a->n++;
634 goto out; 577 a->aces[i].uid = uid;
635 ac = list_entry(group_l.next, struct ace_container, ace_l); 578 a->aces[i].perms.allow = state->everyone.allow;
636 ace2 = ac->ace; 579 a->aces[i].perms.deny = state->everyone.deny;
637 if (!complementary_ace_pair(ace2, ace, flags))
638 goto out;
639 list_del(group_l.next);
640 kfree(ac);
641 }
642 580
643 ace = get_next_v4_ace(p, &n4acl->ace_head); 581 return i;
644 if (ace == NULL)
645 goto out;
646 if (ace2type(ace) != ACL_OTHER)
647 goto out;
648 error = 0;
649out:
650 while (!list_empty(&group_l)) {
651 ac = list_entry(group_l.next, struct ace_container, ace_l);
652 list_del(group_l.next);
653 kfree(ac);
654 }
655 return error;
656} 582}
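
find_uid() is a find-or-insert: the first ACE naming a given principal allocates a slot, and -- crucially -- the fresh slot starts from the EVERYONE@ bits accumulated so far, because any earlier EVERYONE@ ACE applied to that principal too. No bounds check is needed since init_state() sized the array at one slot per ACE. A compressed, runnable illustration:

#include <stdio.h>

struct perms { unsigned allow, deny; };
struct uace  { unsigned uid; struct perms perms; };

static struct uace aces[4];
static int n;
static struct perms everyone = { 04, 0 }; /* EVERYONE@ already ALLOWed READ */

static int find_uid(unsigned uid)
{
        int i;

        for (i = 0; i < n; i++)
                if (aces[i].uid == uid)
                        return i;
        n++;                            /* not found: append */
        aces[i].uid = uid;
        aces[i].perms = everyone;       /* inherit EVERYONE@ bits so far */
        return i;
}

int main(void)
{
        int i = find_uid(1000);

        printf("allow=%o deny=%o\n", aces[i].perms.allow, aces[i].perms.deny);
        return 0;                       /* prints allow=4 deny=0 */
}
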
657 583
658static inline int 584static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
659mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
660 struct nfs4_ace **mask_ace,
661 struct posix_acl *pacl, struct posix_acl_entry **pace,
662 unsigned int flags)
663{ 585{
664 int error = -EINVAL; 586 int i;
665 struct nfs4_ace *ace;
666 587
667 ace = list_entry(*p, struct nfs4_ace, l_ace); 588 for (i=0; i < a->n; i++)
668 if (pacl->a_count != 3) { 589 deny_bits(&a->aces[i].perms, mask);
669 if (*mask_ace == NULL)
670 goto out;
671 (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
672 write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
673 }
674 error = 0;
675out:
676 return error;
677} 590}
678 591
679static inline int 592static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
680other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
681 struct posix_acl *pacl, struct posix_acl_entry **pace,
682 unsigned int flags)
683{ 593{
684 int error = -EINVAL; 594 int i;
685 struct nfs4_ace *ace, *ace2;
686 595
687 ace = list_entry(*p, struct nfs4_ace, l_ace); 596 for (i=0; i < a->n; i++)
688 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) 597 allow_bits(&a->aces[i].perms, mask);
689 goto out;
690 error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
691 if (error < 0)
692 goto out;
693 error = -EINVAL;
694 ace2 = get_next_v4_ace(p, &n4acl->ace_head);
695 if (ace2 == NULL)
696 goto out;
697 if (!complementary_ace_pair(ace, ace2, flags))
698 goto out;
699 error = 0;
700out:
701 return error;
702} 598}
703 599
704static int 600static void process_one_v4_ace(struct posix_acl_state *state,
705calculate_posix_ace_count(struct nfs4_acl *n4acl) 601 struct nfs4_ace *ace)
706{ 602{
707 if (n4acl->naces == 6) /* owner, owner group, and other only */ 603 u32 mask = ace->access_mask;
708 return 3; 604 int i;
709 else { /* Otherwise there must be a mask entry. */ 605
710 /* Also, the remaining entries are for named users and 606 switch (ace2type(ace)) {
711 * groups, and come in threes (mask, allow, deny): */ 607 case ACL_USER_OBJ:
712 if (n4acl->naces < 7) 608 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
713 return -EINVAL; 609 allow_bits(&state->owner, mask);
714 if ((n4acl->naces - 7) % 3) 610 } else {
715 return -EINVAL; 611 deny_bits(&state->owner, mask);
716 return 4 + (n4acl->naces - 7)/3; 612 }
613 break;
614 case ACL_USER:
615 i = find_uid(state, state->users, ace->who);
616 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
617 allow_bits(&state->users->aces[i].perms, mask);
618 } else {
619 deny_bits(&state->users->aces[i].perms, mask);
620 mask = state->users->aces[i].perms.deny;
621 deny_bits(&state->owner, mask);
622 }
623 break;
624 case ACL_GROUP_OBJ:
625 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
626 allow_bits(&state->group, mask);
627 } else {
628 deny_bits(&state->group, mask);
629 mask = state->group.deny;
630 deny_bits(&state->owner, mask);
631 deny_bits(&state->everyone, mask);
632 deny_bits_array(state->users, mask);
633 deny_bits_array(state->groups, mask);
634 }
635 break;
636 case ACL_GROUP:
637 i = find_uid(state, state->groups, ace->who);
638 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
639 allow_bits(&state->groups->aces[i].perms, mask);
640 } else {
641 deny_bits(&state->groups->aces[i].perms, mask);
642 mask = state->groups->aces[i].perms.deny;
643 deny_bits(&state->owner, mask);
644 deny_bits(&state->group, mask);
645 deny_bits(&state->everyone, mask);
646 deny_bits_array(state->users, mask);
647 deny_bits_array(state->groups, mask);
648 }
649 break;
650 case ACL_OTHER:
651 if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
652 allow_bits(&state->owner, mask);
653 allow_bits(&state->group, mask);
654 allow_bits(&state->other, mask);
655 allow_bits(&state->everyone, mask);
656 allow_bits_array(state->users, mask);
657 allow_bits_array(state->groups, mask);
658 } else {
659 deny_bits(&state->owner, mask);
660 deny_bits(&state->group, mask);
661 deny_bits(&state->other, mask);
662 deny_bits(&state->everyone, mask);
663 deny_bits_array(state->users, mask);
664 deny_bits_array(state->groups, mask);
665 }
717 } 666 }
718} 667}
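
allow_bits()/deny_bits() are how the new code emulates NFSv4's first-match-wins evaluation with plain accumulators: a bit only takes effect if no earlier ACE already claimed it, which is why process_one_v4_ace() can walk the list once, in order. A runnable demonstration of the order dependence:

#include <stdio.h>

struct ace_state { unsigned allow, deny; };

static void allow_bits(struct ace_state *s, unsigned mask)
{
        s->allow |= mask & ~s->deny;    /* only bits not already denied  */
}

static void deny_bits(struct ace_state *s, unsigned mask)
{
        s->deny |= mask & ~s->allow;    /* only bits not already allowed */
}

int main(void)
{
        struct ace_state s = { 0, 0 };

        deny_bits(&s, 02);              /* DENY WRITE comes first...      */
        allow_bits(&s, 06);             /* ...so a later ALLOW READ|WRITE */
        printf("allow=%o deny=%o\n", s.allow, s.deny); /* allow=4 deny=2 */
        return 0;
}

The DENY cases above additionally broadcast the newly-denied bits into the owner, EVERYONE@ and every named entry via deny_bits_array(), since a POSIX ACL has no way to express a denial that applies only to the (unknown) members of a group.
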
719 668
720
721static struct posix_acl * 669static struct posix_acl *
722_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) 670_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
723{ 671{
672 struct posix_acl_state state;
724 struct posix_acl *pacl; 673 struct posix_acl *pacl;
725 int error = -EINVAL, nace = 0; 674 struct nfs4_ace *ace;
726 struct list_head *p; 675 int ret;
727 struct nfs4_ace *mask_ace = NULL;
728 struct posix_acl_entry *pace;
729
730 nace = calculate_posix_ace_count(n4acl);
731 if (nace < 0)
732 goto out_err;
733
734 pacl = posix_acl_alloc(nace, GFP_KERNEL);
735 error = -ENOMEM;
736 if (pacl == NULL)
737 goto out_err;
738
739 pace = &pacl->a_entries[0];
740 p = &n4acl->ace_head;
741
742 error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
743 if (error)
744 goto out_acl;
745
746 error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
747 if (error)
748 goto out_acl;
749 676
750 error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace, 677 ret = init_state(&state, n4acl->naces);
751 flags); 678 if (ret)
752 if (error) 679 return ERR_PTR(ret);
753 goto out_acl;
754 680
755 error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); 681 list_for_each_entry(ace, &n4acl->ace_head, l_ace)
756 if (error) 682 process_one_v4_ace(&state, ace);
757 goto out_acl;
758 error = other_from_v4(n4acl, &p, pacl, &pace, flags);
759 if (error)
760 goto out_acl;
761 683
762 error = -EINVAL; 684 pacl = posix_state_to_acl(&state, flags);
763 if (p->next != &n4acl->ace_head)
764 goto out_acl;
765 if (pace != pacl->a_entries + pacl->a_count)
766 goto out_acl;
767 685
768 sort_pacl(pacl); 686 free_state(&state);
769 687
770 return pacl; 688 if (!IS_ERR(pacl))
771out_acl: 689 sort_pacl(pacl);
772 posix_acl_release(pacl);
773out_err:
774 pacl = ERR_PTR(error);
775 return pacl; 690 return pacl;
776} 691}
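
Taken together, the rewrite replaces the old single-pass parser -- which insisted the ACEs arrive in one canonical order (owner pair, named users, mask/group runs, other pair) and returned -EINVAL otherwise -- with an accumulate-then-emit pipeline that accepts ACEs in any order. sort_pacl() still runs afterwards because named entries are emitted in discovery order, not the sorted order POSIX ACLs require. Condensed from _nfsv4_to_posix_one() above:

init_state(&state, n4acl->naces);                 /* worst-case sizing   */
list_for_each_entry(ace, &n4acl->ace_head, l_ace)
        process_one_v4_ace(&state, ace);          /* pass 1: accumulate  */
pacl = posix_state_to_acl(&state, flags);         /* pass 2: emit + mask */
free_state(&state);
if (!IS_ERR(pacl))
        sort_pacl(pacl);                          /* canonical ordering  */
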
777 692
@@ -785,22 +700,41 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
785 list_for_each_safe(h, n, &acl->ace_head) { 700 list_for_each_safe(h, n, &acl->ace_head) {
786 ace = list_entry(h, struct nfs4_ace, l_ace); 701 ace = list_entry(h, struct nfs4_ace, l_ace);
787 702
788 if ((ace->flag & NFS4_INHERITANCE_FLAGS) 703 if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
789 != NFS4_INHERITANCE_FLAGS) 704 ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
790 continue; 705 return -EINVAL;
791 706
792 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, 707 if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
793 ace->access_mask, ace->whotype, ace->who); 708 return -EINVAL;
794 if (error < 0)
795 goto out;
796 709
797 list_del(h); 710 switch (ace->flag & NFS4_INHERITANCE_FLAGS) {
798 kfree(ace); 711 case 0:
799 acl->naces--; 712 /* Leave this ace in the effective acl: */
713 continue;
714 case NFS4_INHERITANCE_FLAGS:
715 /* Add this ace to the default acl and remove it
716 * from the effective acl: */
717 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
718 ace->access_mask, ace->whotype, ace->who);
719 if (error)
720 return error;
721 list_del(h);
722 kfree(ace);
723 acl->naces--;
724 break;
725 case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE:
726 /* Add this ace to the default, but leave it in
727 * the effective acl as well: */
728 error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
729 ace->access_mask, ace->whotype, ace->who);
730 if (error)
731 return error;
732 break;
733 default:
734 return -EINVAL;
735 }
800 } 736 }
801 737 return 0;
802out:
803 return error;
804} 738}
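
The rewritten nfs4_acl_split() makes the inheritance handling explicit: an ACE's inheritance bits now select one of exactly three legal dispositions, and everything else is rejected up front instead of being silently skipped as before. A runnable sketch of the classification; the flag values mirror the usual NFSv4 definitions but are assumptions here:

#include <stdio.h>

#define FILE_INHERIT  0x00000001
#define DIR_INHERIT   0x00000002
#define INHERIT_ONLY  0x00000008
#define INHERITANCE   (FILE_INHERIT | DIR_INHERIT | INHERIT_ONLY)

static const char *classify(unsigned int flag)
{
        switch (flag & INHERITANCE) {
        case 0:
                return "effective acl only";
        case INHERITANCE:
                return "default acl only (moved)";
        case INHERITANCE & ~INHERIT_ONLY:
                return "both effective and default";
        default:
                return "rejected: -EINVAL";
        }
}

int main(void)
{
        printf("%s\n", classify(0));
        printf("%s\n", classify(INHERITANCE));
        printf("%s\n", classify(FILE_INHERIT | DIR_INHERIT));
        printf("%s\n", classify(FILE_INHERIT));   /* partial set: rejected */
        return 0;
}
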
805 739
806static short 740static short
@@ -930,23 +864,6 @@ nfs4_acl_write_who(int who, char *p)
930 return -1; 864 return -1;
931} 865}
932 866
933static inline int
934match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
935{
936 switch (ace->whotype) {
937 case NFS4_ACL_WHO_NAMED:
938 return who == ace->who;
939 case NFS4_ACL_WHO_OWNER:
940 return who == owner;
941 case NFS4_ACL_WHO_GROUP:
942 return who == group;
943 case NFS4_ACL_WHO_EVERYONE:
944 return 1;
945 default:
946 return 0;
947 }
948}
949
950EXPORT_SYMBOL(nfs4_acl_new); 867EXPORT_SYMBOL(nfs4_acl_new);
951EXPORT_SYMBOL(nfs4_acl_free); 868EXPORT_SYMBOL(nfs4_acl_free);
952EXPORT_SYMBOL(nfs4_acl_add_ace); 869EXPORT_SYMBOL(nfs4_acl_add_ace);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f6ca9fb3fc63..f57655a7a2b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -85,8 +85,8 @@ enum nfs_cb_opnum4 {
85/* 85/*
86* Generic encode routines from fs/nfs/nfs4xdr.c 86* Generic encode routines from fs/nfs/nfs4xdr.c
87*/ 87*/
88static inline u32 * 88static inline __be32 *
89xdr_writemem(u32 *p, const void *ptr, int nbytes) 89xdr_writemem(__be32 *p, const void *ptr, int nbytes)
90{ 90{
91 int tmp = XDR_QUADLEN(nbytes); 91 int tmp = XDR_QUADLEN(nbytes);
92 if (!tmp) 92 if (!tmp)
@@ -205,7 +205,7 @@ nfs_cb_stat_to_errno(int stat)
205static int 205static int
206encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 206encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
207{ 207{
208 u32 * p; 208 __be32 * p;
209 209
210 RESERVE_SPACE(16); 210 RESERVE_SPACE(16);
211 WRITE32(0); /* tag length is always 0 */ 211 WRITE32(0); /* tag length is always 0 */
@@ -218,7 +218,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
218static int 218static int
219encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) 219encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
220{ 220{
221 u32 *p; 221 __be32 *p;
222 int len = cb_rec->cbr_fhlen; 222 int len = cb_rec->cbr_fhlen;
223 223
224 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 224 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
@@ -231,7 +231,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
231} 231}
232 232
233static int 233static int
234nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p) 234nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
235{ 235{
236 struct xdr_stream xdrs, *xdr = &xdrs; 236 struct xdr_stream xdrs, *xdr = &xdrs;
237 237
@@ -241,7 +241,7 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
241} 241}
242 242
243static int 243static int
244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args) 244nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
245{ 245{
246 struct xdr_stream xdr; 246 struct xdr_stream xdr;
247 struct nfs4_cb_compound_hdr hdr = { 247 struct nfs4_cb_compound_hdr hdr = {
@@ -257,7 +257,7 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args
257 257
258static int 258static int
259decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 259decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
260 u32 *p; 260 __be32 *p;
261 261
262 READ_BUF(8); 262 READ_BUF(8);
263 READ32(hdr->status); 263 READ32(hdr->status);
@@ -272,7 +272,7 @@ decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
272static int 272static int
273decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 273decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
274{ 274{
275 u32 *p; 275 __be32 *p;
276 u32 op; 276 u32 op;
277 int32_t nfserr; 277 int32_t nfserr;
278 278
@@ -291,13 +291,13 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
291} 291}
292 292
293static int 293static int
294nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p) 294nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
295{ 295{
296 return 0; 296 return 0;
297} 297}
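
The u32 -> __be32 churn through this callback XDR code is type annotation, not a behavior change: __be32 marks values that are big-endian on the wire, and sparse ("make C=2 CF=-D__CHECK_ENDIAN__") then flags any mixing with host-order integers that does not go through the byte-order helpers. In kernel context:

#include <linux/types.h>        /* u32, __be32 */
#include <asm/byteorder.h>      /* cpu_to_be32() and friends */

static void endian_example(void)
{
        u32 host = 1024;
        __be32 wire;

        wire = cpu_to_be32(host);       /* fine: explicit conversion      */
        /* wire = host; */              /* sparse: incorrect type in
                                           assignment (different base
                                           types)                         */
}
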
298 298
299static int 299static int
300nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p) 300nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
301{ 301{
302 struct xdr_stream xdr; 302 struct xdr_stream xdr;
303 struct nfs4_cb_compound_hdr hdr; 303 struct nfs4_cb_compound_hdr hdr;
@@ -421,7 +421,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
421 421
422 /* Create RPC client */ 422 /* Create RPC client */
423 cb->cb_client = rpc_create(&args); 423 cb->cb_client = rpc_create(&args);
424 if (!cb->cb_client) { 424 if (IS_ERR(cb->cb_client)) {
425 dprintk("NFSD: couldn't create callback client\n"); 425 dprintk("NFSD: couldn't create callback client\n");
426 goto out_err; 426 goto out_err;
427 } 427 }
@@ -448,10 +448,10 @@ nfsd4_probe_callback(struct nfs4_client *clp)
448out_rpciod: 448out_rpciod:
449 atomic_dec(&clp->cl_count); 449 atomic_dec(&clp->cl_count);
450 rpciod_down(); 450 rpciod_down();
451 cb->cb_client = NULL;
452out_clnt: 451out_clnt:
453 rpc_shutdown_client(cb->cb_client); 452 rpc_shutdown_client(cb->cb_client);
454out_err: 453out_err:
454 cb->cb_client = NULL;
455 dprintk("NFSD: warning: no callback path to client %.*s\n", 455 dprintk("NFSD: warning: no callback path to client %.*s\n",
456 (int)clp->cl_name.len, clp->cl_name.data); 456 (int)clp->cl_name.len, clp->cl_name.data);
457} 457}
@@ -461,7 +461,7 @@ nfs4_cb_null(struct rpc_task *task, void *dummy)
461{ 461{
462 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; 462 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
463 struct nfs4_callback *cb = &clp->cl_callback; 463 struct nfs4_callback *cb = &clp->cl_callback;
464 u32 addr = htonl(cb->cb_addr); 464 __be32 addr = htonl(cb->cb_addr);
465 465
466 dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); 466 dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
467 467
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 15ded7a30a72..0a7bbdc4a10a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -67,32 +67,32 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
67 *dst = *src; 67 *dst = *src;
68} 68}
69 69
70static int 70static __be32
71do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 71do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
72{ 72{
73 int accmode, status; 73 __be32 status;
74 74
75 if (open->op_truncate && 75 if (open->op_truncate &&
76 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) 76 !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
77 return nfserr_inval; 77 return nfserr_inval;
78 78
79 accmode = MAY_NOP;
80 if (open->op_share_access & NFS4_SHARE_ACCESS_READ) 79 if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
81 accmode = MAY_READ; 80 accmode |= MAY_READ;
82 if (open->op_share_deny & NFS4_SHARE_ACCESS_WRITE) 81 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
83 accmode |= (MAY_WRITE | MAY_TRUNC); 82 accmode |= (MAY_WRITE | MAY_TRUNC);
84 accmode |= MAY_OWNER_OVERRIDE; 83 if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
84 accmode |= MAY_WRITE;
85 85
86 status = fh_verify(rqstp, current_fh, S_IFREG, accmode); 86 status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
87 87
88 return status; 88 return status;
89} 89}
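
Besides the type change, this hunk fixes a real bug: the old code tested op_share_deny where it meant op_share_access (and against the ACCESS constant rather than the DENY one), assigned instead of OR-ing MAY_READ, and granted MAY_OWNER_OVERRIDE unconditionally. The caller now supplies the base mode, so only reclaim opens get the owner override, and denying other writers correctly requires write permission yourself. The new mapping, restated compactly with placeholder constant values:

#define SHARE_ACCESS_READ  1
#define SHARE_ACCESS_WRITE 2
#define SHARE_DENY_WRITE   2
#define MAY_WRITE  0x01
#define MAY_READ   0x02
#define MAY_TRUNC  0x10

static int open_accmode(unsigned access, unsigned deny, int base)
{
        int accmode = base;     /* MAY_NOP from do_open_lookup(),
                                   MAY_OWNER_OVERRIDE from do_open_fhandle() */

        if (access & SHARE_ACCESS_READ)
                accmode |= MAY_READ;
        if (access & SHARE_ACCESS_WRITE)
                accmode |= MAY_WRITE | MAY_TRUNC;
        if (deny & SHARE_DENY_WRITE)    /* denying other writers needs
                                           write permission yourself */
                accmode |= MAY_WRITE;
        return accmode;
}
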
90 90
91static int 91static __be32
92do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 92do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
93{ 93{
94 struct svc_fh resfh; 94 struct svc_fh resfh;
95 int status; 95 __be32 status;
96 96
97 fh_init(&resfh, NFS4_FHSIZE); 97 fh_init(&resfh, NFS4_FHSIZE);
98 open->op_truncate = 0; 98 open->op_truncate = 0;
@@ -124,17 +124,17 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
124 &resfh.fh_handle.fh_base, 124 &resfh.fh_handle.fh_base,
125 resfh.fh_handle.fh_size); 125 resfh.fh_handle.fh_size);
126 126
127 status = do_open_permission(rqstp, current_fh, open); 127 status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
128 } 128 }
129 129
130 fh_put(&resfh); 130 fh_put(&resfh);
131 return status; 131 return status;
132} 132}
133 133
134static int 134static __be32
135do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 135do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
136{ 136{
137 int status; 137 __be32 status;
138 138
139 /* Only reclaims from previously confirmed clients are valid */ 139 /* Only reclaims from previously confirmed clients are valid */
140 if ((status = nfs4_check_open_reclaim(&open->op_clientid))) 140 if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
@@ -155,16 +155,16 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
155 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && 155 open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
156 (open->op_iattr.ia_size == 0); 156 (open->op_iattr.ia_size == 0);
157 157
158 status = do_open_permission(rqstp, current_fh, open); 158 status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
159 159
160 return status; 160 return status;
161} 161}
162 162
163 163
164static inline int 164static inline __be32
165nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner) 165nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner)
166{ 166{
167 int status; 167 __be32 status;
168 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", 168 dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
169 (int)open->op_fname.len, open->op_fname.data, 169 (int)open->op_fname.len, open->op_fname.data,
170 open->op_stateowner); 170 open->op_stateowner);
@@ -177,7 +177,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
177 177
178 /* check seqid for replay. set nfs4_owner */ 178 /* check seqid for replay. set nfs4_owner */
179 status = nfsd4_process_open1(open); 179 status = nfsd4_process_open1(open);
180 if (status == NFSERR_REPLAY_ME) { 180 if (status == nfserr_replay_me) {
181 struct nfs4_replay *rp = &open->op_stateowner->so_replay; 181 struct nfs4_replay *rp = &open->op_stateowner->so_replay;
182 fh_put(current_fh); 182 fh_put(current_fh);
183 current_fh->fh_handle.fh_size = rp->rp_openfh_len; 183 current_fh->fh_handle.fh_size = rp->rp_openfh_len;
@@ -188,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
188 dprintk("nfsd4_open: replay failed" 188 dprintk("nfsd4_open: replay failed"
189 " restoring previous filehandle\n"); 189 " restoring previous filehandle\n");
190 else 190 else
191 status = NFSERR_REPLAY_ME; 191 status = nfserr_replay_me;
192 } 192 }
193 if (status) 193 if (status)
194 goto out; 194 goto out;
@@ -261,7 +261,7 @@ out:
261/* 261/*
262 * filehandle-manipulating ops. 262 * filehandle-manipulating ops.
263 */ 263 */
264static inline int 264static inline __be32
265nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh) 265nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
266{ 266{
267 if (!current_fh->fh_dentry) 267 if (!current_fh->fh_dentry)
@@ -271,7 +271,7 @@ nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
271 return nfs_ok; 271 return nfs_ok;
272} 272}
273 273
274static inline int 274static inline __be32
275nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh) 275nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh)
276{ 276{
277 fh_put(current_fh); 277 fh_put(current_fh);
@@ -280,10 +280,10 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putf
280 return fh_verify(rqstp, current_fh, 0, MAY_NOP); 280 return fh_verify(rqstp, current_fh, 0, MAY_NOP);
281} 281}
282 282
283static inline int 283static inline __be32
284nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) 284nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
285{ 285{
286 int status; 286 __be32 status;
287 287
288 fh_put(current_fh); 288 fh_put(current_fh);
289 status = exp_pseudoroot(rqstp->rq_client, current_fh, 289 status = exp_pseudoroot(rqstp->rq_client, current_fh,
@@ -291,7 +291,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
291 return status; 291 return status;
292} 292}
293 293
294static inline int 294static inline __be32
295nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh) 295nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
296{ 296{
297 if (!save_fh->fh_dentry) 297 if (!save_fh->fh_dentry)
@@ -301,7 +301,7 @@ nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
301 return nfs_ok; 301 return nfs_ok;
302} 302}
303 303
304static inline int 304static inline __be32
305nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh) 305nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
306{ 306{
307 if (!current_fh->fh_dentry) 307 if (!current_fh->fh_dentry)
@@ -314,7 +314,7 @@ nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
314/* 314/*
315 * misc nfsv4 ops 315 * misc nfsv4 ops
316 */ 316 */
317static inline int 317static inline __be32
318nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access) 318nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access)
319{ 319{
320 if (access->ac_req_access & ~NFS3_ACCESS_FULL) 320 if (access->ac_req_access & ~NFS3_ACCESS_FULL)
@@ -324,10 +324,10 @@ nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_acc
324 return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported); 324 return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported);
325} 325}
326 326
327static inline int 327static inline __be32
328nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit) 328nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit)
329{ 329{
330 int status; 330 __be32 status;
331 331
332 u32 *p = (u32 *)commit->co_verf.data; 332 u32 *p = (u32 *)commit->co_verf.data;
333 *p++ = nfssvc_boot.tv_sec; 333 *p++ = nfssvc_boot.tv_sec;
@@ -339,11 +339,11 @@ nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_com
339 return status; 339 return status;
340} 340}
341 341
342static int 342static __be32
343nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create) 343nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
344{ 344{
345 struct svc_fh resfh; 345 struct svc_fh resfh;
346 int status; 346 __be32 status;
347 dev_t rdev; 347 dev_t rdev;
348 348
349 fh_init(&resfh, NFS4_FHSIZE); 349 fh_init(&resfh, NFS4_FHSIZE);
@@ -423,10 +423,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre
423 return status; 423 return status;
424} 424}
425 425
426static inline int 426static inline __be32
427nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr) 427nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr)
428{ 428{
429 int status; 429 __be32 status;
430 430
431 status = fh_verify(rqstp, current_fh, 0, MAY_NOP); 431 status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
432 if (status) 432 if (status)
@@ -442,11 +442,11 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ge
442 return nfs_ok; 442 return nfs_ok;
443} 443}
444 444
445static inline int 445static inline __be32
446nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh, 446nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
447 struct svc_fh *save_fh, struct nfsd4_link *link) 447 struct svc_fh *save_fh, struct nfsd4_link *link)
448{ 448{
449 int status = nfserr_nofilehandle; 449 __be32 status = nfserr_nofilehandle;
450 450
451 if (!save_fh->fh_dentry) 451 if (!save_fh->fh_dentry)
452 return status; 452 return status;
@@ -456,11 +456,11 @@ nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
456 return status; 456 return status;
457} 457}
458 458
459static int 459static __be32
460nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh) 460nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
461{ 461{
462 struct svc_fh tmp_fh; 462 struct svc_fh tmp_fh;
463 int ret; 463 __be32 ret;
464 464
465 fh_init(&tmp_fh, NFS4_FHSIZE); 465 fh_init(&tmp_fh, NFS4_FHSIZE);
466 if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh, 466 if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
@@ -474,16 +474,16 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
474 return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh); 474 return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh);
475} 475}
476 476
477static inline int 477static inline __be32
478nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup) 478nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup)
479{ 479{
480 return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh); 480 return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh);
481} 481}
482 482
483static inline int 483static inline __be32
484nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) 484nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read)
485{ 485{
486 int status; 486 __be32 status;
487 487
488 /* no need to check permission - this will be done in nfsd_read() */ 488 /* no need to check permission - this will be done in nfsd_read() */
489 489
@@ -508,7 +508,7 @@ out:
508 return status; 508 return status;
509} 509}
510 510
511static inline int 511static inline __be32
512nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir) 512nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir)
513{ 513{
514 u64 cookie = readdir->rd_cookie; 514 u64 cookie = readdir->rd_cookie;
@@ -531,7 +531,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_re
531 return nfs_ok; 531 return nfs_ok;
532} 532}
533 533
534static inline int 534static inline __be32
535nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink) 535nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink)
536{ 536{
537 readlink->rl_rqstp = rqstp; 537 readlink->rl_rqstp = rqstp;
@@ -539,10 +539,10 @@ nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_r
539 return nfs_ok; 539 return nfs_ok;
540} 540}
541 541
542static inline int 542static inline __be32
543nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove) 543nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove)
544{ 544{
545 int status; 545 __be32 status;
546 546
547 if (nfs4_in_grace()) 547 if (nfs4_in_grace())
548 return nfserr_grace; 548 return nfserr_grace;
@@ -556,11 +556,11 @@ nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_rem
556 return status; 556 return status;
557} 557}
558 558
559static inline int 559static inline __be32
560nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh, 560nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
561 struct svc_fh *save_fh, struct nfsd4_rename *rename) 561 struct svc_fh *save_fh, struct nfsd4_rename *rename)
562{ 562{
563 int status = nfserr_nofilehandle; 563 __be32 status = nfserr_nofilehandle;
564 564
565 if (!save_fh->fh_dentry) 565 if (!save_fh->fh_dentry)
566 return status; 566 return status;
@@ -589,10 +589,10 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
589 return status; 589 return status;
590} 590}
591 591
592static inline int 592static inline __be32
593nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr) 593nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr)
594{ 594{
595 int status = nfs_ok; 595 __be32 status = nfs_ok;
596 596
597 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { 597 if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
598 nfs4_lock_state(); 598 nfs4_lock_state();
@@ -614,13 +614,13 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
614 return status; 614 return status;
615} 615}
616 616
617static inline int 617static inline __be32
618nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write) 618nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write)
619{ 619{
620 stateid_t *stateid = &write->wr_stateid; 620 stateid_t *stateid = &write->wr_stateid;
621 struct file *filp = NULL; 621 struct file *filp = NULL;
622 u32 *p; 622 u32 *p;
623 int status = nfs_ok; 623 __be32 status = nfs_ok;
624 624
625 /* no need to check permission - this will be done in nfsd_write() */ 625 /* no need to check permission - this will be done in nfsd_write() */
626 626
@@ -646,7 +646,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
646 *p++ = nfssvc_boot.tv_usec; 646 *p++ = nfssvc_boot.tv_usec;
647 647
648 status = nfsd_write(rqstp, current_fh, filp, write->wr_offset, 648 status = nfsd_write(rqstp, current_fh, filp, write->wr_offset,
649 write->wr_vec, write->wr_vlen, write->wr_buflen, 649 rqstp->rq_vec, write->wr_vlen, write->wr_buflen,
650 &write->wr_how_written); 650 &write->wr_how_written);
651 if (filp) 651 if (filp)
652 fput(filp); 652 fput(filp);
@@ -661,12 +661,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
661 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 661 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
662 * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK. 662 * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
663 */ 663 */
664static int 664static __be32
665nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify) 665nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify)
666{ 666{
667 u32 *buf, *p; 667 __be32 *buf, *p;
668 int count; 668 int count;
669 int status; 669 __be32 status;
670 670
671 status = fh_verify(rqstp, current_fh, 0, MAY_NOP); 671 status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
672 if (status) 672 if (status)
@@ -715,7 +715,7 @@ out_kfree:
715/* 715/*
716 * NULL call. 716 * NULL call.
717 */ 717 */
718static int 718static __be32
719nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) 719nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
720{ 720{
721 return nfs_ok; 721 return nfs_ok;
@@ -731,7 +731,7 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
731/* 731/*
732 * COMPOUND call. 732 * COMPOUND call.
733 */ 733 */
734static int 734static __be32
735nfsd4_proc_compound(struct svc_rqst *rqstp, 735nfsd4_proc_compound(struct svc_rqst *rqstp,
736 struct nfsd4_compoundargs *args, 736 struct nfsd4_compoundargs *args,
737 struct nfsd4_compoundres *resp) 737 struct nfsd4_compoundres *resp)
@@ -741,7 +741,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
741 struct svc_fh *save_fh = NULL; 741 struct svc_fh *save_fh = NULL;
742 struct nfs4_stateowner *replay_owner = NULL; 742 struct nfs4_stateowner *replay_owner = NULL;
743 int slack_space; /* in words, not bytes! */ 743 int slack_space; /* in words, not bytes! */
744 int status; 744 __be32 status;
745 745
746 status = nfserr_resource; 746 status = nfserr_resource;
747 current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL); 747 current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
@@ -802,13 +802,29 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
802 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH 802 * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH
803 * require a valid current filehandle 803 * require a valid current filehandle
804 */ 804 */
805 if ((!current_fh->fh_dentry) && 805 if (!current_fh->fh_dentry) {
806 !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) || 806 if (!((op->opnum == OP_PUTFH) ||
807 (op->opnum == OP_SETCLIENTID) || 807 (op->opnum == OP_PUTROOTFH) ||
808 (op->opnum == OP_SETCLIENTID_CONFIRM) || 808 (op->opnum == OP_SETCLIENTID) ||
809 (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) || 809 (op->opnum == OP_SETCLIENTID_CONFIRM) ||
810 (op->opnum == OP_RELEASE_LOCKOWNER))) { 810 (op->opnum == OP_RENEW) ||
811 op->status = nfserr_nofilehandle; 811 (op->opnum == OP_RESTOREFH) ||
812 (op->opnum == OP_RELEASE_LOCKOWNER))) {
813 op->status = nfserr_nofilehandle;
814 goto encode_op;
815 }
816 }
817 /* Check must be done at start of each operation, except
818 * for GETATTR and ops not listed as returning NFS4ERR_MOVED
819 */
820 else if (current_fh->fh_export->ex_fslocs.migrated &&
821 !((op->opnum == OP_GETATTR) ||
822 (op->opnum == OP_PUTROOTFH) ||
823 (op->opnum == OP_PUTPUBFH) ||
824 (op->opnum == OP_RENEW) ||
825 (op->opnum == OP_SETCLIENTID) ||
826 (op->opnum == OP_RELEASE_LOCKOWNER))) {
827 op->status = nfserr_moved;
812 goto encode_op; 828 goto encode_op;
813 } 829 }
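
The restructured gate now runs two checks per operation instead of one: the existing no-filehandle whitelist, plus a new NFS4ERR_MOVED check for exports marked migrated, exempting only the operations defined not to return NFS4ERR_MOVED (and GETATTR, presumably so a client can still fetch fs_locations). Shape of the logic, with the open-coded opnum lists factored into hypothetical helpers purely for readability:

if (!current_fh->fh_dentry) {
        if (!op_ok_without_fh(op->opnum)) {     /* hypothetical helper:
                                                   PUTFH, PUTROOTFH,
                                                   SETCLIENTID{,_CONFIRM},
                                                   RENEW, RESTOREFH,
                                                   RELEASE_LOCKOWNER      */
                op->status = nfserr_nofilehandle;
                goto encode_op;
        }
} else if (current_fh->fh_export->ex_fslocs.migrated &&
           !op_ok_when_migrated(op->opnum)) {   /* hypothetical helper:
                                                   GETATTR, PUTROOTFH,
                                                   PUTPUBFH, RENEW,
                                                   SETCLIENTID,
                                                   RELEASE_LOCKOWNER      */
        op->status = nfserr_moved;
        goto encode_op;
}
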
814 switch (op->opnum) { 830 switch (op->opnum) {
@@ -921,7 +937,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
921 } 937 }
922 938
923encode_op: 939encode_op:
924 if (op->status == NFSERR_REPLAY_ME) { 940 if (op->status == nfserr_replay_me) {
925 op->replay = &replay_owner->so_replay; 941 op->replay = &replay_owner->so_replay;
926 nfsd4_encode_replay(resp, op); 942 nfsd4_encode_replay(resp, op);
927 status = op->status = op->replay->rp_status; 943 status = op->status = op->replay->rp_status;
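
The NFSERR_REPLAY_ME -> nfserr_replay_me renames here and in nfs4state.c belong to the same __be32 sweep: the replay sentinel is compared against __be32 status values, so it must be one, even though it is internal and never encoded for the client. Its likely definition -- the numeric value is carried over from the old macro and is an assumption here:

#define NFSERR_REPLAY_ME   11001                 /* host-order, historical */
#define nfserr_replay_me   cpu_to_be32(NFSERR_REPLAY_ME)
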
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1cbd2e4ee122..e9d07704680e 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -83,13 +83,13 @@ md5_to_hex(char *out, char *md5)
83 *out = '\0'; 83 *out = '\0';
84} 84}
85 85
86int 86__be32
87nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) 87nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
88{ 88{
89 struct xdr_netobj cksum; 89 struct xdr_netobj cksum;
90 struct hash_desc desc; 90 struct hash_desc desc;
91 struct scatterlist sg[1]; 91 struct scatterlist sg[1];
92 int status = nfserr_resource; 92 __be32 status = nfserr_resource;
93 93
94 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 94 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
95 clname->len, clname->data); 95 clname->len, clname->data);
@@ -193,7 +193,7 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
193 struct dentry_list *child; 193 struct dentry_list *child;
194 194
195 if (name && isdotent(name, namlen)) 195 if (name && isdotent(name, namlen))
196 return nfs_ok; 196 return 0;
197 dentry = lookup_one_len(name, parent, namlen); 197 dentry = lookup_one_len(name, parent, namlen);
198 if (IS_ERR(dentry)) 198 if (IS_ERR(dentry))
199 return PTR_ERR(dentry); 199 return PTR_ERR(dentry);
@@ -333,14 +333,14 @@ purge_old(struct dentry *parent, struct dentry *child)
333 int status; 333 int status;
334 334
335 if (nfs4_has_reclaimed_state(child->d_name.name)) 335 if (nfs4_has_reclaimed_state(child->d_name.name))
336 return nfs_ok; 336 return 0;
337 337
338 status = nfsd4_clear_clid_dir(parent, child); 338 status = nfsd4_clear_clid_dir(parent, child);
339 if (status) 339 if (status)
340 printk("failed to remove client recovery directory %s\n", 340 printk("failed to remove client recovery directory %s\n",
341 child->d_name.name); 341 child->d_name.name);
342 /* Keep trying, success or failure: */ 342 /* Keep trying, success or failure: */
343 return nfs_ok; 343 return 0;
344} 344}
345 345
346void 346void
@@ -365,10 +365,10 @@ load_recdir(struct dentry *parent, struct dentry *child)
365 printk("nfsd4: illegal name %s in recovery directory\n", 365 printk("nfsd4: illegal name %s in recovery directory\n",
366 child->d_name.name); 366 child->d_name.name);
367 /* Keep trying; maybe the others are OK: */ 367 /* Keep trying; maybe the others are OK: */
368 return nfs_ok; 368 return 0;
369 } 369 }
370 nfs4_client_to_reclaim(child->d_name.name); 370 nfs4_client_to_reclaim(child->d_name.name);
371 return nfs_ok; 371 return 0;
372} 372}
373 373
374int 374int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ebcf226a9e4a..293b6495829f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -710,10 +710,10 @@ out_err:
710 * as described above. 710 * as described above.
711 * 711 *
712 */ 712 */
713int 713__be32
714nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) 714nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
715{ 715{
716 u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; 716 __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
717 struct xdr_netobj clname = { 717 struct xdr_netobj clname = {
718 .len = setclid->se_namelen, 718 .len = setclid->se_namelen,
719 .data = setclid->se_name, 719 .data = setclid->se_name,
@@ -721,7 +721,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
721 nfs4_verifier clverifier = setclid->se_verf; 721 nfs4_verifier clverifier = setclid->se_verf;
722 unsigned int strhashval; 722 unsigned int strhashval;
723 struct nfs4_client *conf, *unconf, *new; 723 struct nfs4_client *conf, *unconf, *new;
724 int status; 724 __be32 status;
725 char dname[HEXDIR_LEN]; 725 char dname[HEXDIR_LEN];
726 726
727 if (!check_name(clname)) 727 if (!check_name(clname))
@@ -875,14 +875,14 @@ out:
875 * 875 *
876 * NOTE: callback information will be processed here in a future patch 876 * NOTE: callback information will be processed here in a future patch
877 */ 877 */
878int 878__be32
879nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm) 879nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm)
880{ 880{
881 u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; 881 __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
882 struct nfs4_client *conf, *unconf; 882 struct nfs4_client *conf, *unconf;
883 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 883 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
884 clientid_t * clid = &setclientid_confirm->sc_clientid; 884 clientid_t * clid = &setclientid_confirm->sc_clientid;
885 int status; 885 __be32 status;
886 886
887 if (STALE_CLIENTID(clid)) 887 if (STALE_CLIENTID(clid))
888 return nfserr_stale_clientid; 888 return nfserr_stale_clientid;
@@ -1280,13 +1280,13 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
1280 * Called to check deny when READ with all zero stateid or 1280 * Called to check deny when READ with all zero stateid or
1281 * WRITE with all zero or all one stateid 1281 * WRITE with all zero or all one stateid
1282 */ 1282 */
1283static int 1283static __be32
1284nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) 1284nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
1285{ 1285{
1286 struct inode *ino = current_fh->fh_dentry->d_inode; 1286 struct inode *ino = current_fh->fh_dentry->d_inode;
1287 struct nfs4_file *fp; 1287 struct nfs4_file *fp;
1288 struct nfs4_stateid *stp; 1288 struct nfs4_stateid *stp;
1289 int ret; 1289 __be32 ret;
1290 1290
1291 dprintk("NFSD: nfs4_share_conflict\n"); 1291 dprintk("NFSD: nfs4_share_conflict\n");
1292 1292
@@ -1444,7 +1444,7 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1444}; 1444};
1445 1445
1446 1446
1447int 1447__be32
1448nfsd4_process_open1(struct nfsd4_open *open) 1448nfsd4_process_open1(struct nfsd4_open *open)
1449{ 1449{
1450 clientid_t *clientid = &open->op_clientid; 1450 clientid_t *clientid = &open->op_clientid;
@@ -1477,7 +1477,7 @@ nfsd4_process_open1(struct nfsd4_open *open)
1477 } 1477 }
1478 if (open->op_seqid == sop->so_seqid - 1) { 1478 if (open->op_seqid == sop->so_seqid - 1) {
1479 if (sop->so_replay.rp_buflen) 1479 if (sop->so_replay.rp_buflen)
1480 return NFSERR_REPLAY_ME; 1480 return nfserr_replay_me;
1481 /* The original OPEN failed so spectacularly 1481 /* The original OPEN failed so spectacularly
1482 * that we don't even have replay data saved! 1482 * that we don't even have replay data saved!
1483 * Therefore, we have no choice but to continue 1483 * Therefore, we have no choice but to continue
@@ -1501,7 +1501,7 @@ renew:
1501 return nfs_ok; 1501 return nfs_ok;
1502} 1502}
1503 1503
1504static inline int 1504static inline __be32
1505nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) 1505nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
1506{ 1506{
1507 if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) 1507 if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
@@ -1522,12 +1522,12 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
1522 return NULL; 1522 return NULL;
1523} 1523}
1524 1524
1525static int 1525static __be32
1526nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, 1526nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
1527 struct nfs4_delegation **dp) 1527 struct nfs4_delegation **dp)
1528{ 1528{
1529 int flags; 1529 int flags;
1530 int status = nfserr_bad_stateid; 1530 __be32 status = nfserr_bad_stateid;
1531 1531
1532 *dp = find_delegation_file(fp, &open->op_delegate_stateid); 1532 *dp = find_delegation_file(fp, &open->op_delegate_stateid);
1533 if (*dp == NULL) 1533 if (*dp == NULL)
@@ -1546,11 +1546,11 @@ out:
1546 return nfs_ok; 1546 return nfs_ok;
1547} 1547}
1548 1548
1549static int 1549static __be32
1550nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) 1550nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
1551{ 1551{
1552 struct nfs4_stateid *local; 1552 struct nfs4_stateid *local;
1553 int status = nfserr_share_denied; 1553 __be32 status = nfserr_share_denied;
1554 struct nfs4_stateowner *sop = open->op_stateowner; 1554 struct nfs4_stateowner *sop = open->op_stateowner;
1555 1555
1556 list_for_each_entry(local, &fp->fi_stateids, st_perfile) { 1556 list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
@@ -1575,7 +1575,7 @@ nfs4_alloc_stateid(void)
1575 return kmem_cache_alloc(stateid_slab, GFP_KERNEL); 1575 return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
1576} 1576}
1577 1577
1578static int 1578static __be32
1579nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, 1579nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1580 struct nfs4_delegation *dp, 1580 struct nfs4_delegation *dp,
1581 struct svc_fh *cur_fh, int flags) 1581 struct svc_fh *cur_fh, int flags)
@@ -1590,7 +1590,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1590 get_file(dp->dl_vfs_file); 1590 get_file(dp->dl_vfs_file);
1591 stp->st_vfs_file = dp->dl_vfs_file; 1591 stp->st_vfs_file = dp->dl_vfs_file;
1592 } else { 1592 } else {
1593 int status; 1593 __be32 status;
1594 status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, 1594 status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
1595 &stp->st_vfs_file); 1595 &stp->st_vfs_file);
1596 if (status) { 1596 if (status) {
@@ -1604,7 +1604,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
1604 return 0; 1604 return 0;
1605} 1605}
1606 1606
1607static inline int 1607static inline __be32
1608nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, 1608nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
1609 struct nfsd4_open *open) 1609 struct nfsd4_open *open)
1610{ 1610{
@@ -1619,22 +1619,22 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
1619 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); 1619 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
1620} 1620}
1621 1621
1622static int 1622static __be32
1623nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 1623nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
1624{ 1624{
1625 struct file *filp = stp->st_vfs_file; 1625 struct file *filp = stp->st_vfs_file;
1626 struct inode *inode = filp->f_dentry->d_inode; 1626 struct inode *inode = filp->f_dentry->d_inode;
1627 unsigned int share_access, new_writer; 1627 unsigned int share_access, new_writer;
1628 int status; 1628 __be32 status;
1629 1629
1630 set_access(&share_access, stp->st_access_bmap); 1630 set_access(&share_access, stp->st_access_bmap);
1631 new_writer = (~share_access) & open->op_share_access 1631 new_writer = (~share_access) & open->op_share_access
1632 & NFS4_SHARE_ACCESS_WRITE; 1632 & NFS4_SHARE_ACCESS_WRITE;
1633 1633
1634 if (new_writer) { 1634 if (new_writer) {
1635 status = get_write_access(inode); 1635 int err = get_write_access(inode);
1636 if (status) 1636 if (err)
1637 return nfserrno(status); 1637 return nfserrno(err);
1638 } 1638 }
1639 status = nfsd4_truncate(rqstp, cur_fh, open); 1639 status = nfsd4_truncate(rqstp, cur_fh, open);
1640 if (status) { 1640 if (status) {
@@ -1738,14 +1738,14 @@ out:
1738/* 1738/*
1739 * called with nfs4_lock_state() held. 1739 * called with nfs4_lock_state() held.
1740 */ 1740 */
1741int 1741__be32
1742nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 1742nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1743{ 1743{
1744 struct nfs4_file *fp = NULL; 1744 struct nfs4_file *fp = NULL;
1745 struct inode *ino = current_fh->fh_dentry->d_inode; 1745 struct inode *ino = current_fh->fh_dentry->d_inode;
1746 struct nfs4_stateid *stp = NULL; 1746 struct nfs4_stateid *stp = NULL;
1747 struct nfs4_delegation *dp = NULL; 1747 struct nfs4_delegation *dp = NULL;
1748 int status; 1748 __be32 status;
1749 1749
1750 status = nfserr_inval; 1750 status = nfserr_inval;
1751 if (!access_valid(open->op_share_access) 1751 if (!access_valid(open->op_share_access)
@@ -1833,11 +1833,11 @@ static struct work_struct laundromat_work;
1833static void laundromat_main(void *); 1833static void laundromat_main(void *);
1834static DECLARE_WORK(laundromat_work, laundromat_main, NULL); 1834static DECLARE_WORK(laundromat_work, laundromat_main, NULL);
1835 1835
1836int 1836__be32
1837nfsd4_renew(clientid_t *clid) 1837nfsd4_renew(clientid_t *clid)
1838{ 1838{
1839 struct nfs4_client *clp; 1839 struct nfs4_client *clp;
1840 int status; 1840 __be32 status;
1841 1841
1842 nfs4_lock_state(); 1842 nfs4_lock_state();
1843 dprintk("process_renew(%08x/%08x): starting\n", 1843 dprintk("process_renew(%08x/%08x): starting\n",
@@ -1996,9 +1996,9 @@ access_permit_write(unsigned long access_bmap)
1996} 1996}
1997 1997
1998static 1998static
1999int nfs4_check_openmode(struct nfs4_stateid *stp, int flags) 1999__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
2000{ 2000{
2001 int status = nfserr_openmode; 2001 __be32 status = nfserr_openmode;
2002 2002
2003 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) 2003 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
2004 goto out; 2004 goto out;
@@ -2009,7 +2009,7 @@ out:
2009 return status; 2009 return status;
2010} 2010}
2011 2011
2012static inline int 2012static inline __be32
2013check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2013check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2014{ 2014{
2015 /* Trying to call delegreturn with a special stateid? Yuch: */ 2015 /* Trying to call delegreturn with a special stateid? Yuch: */
@@ -2043,14 +2043,14 @@ io_during_grace_disallowed(struct inode *inode, int flags)
2043/* 2043/*
2044* Checks for stateid operations 2044* Checks for stateid operations
2045*/ 2045*/
2046int 2046__be32
2047nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2047nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
2048{ 2048{
2049 struct nfs4_stateid *stp = NULL; 2049 struct nfs4_stateid *stp = NULL;
2050 struct nfs4_delegation *dp = NULL; 2050 struct nfs4_delegation *dp = NULL;
2051 stateid_t *stidp; 2051 stateid_t *stidp;
2052 struct inode *ino = current_fh->fh_dentry->d_inode; 2052 struct inode *ino = current_fh->fh_dentry->d_inode;
2053 int status; 2053 __be32 status;
2054 2054
2055 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", 2055 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2056 stateid->si_boot, stateid->si_stateownerid, 2056 stateid->si_boot, stateid->si_stateownerid,
@@ -2125,7 +2125,7 @@ setlkflg (int type)
2125/* 2125/*
2126 * Checks for sequence id mutating operations. 2126 * Checks for sequence id mutating operations.
2127 */ 2127 */
2128static int 2128static __be32
2129nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2129nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2130{ 2130{
2131 struct nfs4_stateid *stp; 2131 struct nfs4_stateid *stp;
@@ -2169,7 +2169,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2169 clientid_t *lockclid = &lock->v.new.clientid; 2169 clientid_t *lockclid = &lock->v.new.clientid;
2170 struct nfs4_client *clp = sop->so_client; 2170 struct nfs4_client *clp = sop->so_client;
2171 int lkflg = 0; 2171 int lkflg = 0;
2172 int status; 2172 __be32 status;
2173 2173
2174 lkflg = setlkflg(lock->lk_type); 2174 lkflg = setlkflg(lock->lk_type);
2175 2175
@@ -2233,7 +2233,7 @@ check_replay:
2233 if (seqid == sop->so_seqid - 1) { 2233 if (seqid == sop->so_seqid - 1) {
2234 dprintk("NFSD: preprocess_seqid_op: retransmission?\n"); 2234 dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
2235 /* indicate replay to calling function */ 2235 /* indicate replay to calling function */
2236 return NFSERR_REPLAY_ME; 2236 return nfserr_replay_me;
2237 } 2237 }
2238 printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", 2238 printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
2239 sop->so_seqid, seqid); 2239 sop->so_seqid, seqid);
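The replay branch above encodes NFSv4's seqid discipline: a state-mutating operation from a given owner must carry the owner's last seqid plus one, and a request arriving with exactly the last seqid is taken as a retransmission whose cached reply should be resent. A minimal sketch of that decision, with illustrative status values standing in for the kernel's __be32 codes:

/* Sketch of the seqid replay check above; status values are
 * illustrative stand-ins for the kernel's __be32 codes.
 */
#include <stdio.h>
#include <stdint.h>

enum { nfs_ok = 0, nfserr_bad_seqid = 10026, nfserr_replay_me = 11001 };

static int check_seqid(uint32_t so_seqid, uint32_t seqid)
{
        if (seqid == so_seqid)
                return nfs_ok;              /* new request, in sequence */
        if (seqid == so_seqid - 1)
                return nfserr_replay_me;    /* retransmission: resend cached reply */
        return nfserr_bad_seqid;            /* out of sequence */
}

int main(void)
{
        printf("%d %d %d\n",
               check_seqid(7, 7), check_seqid(7, 6), check_seqid(7, 9));
        return 0;
}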
@@ -2241,10 +2241,10 @@ check_replay:
2241 return nfserr_bad_seqid; 2241 return nfserr_bad_seqid;
2242} 2242}
2243 2243
2244int 2244__be32
2245nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner) 2245nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner)
2246{ 2246{
2247 int status; 2247 __be32 status;
2248 struct nfs4_stateowner *sop; 2248 struct nfs4_stateowner *sop;
2249 struct nfs4_stateid *stp; 2249 struct nfs4_stateid *stp;
2250 2250
@@ -2310,10 +2310,10 @@ reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
2310 } 2310 }
2311} 2311}
2312 2312
2313int 2313__be32
2314nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner) 2314nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner)
2315{ 2315{
2316 int status; 2316 __be32 status;
2317 struct nfs4_stateid *stp; 2317 struct nfs4_stateid *stp;
2318 unsigned int share_access; 2318 unsigned int share_access;
2319 2319
@@ -2365,10 +2365,10 @@ out:
2365/* 2365/*
2366 * nfs4_unlock_state() called after encode 2366 * nfs4_unlock_state() called after encode
2367 */ 2367 */
2368int 2368__be32
2369nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner) 2369nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner)
2370{ 2370{
2371 int status; 2371 __be32 status;
2372 struct nfs4_stateid *stp; 2372 struct nfs4_stateid *stp;
2373 2373
2374 dprintk("NFSD: nfsd4_close on file %.*s\n", 2374 dprintk("NFSD: nfsd4_close on file %.*s\n",
@@ -2404,10 +2404,10 @@ out:
2404 return status; 2404 return status;
2405} 2405}
2406 2406
2407int 2407__be32
2408nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr) 2408nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr)
2409{ 2409{
2410 int status; 2410 __be32 status;
2411 2411
2412 if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0))) 2412 if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
2413 goto out; 2413 goto out;
@@ -2635,7 +2635,7 @@ check_lock_length(u64 offset, u64 length)
2635/* 2635/*
2636 * LOCK operation 2636 * LOCK operation
2637 */ 2637 */
2638int 2638__be32
2639nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner) 2639nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner)
2640{ 2640{
2641 struct nfs4_stateowner *open_sop = NULL; 2641 struct nfs4_stateowner *open_sop = NULL;
@@ -2644,8 +2644,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2644 struct file *filp; 2644 struct file *filp;
2645 struct file_lock file_lock; 2645 struct file_lock file_lock;
2646 struct file_lock conflock; 2646 struct file_lock conflock;
2647 int status = 0; 2647 __be32 status = 0;
2648 unsigned int strhashval; 2648 unsigned int strhashval;
2649 int err;
2649 2650
2650 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 2651 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
2651 (long long) lock->lk_offset, 2652 (long long) lock->lk_offset,
@@ -2758,13 +2759,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2758 * locks_copy_lock: */ 2759 * locks_copy_lock: */
2759 conflock.fl_ops = NULL; 2760 conflock.fl_ops = NULL;
2760 conflock.fl_lmops = NULL; 2761 conflock.fl_lmops = NULL;
2761 status = posix_lock_file_conf(filp, &file_lock, &conflock); 2762 err = posix_lock_file_conf(filp, &file_lock, &conflock);
2762 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status); 2763 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status);
2763 switch (-status) { 2764 switch (-err) {
2764 case 0: /* success! */ 2765 case 0: /* success! */
2765 update_stateid(&lock_stp->st_stateid); 2766 update_stateid(&lock_stp->st_stateid);
2766 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 2767 memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid,
2767 sizeof(stateid_t)); 2768 sizeof(stateid_t));
2769 status = 0;
2768 break; 2770 break;
2769 case (EAGAIN): /* conflock holds conflicting lock */ 2771 case (EAGAIN): /* conflock holds conflicting lock */
2770 status = nfserr_denied; 2772 status = nfserr_denied;
@@ -2775,7 +2777,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2775 status = nfserr_deadlock; 2777 status = nfserr_deadlock;
2776 break; 2778 break;
2777 default: 2779 default:
2778 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",status); 2780 dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
2779 status = nfserr_resource; 2781 status = nfserr_resource;
2780 break; 2782 break;
2781 } 2783 }
@@ -2793,14 +2795,14 @@ out:
2793/* 2795/*
2794 * LOCKT operation 2796 * LOCKT operation
2795 */ 2797 */
2796int 2798__be32
2797nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt) 2799nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt)
2798{ 2800{
2799 struct inode *inode; 2801 struct inode *inode;
2800 struct file file; 2802 struct file file;
2801 struct file_lock file_lock; 2803 struct file_lock file_lock;
2802 struct file_lock conflock; 2804 struct file_lock conflock;
2803 int status; 2805 __be32 status;
2804 2806
2805 if (nfs4_in_grace()) 2807 if (nfs4_in_grace())
2806 return nfserr_grace; 2808 return nfserr_grace;
@@ -2873,13 +2875,14 @@ out:
2873 return status; 2875 return status;
2874} 2876}
2875 2877
2876int 2878__be32
2877nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner) 2879nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner)
2878{ 2880{
2879 struct nfs4_stateid *stp; 2881 struct nfs4_stateid *stp;
2880 struct file *filp = NULL; 2882 struct file *filp = NULL;
2881 struct file_lock file_lock; 2883 struct file_lock file_lock;
2882 int status; 2884 __be32 status;
2885 int err;
2883 2886
2884 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", 2887 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
2885 (long long) locku->lu_offset, 2888 (long long) locku->lu_offset,
@@ -2917,8 +2920,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
2917 /* 2920 /*
2918 * Try to unlock the file in the VFS. 2921 * Try to unlock the file in the VFS.
2919 */ 2922 */
2920 status = posix_lock_file(filp, &file_lock); 2923 err = posix_lock_file(filp, &file_lock);
2921 if (status) { 2924 if (err) {
2922 dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n"); 2925 dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
2923 goto out_nfserr; 2926 goto out_nfserr;
2924 } 2927 }
@@ -2937,7 +2940,7 @@ out:
2937 return status; 2940 return status;
2938 2941
2939out_nfserr: 2942out_nfserr:
2940 status = nfserrno(status); 2943 status = nfserrno(err);
2941 goto out; 2944 goto out;
2942} 2945}
2943 2946
@@ -2965,7 +2968,7 @@ out:
2965 return status; 2968 return status;
2966} 2969}
2967 2970
2968int 2971__be32
2969nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner) 2972nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner)
2970{ 2973{
2971 clientid_t *clid = &rlockowner->rl_clientid; 2974 clientid_t *clid = &rlockowner->rl_clientid;
@@ -2974,7 +2977,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *
2974 struct xdr_netobj *owner = &rlockowner->rl_owner; 2977 struct xdr_netobj *owner = &rlockowner->rl_owner;
2975 struct list_head matches; 2978 struct list_head matches;
2976 int i; 2979 int i;
2977 int status; 2980 __be32 status;
2978 2981
2979 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 2982 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
2980 clid->cl_boot, clid->cl_id); 2983 clid->cl_boot, clid->cl_id);
@@ -3111,7 +3114,7 @@ nfs4_find_reclaim_client(clientid_t *clid)
3111/* 3114/*
3112* Called from OPEN. Look for clientid in reclaim list. 3115* Called from OPEN. Look for clientid in reclaim list.
3113*/ 3116*/
3114int 3117__be32
3115nfs4_check_open_reclaim(clientid_t *clid) 3118nfs4_check_open_reclaim(clientid_t *clid)
3116{ 3119{
3117 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad; 3120 return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
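The conversions above all follow one pattern: host-endian errno values from the VFS stay in a plain int (err), on-the-wire NFSv4 status codes travel in a big-endian __be32 (status), and nfserrno() is the only bridge between the two, so sparse can flag any accidental mixing. A minimal userspace sketch of that split, assuming __be32 can be modelled as a uint32_t holding network-order bits; the harness and the VFS stub are hypothetical, not kernel code:

/* Minimal userspace sketch of the int/__be32 split applied above.
 * Assumptions: __be32 modelled as a uint32_t of network-order bits;
 * NFS4ERR_IO == 5 per RFC 3530; vfs_op_stub() is hypothetical.
 */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>
#include <arpa/inet.h>

typedef uint32_t be32;              /* stand-in for the kernel's __be32 */

#define nfs_ok     ((be32)0)
#define nfserr_io  ((be32)htonl(5))

/* Host errno -> wire status: the single sanctioned conversion point. */
static be32 nfserrno(int host_err)
{
        switch (host_err) {
        case 0:    return nfs_ok;
        case -EIO: return nfserr_io;    /* the kernel passes negative errnos */
        default:   return nfserr_io;    /* table collapsed for the sketch */
        }
}

static int vfs_op_stub(void) { return -EIO; }   /* hypothetical VFS call */

int main(void)
{
        int err;        /* host-endian errno from the VFS layer */
        be32 status;    /* network-endian status handed to the XDR layer */

        err = vfs_op_stub();
        status = err ? nfserrno(err) : nfs_ok;
        printf("err=%d status=0x%08x (wire byte order)\n", err, (unsigned)status);
        return 0;
}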
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5be00436b5b8..f3f239db04bb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -60,8 +60,16 @@
60 60
61#define NFSDDBG_FACILITY NFSDDBG_XDR 61#define NFSDDBG_FACILITY NFSDDBG_XDR
62 62
63static int 63/*
64check_filename(char *str, int len, int err) 64 * As per the referral draft, the fsid for a referral MUST be different from the fsid of the containing
65 * directory, in order to indicate to the client that a filesystem boundary is present.
66 * We use a fixed fsid for a referral.
67 */
68#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL
69#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
70
71static __be32
72check_filename(char *str, int len, __be32 err)
65{ 73{
66 int i; 74 int i;
67 75
@@ -86,8 +94,8 @@ check_filename(char *str, int len, int err)
86 * consistent with the style used in NFSv2/v3... 94 * consistent with the style used in NFSv2/v3...
87 */ 95 */
88#define DECODE_HEAD \ 96#define DECODE_HEAD \
89 u32 *p; \ 97 __be32 *p; \
90 int status 98 __be32 status
91#define DECODE_TAIL \ 99#define DECODE_TAIL \
92 status = 0; \ 100 status = 0; \
93out: \ 101out: \
@@ -136,13 +144,13 @@ xdr_error: \
136 } \ 144 } \
137} while (0) 145} while (0)
138 146
139static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) 147static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
140{ 148{
141 /* We want more bytes than seem to be available. 149 /* We want more bytes than seem to be available.
142 * Maybe we need a new page, maybe we have just run out 150 * Maybe we need a new page, maybe we have just run out
143 */ 151 */
144 int avail = (char*)argp->end - (char*)argp->p; 152 int avail = (char*)argp->end - (char*)argp->p;
145 u32 *p; 153 __be32 *p;
146 if (avail + argp->pagelen < nbytes) 154 if (avail + argp->pagelen < nbytes)
147 return NULL; 155 return NULL;
148 if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ 156 if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
@@ -189,7 +197,7 @@ defer_free(struct nfsd4_compoundargs *argp,
189 return 0; 197 return 0;
190} 198}
191 199
192static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) 200static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
193{ 201{
194 void *new = NULL; 202 void *new = NULL;
195 if (p == argp->tmp) { 203 if (p == argp->tmp) {
@@ -209,7 +217,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
209} 217}
210 218
211 219
212static int 220static __be32
213nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 221nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
214{ 222{
215 u32 bmlen; 223 u32 bmlen;
@@ -232,13 +240,14 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
232 DECODE_TAIL; 240 DECODE_TAIL;
233} 241}
234 242
235static int 243static __be32
236nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
237 struct nfs4_acl **acl) 245 struct nfs4_acl **acl)
238{ 246{
239 int expected_len, len = 0; 247 int expected_len, len = 0;
240 u32 dummy32; 248 u32 dummy32;
241 char *buf; 249 char *buf;
250 int host_err;
242 251
243 DECODE_HEAD; 252 DECODE_HEAD;
244 iattr->ia_valid = 0; 253 iattr->ia_valid = 0;
@@ -272,7 +281,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
272 281
273 *acl = nfs4_acl_new(); 282 *acl = nfs4_acl_new();
274 if (*acl == NULL) { 283 if (*acl == NULL) {
275 status = -ENOMEM; 284 host_err = -ENOMEM;
276 goto out_nfserr; 285 goto out_nfserr;
277 } 286 }
278 defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); 287 defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
@@ -287,20 +296,20 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
287 len += XDR_QUADLEN(dummy32) << 2; 296 len += XDR_QUADLEN(dummy32) << 2;
288 READMEM(buf, dummy32); 297 READMEM(buf, dummy32);
289 ace.whotype = nfs4_acl_get_whotype(buf, dummy32); 298 ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
290 status = 0; 299 host_err = 0;
291 if (ace.whotype != NFS4_ACL_WHO_NAMED) 300 if (ace.whotype != NFS4_ACL_WHO_NAMED)
292 ace.who = 0; 301 ace.who = 0;
293 else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) 302 else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
294 status = nfsd_map_name_to_gid(argp->rqstp, 303 host_err = nfsd_map_name_to_gid(argp->rqstp,
295 buf, dummy32, &ace.who); 304 buf, dummy32, &ace.who);
296 else 305 else
297 status = nfsd_map_name_to_uid(argp->rqstp, 306 host_err = nfsd_map_name_to_uid(argp->rqstp,
298 buf, dummy32, &ace.who); 307 buf, dummy32, &ace.who);
299 if (status) 308 if (host_err)
300 goto out_nfserr; 309 goto out_nfserr;
301 status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, 310 host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
302 ace.access_mask, ace.whotype, ace.who); 311 ace.access_mask, ace.whotype, ace.who);
303 if (status) 312 if (host_err)
304 goto out_nfserr; 313 goto out_nfserr;
305 } 314 }
306 } else 315 } else
@@ -319,7 +328,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
319 READ_BUF(dummy32); 328 READ_BUF(dummy32);
320 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
321 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
322 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 331 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
323 goto out_nfserr; 332 goto out_nfserr;
324 iattr->ia_valid |= ATTR_UID; 333 iattr->ia_valid |= ATTR_UID;
325 } 334 }
@@ -330,7 +339,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
330 READ_BUF(dummy32); 339 READ_BUF(dummy32);
331 len += (XDR_QUADLEN(dummy32) << 2); 340 len += (XDR_QUADLEN(dummy32) << 2);
332 READMEM(buf, dummy32); 341 READMEM(buf, dummy32);
333 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 342 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
334 goto out_nfserr; 343 goto out_nfserr;
335 iattr->ia_valid |= ATTR_GID; 344 iattr->ia_valid |= ATTR_GID;
336 } 345 }
@@ -406,11 +415,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
406 DECODE_TAIL; 415 DECODE_TAIL;
407 416
408out_nfserr: 417out_nfserr:
409 status = nfserrno(status); 418 status = nfserrno(host_err);
410 goto out; 419 goto out;
411} 420}
412 421
413static int 422static __be32
414nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) 423nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
415{ 424{
416 DECODE_HEAD; 425 DECODE_HEAD;
@@ -421,7 +430,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
421 DECODE_TAIL; 430 DECODE_TAIL;
422} 431}
423 432
424static int 433static __be32
425nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) 434nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
426{ 435{
427 DECODE_HEAD; 436 DECODE_HEAD;
@@ -436,7 +445,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
436} 445}
437 446
438 447
439static int 448static __be32
440nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) 449nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
441{ 450{
442 DECODE_HEAD; 451 DECODE_HEAD;
@@ -448,7 +457,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
448 DECODE_TAIL; 457 DECODE_TAIL;
449} 458}
450 459
451static int 460static __be32
452nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) 461nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
453{ 462{
454 DECODE_HEAD; 463 DECODE_HEAD;
@@ -488,7 +497,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
488 DECODE_TAIL; 497 DECODE_TAIL;
489} 498}
490 499
491static inline int 500static inline __be32
492nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) 501nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
493{ 502{
494 DECODE_HEAD; 503 DECODE_HEAD;
@@ -500,13 +509,13 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
500 DECODE_TAIL; 509 DECODE_TAIL;
501} 510}
502 511
503static inline int 512static inline __be32
504nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) 513nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
505{ 514{
506 return nfsd4_decode_bitmap(argp, getattr->ga_bmval); 515 return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
507} 516}
508 517
509static int 518static __be32
510nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) 519nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
511{ 520{
512 DECODE_HEAD; 521 DECODE_HEAD;
@@ -521,7 +530,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
521 DECODE_TAIL; 530 DECODE_TAIL;
522} 531}
523 532
524static int 533static __be32
525nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) 534nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
526{ 535{
527 DECODE_HEAD; 536 DECODE_HEAD;
@@ -560,7 +569,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
560 DECODE_TAIL; 569 DECODE_TAIL;
561} 570}
562 571
563static int 572static __be32
564nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) 573nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
565{ 574{
566 DECODE_HEAD; 575 DECODE_HEAD;
@@ -579,7 +588,7 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
579 DECODE_TAIL; 588 DECODE_TAIL;
580} 589}
581 590
582static int 591static __be32
583nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) 592nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
584{ 593{
585 DECODE_HEAD; 594 DECODE_HEAD;
@@ -598,7 +607,7 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
598 DECODE_TAIL; 607 DECODE_TAIL;
599} 608}
600 609
601static int 610static __be32
602nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) 611nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
603{ 612{
604 DECODE_HEAD; 613 DECODE_HEAD;
@@ -613,7 +622,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
613 DECODE_TAIL; 622 DECODE_TAIL;
614} 623}
615 624
616static int 625static __be32
617nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) 626nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
618{ 627{
619 DECODE_HEAD; 628 DECODE_HEAD;
@@ -691,7 +700,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
691 DECODE_TAIL; 700 DECODE_TAIL;
692} 701}
693 702
694static int 703static __be32
695nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) 704nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
696{ 705{
697 DECODE_HEAD; 706 DECODE_HEAD;
@@ -705,7 +714,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
705 DECODE_TAIL; 714 DECODE_TAIL;
706} 715}
707 716
708static int 717static __be32
709nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) 718nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
710{ 719{
711 DECODE_HEAD; 720 DECODE_HEAD;
@@ -721,7 +730,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
721 DECODE_TAIL; 730 DECODE_TAIL;
722} 731}
723 732
724static int 733static __be32
725nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) 734nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
726{ 735{
727 DECODE_HEAD; 736 DECODE_HEAD;
@@ -736,7 +745,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
736 DECODE_TAIL; 745 DECODE_TAIL;
737} 746}
738 747
739static int 748static __be32
740nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) 749nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
741{ 750{
742 DECODE_HEAD; 751 DECODE_HEAD;
@@ -750,7 +759,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
750 DECODE_TAIL; 759 DECODE_TAIL;
751} 760}
752 761
753static int 762static __be32
754nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) 763nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
755{ 764{
756 DECODE_HEAD; 765 DECODE_HEAD;
@@ -766,7 +775,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
766 DECODE_TAIL; 775 DECODE_TAIL;
767} 776}
768 777
769static int 778static __be32
770nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) 779nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
771{ 780{
772 DECODE_HEAD; 781 DECODE_HEAD;
@@ -781,7 +790,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
781 DECODE_TAIL; 790 DECODE_TAIL;
782} 791}
783 792
784static int 793static __be32
785nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) 794nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
786{ 795{
787 DECODE_HEAD; 796 DECODE_HEAD;
@@ -801,7 +810,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
801 DECODE_TAIL; 810 DECODE_TAIL;
802} 811}
803 812
804static int 813static __be32
805nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid) 814nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
806{ 815{
807 DECODE_HEAD; 816 DECODE_HEAD;
@@ -812,7 +821,7 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
812 DECODE_TAIL; 821 DECODE_TAIL;
813} 822}
814 823
815static int 824static __be32
816nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 825nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
817{ 826{
818 DECODE_HEAD; 827 DECODE_HEAD;
@@ -826,7 +835,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
826 DECODE_TAIL; 835 DECODE_TAIL;
827} 836}
828 837
829static int 838static __be32
830nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) 839nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
831{ 840{
832 DECODE_HEAD; 841 DECODE_HEAD;
@@ -851,7 +860,7 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
851 DECODE_TAIL; 860 DECODE_TAIL;
852} 861}
853 862
854static int 863static __be32
855nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) 864nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
856{ 865{
857 DECODE_HEAD; 866 DECODE_HEAD;
@@ -864,7 +873,7 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
864} 873}
865 874
866/* Also used for NVERIFY */ 875/* Also used for NVERIFY */
867static int 876static __be32
868nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) 877nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
869{ 878{
870#if 0 879#if 0
@@ -900,7 +909,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
900 DECODE_TAIL; 909 DECODE_TAIL;
901} 910}
902 911
903static int 912static __be32
904nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) 913nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
905{ 914{
906 int avail; 915 int avail;
@@ -926,32 +935,32 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
926 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 935 printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__);
927 goto xdr_error; 936 goto xdr_error;
928 } 937 }
929 write->wr_vec[0].iov_base = p; 938 argp->rqstp->rq_vec[0].iov_base = p;
930 write->wr_vec[0].iov_len = avail; 939 argp->rqstp->rq_vec[0].iov_len = avail;
931 v = 0; 940 v = 0;
932 len = write->wr_buflen; 941 len = write->wr_buflen;
933 while (len > write->wr_vec[v].iov_len) { 942 while (len > argp->rqstp->rq_vec[v].iov_len) {
934 len -= write->wr_vec[v].iov_len; 943 len -= argp->rqstp->rq_vec[v].iov_len;
935 v++; 944 v++;
936 write->wr_vec[v].iov_base = page_address(argp->pagelist[0]); 945 argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
937 argp->pagelist++; 946 argp->pagelist++;
938 if (argp->pagelen >= PAGE_SIZE) { 947 if (argp->pagelen >= PAGE_SIZE) {
939 write->wr_vec[v].iov_len = PAGE_SIZE; 948 argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
940 argp->pagelen -= PAGE_SIZE; 949 argp->pagelen -= PAGE_SIZE;
941 } else { 950 } else {
942 write->wr_vec[v].iov_len = argp->pagelen; 951 argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
943 argp->pagelen -= len; 952 argp->pagelen -= len;
944 } 953 }
945 } 954 }
946 argp->end = (u32*) (write->wr_vec[v].iov_base + write->wr_vec[v].iov_len); 955 argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
947 argp->p = (u32*) (write->wr_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); 956 argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
948 write->wr_vec[v].iov_len = len; 957 argp->rqstp->rq_vec[v].iov_len = len;
949 write->wr_vlen = v+1; 958 write->wr_vlen = v+1;
950 959
951 DECODE_TAIL; 960 DECODE_TAIL;
952} 961}
953 962
954static int 963static __be32
955nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) 964nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
956{ 965{
957 DECODE_HEAD; 966 DECODE_HEAD;
@@ -965,7 +974,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
965 DECODE_TAIL; 974 DECODE_TAIL;
966} 975}
967 976
968static int 977static __be32
969nfsd4_decode_compound(struct nfsd4_compoundargs *argp) 978nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
970{ 979{
971 DECODE_HEAD; 980 DECODE_HEAD;
@@ -1171,7 +1180,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1171 * task to translate them into Linux-specific versions which are more 1180 * task to translate them into Linux-specific versions which are more
1172 * consistent with the style used in NFSv2/v3... 1181 * consistent with the style used in NFSv2/v3...
1173 */ 1182 */
1174#define ENCODE_HEAD u32 *p 1183#define ENCODE_HEAD __be32 *p
1175 1184
1176#define WRITE32(n) *p++ = htonl(n) 1185#define WRITE32(n) *p++ = htonl(n)
1177#define WRITE64(n) do { \ 1186#define WRITE64(n) do { \
@@ -1201,8 +1210,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1201 * Header routine to setup seqid operation replay cache 1210 * Header routine to setup seqid operation replay cache
1202 */ 1211 */
1203#define ENCODE_SEQID_OP_HEAD \ 1212#define ENCODE_SEQID_OP_HEAD \
1204 u32 *p; \ 1213 __be32 *p; \
1205 u32 *save; \ 1214 __be32 *save; \
1206 \ 1215 \
1207 save = resp->p; 1216 save = resp->p;
1208 1217
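ENCODE_HEAD declares the cursor p over the __be32 response buffer, and WRITE32/WRITE64 advance it while storing network-order words; WRITE64's body is elided by the hunk above, but XDR's hyper encoding stores the high word first. A compilable userspace sketch of the same cursor pattern (buffer size and values are illustrative):

/* Userspace sketch of the ENCODE_HEAD/WRITE32/WRITE64 cursor pattern.
 * Assumption: WRITE64 emits the high word first, as XDR's big-endian
 * hyper encoding requires.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define WRITE32(n) (*p++ = htonl((uint32_t)(n)))
#define WRITE64(n) do {                          \
        *p++ = htonl((uint32_t)((n) >> 32));     \
        *p++ = htonl((uint32_t)(n));             \
} while (0)

int main(void)
{
        uint32_t buf[4];        /* stands in for the __be32 response page */
        uint32_t *p = buf;      /* the cursor that ENCODE_HEAD declares */

        WRITE32(2);                         /* e.g. a bitmap length */
        WRITE64(0x0000000100000002ULL);     /* e.g. a 64-bit changeid */
        printf("%ld words encoded, first word 0x%08x on the wire\n",
               (long)(p - buf), (unsigned)buf[0]);
        return 0;
}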
@@ -1223,6 +1232,120 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1223 stateowner->so_replay.rp_buflen); \ 1232 stateowner->so_replay.rp_buflen); \
1224 } } while (0); 1233 } } while (0);
1225 1234
1235/* Encode the given string as an array of strings, with components
1236 * separated by @sep.
1237 */
1238static __be32 nfsd4_encode_components(char sep, char *components,
1239 __be32 **pp, int *buflen)
1240{
1241 __be32 *p = *pp;
1242 __be32 *countp = p;
1243 int strlen, count=0;
1244 char *str, *end;
1245
1246 dprintk("nfsd4_encode_components(%s)\n", components);
1247 if ((*buflen -= 4) < 0)
1248 return nfserr_resource;
1249 WRITE32(0); /* We will fill this in with @count later */
1250 end = str = components;
1251 while (*end) {
1252 for (; *end && (*end != sep); end++)
1253 ; /* Point to end of component */
1254 strlen = end - str;
1255 if (strlen) {
1256 if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
1257 return nfserr_resource;
1258 WRITE32(strlen);
1259 WRITEMEM(str, strlen);
1260 count++;
1261 }
1262 else
1263 end++;
1264 str = end;
1265 }
1266 *pp = p;
1267 p = countp;
1268 WRITE32(count);
1269 return 0;
1270}
1271
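nfsd4_encode_components() walks the string once, emits every non-empty component as a counted XDR string, and backfills the leading count word after the walk. A userspace sketch of the same loop, where the hypothetical emit() helper stands in for the kernel's WRITE32/WRITEMEM into the response buffer:

/* Userspace sketch of nfsd4_encode_components(): split on @sep, emit
 * each non-empty component as a counted string, return the count that
 * the kernel backfills into the leading word. emit() is hypothetical;
 * the kernel writes via WRITE32/WRITEMEM instead.
 */
#include <stdio.h>
#include <string.h>

static void emit(const char *s, int len) { printf("  [%.*s]\n", len, s); }

static int encode_components(char sep, const char *components)
{
        const char *str = components, *end = components;
        int count = 0;

        while (*end) {
                while (*end && *end != sep)
                        end++;              /* point past the component */
                if (end != str) {
                        emit(str, (int)(end - str));
                        count++;
                } else {
                        end++;              /* skip an empty component */
                }
                str = end;
        }
        return count;
}

int main(void)
{
        printf("count=%d\n", encode_components('/', "/exports//vol0"));
        return 0;
}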
1272/*
1273 * encode a location element of a fs_locations structure
1274 */
1275static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1276 __be32 **pp, int *buflen)
1277{
1278 __be32 status;
1279 __be32 *p = *pp;
1280
1281 status = nfsd4_encode_components(':', location->hosts, &p, buflen);
1282 if (status)
1283 return status;
1284 status = nfsd4_encode_components('/', location->path, &p, buflen);
1285 if (status)
1286 return status;
1287 *pp = p;
1288 return 0;
1289}
1290
1291/*
1292 * Return the path to an export point in the pseudo filesystem namespace.
1293 * The returned string is safe to use as long as the caller holds a reference
1294 * to @exp.
1295 */
1296static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
1297{
1298 struct svc_fh tmp_fh;
1299 char *path, *rootpath;
1300
1301 fh_init(&tmp_fh, NFS4_FHSIZE);
1302 *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle);
1303 if (*stat)
1304 return NULL;
1305 rootpath = tmp_fh.fh_export->ex_path;
1306
1307 path = exp->ex_path;
1308
1309 if (strncmp(path, rootpath, strlen(rootpath))) {
1310 printk("nfsd: fs_locations failed;"
1311 "%s is not contained in %s\n", path, rootpath);
1312 *stat = nfserr_notsupp;
1313 return NULL;
1314 }
1315
1316 return path + strlen(rootpath);
1317}
1318
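nfsd4_path() accepts an export only when its path textually extends the pseudoroot's path, and hands back the suffix past the root. A small sketch of that check; the example strings are hypothetical (root "/exports", export "/exports/vol0" yields "/vol0"):

/* Sketch of the containment check in nfsd4_path(): the export path
 * must extend the pseudoroot path, and the suffix past the root is
 * returned. The example strings below are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static const char *pseudo_path(const char *rootpath, const char *path)
{
        if (strncmp(path, rootpath, strlen(rootpath)))
                return NULL;        /* not contained in the pseudoroot */
        return path + strlen(rootpath);
}

int main(void)
{
        const char *p = pseudo_path("/exports", "/exports/vol0");

        printf("%s\n", p ? p : "(not contained)");  /* prints "/vol0" */
        return 0;
}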
1319/*
1320 * encode a fs_locations structure
1321 */
1322static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
1323 struct svc_export *exp,
1324 __be32 **pp, int *buflen)
1325{
1326 __be32 status;
1327 int i;
1328 __be32 *p = *pp;
1329 struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
1330 char *root = nfsd4_path(rqstp, exp, &status);
1331
1332 if (status)
1333 return status;
1334 status = nfsd4_encode_components('/', root, &p, buflen);
1335 if (status)
1336 return status;
1337 if ((*buflen -= 4) < 0)
1338 return nfserr_resource;
1339 WRITE32(fslocs->locations_count);
1340 for (i=0; i<fslocs->locations_count; i++) {
1341 status = nfsd4_encode_fs_location4(&fslocs->locations[i],
1342 &p, buflen);
1343 if (status)
1344 return status;
1345 }
1346 *pp = p;
1347 return 0;
1348}
1226 1349
1227static u32 nfs4_ftypes[16] = { 1350static u32 nfs4_ftypes[16] = {
1228 NF4BAD, NF4FIFO, NF4CHR, NF4BAD, 1351 NF4BAD, NF4FIFO, NF4CHR, NF4BAD,
@@ -1231,9 +1354,9 @@ static u32 nfs4_ftypes[16] = {
1231 NF4SOCK, NF4BAD, NF4LNK, NF4BAD, 1354 NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
1232}; 1355};
1233 1356
1234static int 1357static __be32
1235nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1358nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1236 u32 **p, int *buflen) 1359 __be32 **p, int *buflen)
1237{ 1360{
1238 int status; 1361 int status;
1239 1362
@@ -1253,25 +1376,44 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1253 return 0; 1376 return 0;
1254} 1377}
1255 1378
1256static inline int 1379static inline __be32
1257nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen) 1380nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
1258{ 1381{
1259 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); 1382 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
1260} 1383}
1261 1384
1262static inline int 1385static inline __be32
1263nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen) 1386nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
1264{ 1387{
1265 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); 1388 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
1266} 1389}
1267 1390
1268static inline int 1391static inline __be32
1269nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, 1392nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1270 u32 **p, int *buflen) 1393 __be32 **p, int *buflen)
1271{ 1394{
1272 return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); 1395 return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
1273} 1396}
1274 1397
1398#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
1399 FATTR4_WORD0_RDATTR_ERROR)
1400#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
1401
1402static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
1403{
1404 /* As per referral draft: */
1405 if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
1406 *bmval1 & ~WORD1_ABSENT_FS_ATTRS) {
1407 if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR ||
1408 *bmval0 & FATTR4_WORD0_FS_LOCATIONS)
1409 *rdattr_err = NFSERR_MOVED;
1410 else
1411 return nfserr_moved;
1412 }
1413 *bmval0 &= WORD0_ABSENT_FS_ATTRS;
1414 *bmval1 &= WORD1_ABSENT_FS_ATTRS;
1415 return 0;
1416}
1275 1417
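fattr_handle_absent_fs() enforces the referral draft's rule quoted earlier: on an absent filesystem only fsid, fs_locations, rdattr_error (plus mounted_on_fileid in word 1) may be answered; any other requested attribute produces NFS4ERR_MOVED, delivered through rdattr_error when the client requested that attribute or fs_locations, and as the operation's status otherwise. A sketch of that policy with illustrative bit values:

/* Sketch of the absent-fs attribute policy above. Bit values are
 * illustrative, not the real FATTR4_WORD0_* masks; NFS4ERR_MOVED is
 * 10019 per RFC 3530.
 */
#include <stdio.h>
#include <stdint.h>

#define FSID          0x01u
#define RDATTR_ERROR  0x02u
#define FS_LOCATIONS  0x04u
#define ABSENT_OK     (FSID | RDATTR_ERROR | FS_LOCATIONS)
#define NFSERR_MOVED  10019

static int handle_absent_fs(uint32_t *bmval, uint32_t *rdattr_err)
{
        if (*bmval & ~ABSENT_OK) {
                if (*bmval & (RDATTR_ERROR | FS_LOCATIONS))
                        *rdattr_err = NFSERR_MOVED;  /* report per attribute */
                else
                        return NFSERR_MOVED;         /* fail the whole GETATTR */
        }
        *bmval &= ABSENT_OK;    /* never answer the disallowed attributes */
        return 0;
}

int main(void)
{
        uint32_t bm = FSID | RDATTR_ERROR | 0x80u, rdattr = 0;
        int status = handle_absent_fs(&bm, &rdattr);

        printf("status=%d bmval=0x%x rdattr_err=%u\n",
               status, (unsigned)bm, (unsigned)rdattr);
        return 0;
}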
1276/* 1418/*
1277 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle 1419 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
@@ -1280,9 +1422,9 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
1280 * @countp is the buffer size in _words_; upon successful return this becomes 1422 * @countp is the buffer size in _words_; upon successful return this becomes
1281 * replaced with the number of words written. 1423 * replaced with the number of words written.
1282 */ 1424 */
1283int 1425__be32
1284nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 1426nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1285 struct dentry *dentry, u32 *buffer, int *countp, u32 *bmval, 1427 struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
1286 struct svc_rqst *rqstp) 1428 struct svc_rqst *rqstp)
1287{ 1429{
1288 u32 bmval0 = bmval[0]; 1430 u32 bmval0 = bmval[0];
@@ -1291,11 +1433,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1291 struct svc_fh tempfh; 1433 struct svc_fh tempfh;
1292 struct kstatfs statfs; 1434 struct kstatfs statfs;
1293 int buflen = *countp << 2; 1435 int buflen = *countp << 2;
1294 u32 *attrlenp; 1436 __be32 *attrlenp;
1295 u32 dummy; 1437 u32 dummy;
1296 u64 dummy64; 1438 u64 dummy64;
1297 u32 *p = buffer; 1439 u32 rdattr_err = 0;
1298 int status; 1440 __be32 *p = buffer;
1441 __be32 status;
1442 int err;
1299 int aclsupport = 0; 1443 int aclsupport = 0;
1300 struct nfs4_acl *acl = NULL; 1444 struct nfs4_acl *acl = NULL;
1301 1445
@@ -1303,14 +1447,20 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1303 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1447 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
1304 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1448 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
1305 1449
1306 status = vfs_getattr(exp->ex_mnt, dentry, &stat); 1450 if (exp->ex_fslocs.migrated) {
1307 if (status) 1451 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1452 if (status)
1453 goto out;
1454 }
1455
1456 err = vfs_getattr(exp->ex_mnt, dentry, &stat);
1457 if (err)
1308 goto out_nfserr; 1458 goto out_nfserr;
1309 if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || 1459 if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
1310 (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | 1460 (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
1311 FATTR4_WORD1_SPACE_TOTAL))) { 1461 FATTR4_WORD1_SPACE_TOTAL))) {
1312 status = vfs_statfs(dentry, &statfs); 1462 err = vfs_statfs(dentry, &statfs);
1313 if (status) 1463 if (err)
1314 goto out_nfserr; 1464 goto out_nfserr;
1315 } 1465 }
1316 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { 1466 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
@@ -1322,18 +1472,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1322 } 1472 }
1323 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT 1473 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
1324 | FATTR4_WORD0_SUPPORTED_ATTRS)) { 1474 | FATTR4_WORD0_SUPPORTED_ATTRS)) {
1325 status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); 1475 err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
1326 aclsupport = (status == 0); 1476 aclsupport = (err == 0);
1327 if (bmval0 & FATTR4_WORD0_ACL) { 1477 if (bmval0 & FATTR4_WORD0_ACL) {
1328 if (status == -EOPNOTSUPP) 1478 if (err == -EOPNOTSUPP)
1329 bmval0 &= ~FATTR4_WORD0_ACL; 1479 bmval0 &= ~FATTR4_WORD0_ACL;
1330 else if (status == -EINVAL) { 1480 else if (err == -EINVAL) {
1331 status = nfserr_attrnotsupp; 1481 status = nfserr_attrnotsupp;
1332 goto out; 1482 goto out;
1333 } else if (status != 0) 1483 } else if (err != 0)
1334 goto out_nfserr; 1484 goto out_nfserr;
1335 } 1485 }
1336 } 1486 }
1487 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1488 if (exp->ex_fslocs.locations == NULL) {
1489 bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1490 }
1491 }
1337 if ((buflen -= 16) < 0) 1492 if ((buflen -= 16) < 0)
1338 goto out_resource; 1493 goto out_resource;
1339 1494
@@ -1343,12 +1498,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1343 attrlenp = p++; /* to be backfilled later */ 1498 attrlenp = p++; /* to be backfilled later */
1344 1499
1345 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1500 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1501 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
1346 if ((buflen -= 12) < 0) 1502 if ((buflen -= 12) < 0)
1347 goto out_resource; 1503 goto out_resource;
1504 if (!aclsupport)
1505 word0 &= ~FATTR4_WORD0_ACL;
1506 if (!exp->ex_fslocs.locations)
1507 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1348 WRITE32(2); 1508 WRITE32(2);
1349 WRITE32(aclsupport ? 1509 WRITE32(word0);
1350 NFSD_SUPPORTED_ATTRS_WORD0 :
1351 NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
1352 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1510 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
1353 } 1511 }
1354 if (bmval0 & FATTR4_WORD0_TYPE) { 1512 if (bmval0 & FATTR4_WORD0_TYPE) {
@@ -1402,7 +1560,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1402 if (bmval0 & FATTR4_WORD0_FSID) { 1560 if (bmval0 & FATTR4_WORD0_FSID) {
1403 if ((buflen -= 16) < 0) 1561 if ((buflen -= 16) < 0)
1404 goto out_resource; 1562 goto out_resource;
1405 if (is_fsid(fhp, rqstp->rq_reffh)) { 1563 if (exp->ex_fslocs.migrated) {
1564 WRITE64(NFS4_REFERRAL_FSID_MAJOR);
1565 WRITE64(NFS4_REFERRAL_FSID_MINOR);
1566 } else if (is_fsid(fhp, rqstp->rq_reffh)) {
1406 WRITE64((u64)exp->ex_fsid); 1567 WRITE64((u64)exp->ex_fsid);
1407 WRITE64((u64)0); 1568 WRITE64((u64)0);
1408 } else { 1569 } else {
@@ -1425,7 +1586,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 1586 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
1426 if ((buflen -= 4) < 0) 1587 if ((buflen -= 4) < 0)
1427 goto out_resource; 1588 goto out_resource;
1428 WRITE32(0); 1589 WRITE32(rdattr_err);
1429 } 1590 }
1430 if (bmval0 & FATTR4_WORD0_ACL) { 1591 if (bmval0 & FATTR4_WORD0_ACL) {
1431 struct nfs4_ace *ace; 1592 struct nfs4_ace *ace;
@@ -1513,6 +1674,13 @@ out_acl:
1513 goto out_resource; 1674 goto out_resource;
1514 WRITE64((u64) statfs.f_files); 1675 WRITE64((u64) statfs.f_files);
1515 } 1676 }
1677 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1678 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
1679 if (status == nfserr_resource)
1680 goto out_resource;
1681 if (status)
1682 goto out;
1683 }
1516 if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { 1684 if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
1517 if ((buflen -= 4) < 0) 1685 if ((buflen -= 4) < 0)
1518 goto out_resource; 1686 goto out_resource;
@@ -1536,12 +1704,12 @@ out_acl:
1536 if (bmval0 & FATTR4_WORD0_MAXREAD) { 1704 if (bmval0 & FATTR4_WORD0_MAXREAD) {
1537 if ((buflen -= 8) < 0) 1705 if ((buflen -= 8) < 0)
1538 goto out_resource; 1706 goto out_resource;
1539 WRITE64((u64) NFSSVC_MAXBLKSIZE); 1707 WRITE64((u64) svc_max_payload(rqstp));
1540 } 1708 }
1541 if (bmval0 & FATTR4_WORD0_MAXWRITE) { 1709 if (bmval0 & FATTR4_WORD0_MAXWRITE) {
1542 if ((buflen -= 8) < 0) 1710 if ((buflen -= 8) < 0)
1543 goto out_resource; 1711 goto out_resource;
1544 WRITE64((u64) NFSSVC_MAXBLKSIZE); 1712 WRITE64((u64) svc_max_payload(rqstp));
1545 } 1713 }
1546 if (bmval1 & FATTR4_WORD1_MODE) { 1714 if (bmval1 & FATTR4_WORD1_MODE) {
1547 if ((buflen -= 4) < 0) 1715 if ((buflen -= 4) < 0)
@@ -1652,7 +1820,7 @@ out:
1652 fh_put(&tempfh); 1820 fh_put(&tempfh);
1653 return status; 1821 return status;
1654out_nfserr: 1822out_nfserr:
1655 status = nfserrno(status); 1823 status = nfserrno(err);
1656 goto out; 1824 goto out;
1657out_resource: 1825out_resource:
1658 *countp = 0; 1826 *countp = 0;
@@ -1663,13 +1831,13 @@ out_serverfault:
1663 goto out; 1831 goto out;
1664} 1832}
1665 1833
1666static int 1834static __be32
1667nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, 1835nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
1668 const char *name, int namlen, u32 *p, int *buflen) 1836 const char *name, int namlen, __be32 *p, int *buflen)
1669{ 1837{
1670 struct svc_export *exp = cd->rd_fhp->fh_export; 1838 struct svc_export *exp = cd->rd_fhp->fh_export;
1671 struct dentry *dentry; 1839 struct dentry *dentry;
1672 int nfserr; 1840 __be32 nfserr;
1673 1841
1674 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 1842 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
1675 if (IS_ERR(dentry)) 1843 if (IS_ERR(dentry))
@@ -1698,10 +1866,10 @@ out_put:
1698 return nfserr; 1866 return nfserr;
1699} 1867}
1700 1868
1701static u32 * 1869static __be32 *
1702nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr) 1870nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
1703{ 1871{
1704 u32 *attrlenp; 1872 __be32 *attrlenp;
1705 1873
1706 if (buflen < 6) 1874 if (buflen < 6)
1707 return NULL; 1875 return NULL;
@@ -1721,8 +1889,8 @@ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
1721{ 1889{
1722 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); 1890 struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
1723 int buflen; 1891 int buflen;
1724 u32 *p = cd->buffer; 1892 __be32 *p = cd->buffer;
1725 int nfserr = nfserr_toosmall; 1893 __be32 nfserr = nfserr_toosmall;
1726 1894
1727 /* In nfsv4, "." and ".." never make it onto the wire.. */ 1895 /* In nfsv4, "." and ".." never make it onto the wire.. */
1728 if (name && isdotent(name, namlen)) { 1896 if (name && isdotent(name, namlen)) {
@@ -1778,7 +1946,7 @@ fail:
1778} 1946}
1779 1947
1780static void 1948static void
1781nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_access *access) 1949nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
1782{ 1950{
1783 ENCODE_HEAD; 1951 ENCODE_HEAD;
1784 1952
@@ -1791,7 +1959,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_acc
1791} 1959}
1792 1960
1793static void 1961static void
1794nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_close *close) 1962nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
1795{ 1963{
1796 ENCODE_SEQID_OP_HEAD; 1964 ENCODE_SEQID_OP_HEAD;
1797 1965
@@ -1806,7 +1974,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_clos
1806 1974
1807 1975
1808static void 1976static void
1809nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_commit *commit) 1977nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
1810{ 1978{
1811 ENCODE_HEAD; 1979 ENCODE_HEAD;
1812 1980
@@ -1818,7 +1986,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_com
1818} 1986}
1819 1987
1820static void 1988static void
1821nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_create *create) 1989nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
1822{ 1990{
1823 ENCODE_HEAD; 1991 ENCODE_HEAD;
1824 1992
@@ -1832,8 +2000,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_cre
1832 } 2000 }
1833} 2001}
1834 2002
1835static int 2003static __be32
1836nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_getattr *getattr) 2004nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
1837{ 2005{
1838 struct svc_fh *fhp = getattr->ga_fhp; 2006 struct svc_fh *fhp = getattr->ga_fhp;
1839 int buflen; 2007 int buflen;
@@ -1845,14 +2013,13 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ge
1845 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, 2013 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
1846 resp->p, &buflen, getattr->ga_bmval, 2014 resp->p, &buflen, getattr->ga_bmval,
1847 resp->rqstp); 2015 resp->rqstp);
1848
1849 if (!nfserr) 2016 if (!nfserr)
1850 resp->p += buflen; 2017 resp->p += buflen;
1851 return nfserr; 2018 return nfserr;
1852} 2019}
1853 2020
1854static void 2021static void
1855nfsd4_encode_getfh(struct nfsd4_compoundres *resp, int nfserr, struct svc_fh *fhp) 2022nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
1856{ 2023{
1857 unsigned int len; 2024 unsigned int len;
1858 ENCODE_HEAD; 2025 ENCODE_HEAD;
@@ -1892,7 +2059,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
1892} 2059}
1893 2060
1894static void 2061static void
1895nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock) 2062nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
1896{ 2063{
1897 ENCODE_SEQID_OP_HEAD; 2064 ENCODE_SEQID_OP_HEAD;
1898 2065
@@ -1908,14 +2075,14 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
1908} 2075}
1909 2076
1910static void 2077static void
1911nfsd4_encode_lockt(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lockt *lockt) 2078nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
1912{ 2079{
1913 if (nfserr == nfserr_denied) 2080 if (nfserr == nfserr_denied)
1914 nfsd4_encode_lock_denied(resp, &lockt->lt_denied); 2081 nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
1915} 2082}
1916 2083
1917static void 2084static void
1918nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_locku *locku) 2085nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
1919{ 2086{
1920 ENCODE_SEQID_OP_HEAD; 2087 ENCODE_SEQID_OP_HEAD;
1921 2088
@@ -1931,7 +2098,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
1931 2098
1932 2099
1933static void 2100static void
1934nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link *link) 2101nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
1935{ 2102{
1936 ENCODE_HEAD; 2103 ENCODE_HEAD;
1937 2104
@@ -1944,7 +2111,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link
1944 2111
1945 2112
1946static void 2113static void
1947nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open *open) 2114nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
1948{ 2115{
1949 ENCODE_SEQID_OP_HEAD; 2116 ENCODE_SEQID_OP_HEAD;
1950 2117
@@ -2009,7 +2176,7 @@ out:
2009} 2176}
2010 2177
2011static void 2178static void
2012nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_confirm *oc) 2179nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
2013{ 2180{
2014 ENCODE_SEQID_OP_HEAD; 2181 ENCODE_SEQID_OP_HEAD;
2015 2182
@@ -2024,7 +2191,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfs
2024} 2191}
2025 2192
2026static void 2193static void
2027nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_downgrade *od) 2194nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
2028{ 2195{
2029 ENCODE_SEQID_OP_HEAD; 2196 ENCODE_SEQID_OP_HEAD;
2030 2197
@@ -2038,8 +2205,9 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct n
2038 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2205 ENCODE_SEQID_OP_TAIL(od->od_stateowner);
2039} 2206}
2040 2207
2041static int 2208static __be32
2042nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read *read) 2209nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2210 struct nfsd4_read *read)
2043{ 2211{
2044 u32 eof; 2212 u32 eof;
2045 int v, pn; 2213 int v, pn;
@@ -2054,31 +2222,33 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
2054 2222
2055 RESERVE_SPACE(8); /* eof flag and byte count */ 2223 RESERVE_SPACE(8); /* eof flag and byte count */
2056 2224
2057 maxcount = NFSSVC_MAXBLKSIZE; 2225 maxcount = svc_max_payload(resp->rqstp);
2058 if (maxcount > read->rd_length) 2226 if (maxcount > read->rd_length)
2059 maxcount = read->rd_length; 2227 maxcount = read->rd_length;
2060 2228
2061 len = maxcount; 2229 len = maxcount;
2062 v = 0; 2230 v = 0;
2063 while (len > 0) { 2231 while (len > 0) {
2064 pn = resp->rqstp->rq_resused; 2232 pn = resp->rqstp->rq_resused++;
2065 svc_take_page(resp->rqstp); 2233 resp->rqstp->rq_vec[v].iov_base =
2066 read->rd_iov[v].iov_base = page_address(resp->rqstp->rq_respages[pn]); 2234 page_address(resp->rqstp->rq_respages[pn]);
2067 read->rd_iov[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE; 2235 resp->rqstp->rq_vec[v].iov_len =
2236 len < PAGE_SIZE ? len : PAGE_SIZE;
2068 v++; 2237 v++;
2069 len -= PAGE_SIZE; 2238 len -= PAGE_SIZE;
2070 } 2239 }
2071 read->rd_vlen = v; 2240 read->rd_vlen = v;
2072 2241
2073 nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, 2242 nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
2074 read->rd_offset, read->rd_iov, read->rd_vlen, 2243 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
2075 &maxcount); 2244 &maxcount);
2076 2245
2077 if (nfserr == nfserr_symlink) 2246 if (nfserr == nfserr_symlink)
2078 nfserr = nfserr_inval; 2247 nfserr = nfserr_inval;
2079 if (nfserr) 2248 if (nfserr)
2080 return nfserr; 2249 return nfserr;
2081 eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); 2250 eof = (read->rd_offset + maxcount >=
2251 read->rd_fhp->fh_dentry->d_inode->i_size);
2082 2252
2083 WRITE32(eof); 2253 WRITE32(eof);
2084 WRITE32(maxcount); 2254 WRITE32(maxcount);
@@ -2088,7 +2258,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
2088 resp->xbuf->page_len = maxcount; 2258 resp->xbuf->page_len = maxcount;
2089 2259
2090 /* Use rest of head for padding and remaining ops: */ 2260 /* Use rest of head for padding and remaining ops: */
2091 resp->rqstp->rq_restailpage = 0;
2092 resp->xbuf->tail[0].iov_base = p; 2261 resp->xbuf->tail[0].iov_base = p;
2093 resp->xbuf->tail[0].iov_len = 0; 2262 resp->xbuf->tail[0].iov_len = 0;
2094 if (maxcount&3) { 2263 if (maxcount&3) {
@@ -2101,8 +2270,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 	return 0;
 }
 
-static int
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readlink *readlink)
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
 {
 	int maxcount;
 	char *page;
@@ -2113,8 +2282,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	if (resp->xbuf->page_len)
 		return nfserr_resource;
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 
 	maxcount = PAGE_SIZE;
 	RESERVE_SPACE(4);
@@ -2138,7 +2306,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	resp->xbuf->page_len = maxcount;
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
 	if (maxcount&3) {
@@ -2151,12 +2318,12 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	return 0;
 }
 
-static int
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readdir *readdir)
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
 {
 	int maxcount;
 	loff_t offset;
-	u32 *page, *savep, *tailbase;
+	__be32 *page, *savep, *tailbase;
 	ENCODE_HEAD;
 
 	if (nfserr)
@@ -2189,8 +2356,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 		goto err_no_verf;
 	}
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 	readdir->common.err = 0;
 	readdir->buflen = maxcount;
 	readdir->buffer = page;
@@ -2215,10 +2381,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 	p = readdir->buffer;
 	*p++ = 0;	/* no more entries */
 	*p++ = htonl(readdir->common.err == nfserr_eof);
-	resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	resp->xbuf->page_len = ((char*)p) - (char*)page_address(
+		resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = tailbase;
 	resp->xbuf->tail[0].iov_len = 0;
 	resp->p = resp->xbuf->tail[0].iov_base;
@@ -2232,7 +2398,7 @@ err_no_verf:
 }
 
 static void
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
 	ENCODE_HEAD;
 
@@ -2244,7 +2410,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rem
 }
 
 static void
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
 	ENCODE_HEAD;
 
@@ -2261,7 +2427,7 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ren
  * regardless of the error status.
  */
 static void
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
 	ENCODE_HEAD;
 
@@ -2280,7 +2446,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_se
 }
 
 static void
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
 	ENCODE_HEAD;
 
@@ -2299,7 +2465,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd
 }
 
 static void
-nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
 	ENCODE_HEAD;
 
@@ -2315,7 +2481,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_writ
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
-	u32 *statp;
+	__be32 *statp;
 	ENCODE_HEAD;
 
 	RESERVE_SPACE(8);
@@ -2453,7 +2619,7 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
  */
 
 int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
@@ -2475,9 +2641,9 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
 }
 
 int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
 {
-	int status;
+	__be32 status;
 
 	args->p = p;
 	args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
@@ -2496,7 +2662,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoun
 }
 
 int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundres *resp)
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp)
 {
 	/*
 	 * All that remains is to write the tag and operation count...
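
Both encoders above finish by padding the page data out to a 4-byte XDR boundary through the tail iovec: whenever maxcount&3 is non-zero, 4 - (maxcount&3) zero bytes follow the data. A standalone sketch of that alignment arithmetic (plain userspace C, not kernel code; the buffer and names are illustrative):

	#include <stdio.h>
	#include <string.h>

	/* Pad a payload of 'len' bytes out to the next 4-byte XDR boundary.
	 * Returns the number of pad bytes written (0..3), mirroring the
	 * 4 - (len & 3) computation in the readlink/readdir encoders. */
	static int xdr_pad(unsigned char *tail, int len)
	{
		int pad = 0;

		if (len & 3) {
			pad = 4 - (len & 3);
			memset(tail, 0, pad);	/* XDR pad bytes are zero */
		}
		return pad;
	}

	int main(void)
	{
		unsigned char tail[4];
		int len;

		for (len = 5; len <= 8; len++)
			printf("payload %d -> %d pad byte(s)\n",
			       len, xdr_pad(tail, len));
		return 0;
	}

Every variable-length XDR opaque is carried this way, which is why the same computation shows up again in the NFSv2 encoders in fs/nfsd/nfsxdr.c below.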
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fdf7cf3dfadc..6100bbe27432 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,7 +29,7 @@
  */
 #define CACHESIZE		1024
 #define HASHSIZE		64
-#define REQHASH(xid)	((((xid) >> 24) ^ (xid)) & (HASHSIZE-1))
+#define REQHASH(xid)	(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
 
 static struct hlist_head *	hash_list;
 static struct list_head 	lru_head;
@@ -127,8 +127,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
 	struct hlist_node	*hn;
 	struct hlist_head 	*rh;
 	struct svc_cacherep	*rp;
-	u32			xid = rqstp->rq_xid,
-				proto =  rqstp->rq_prot,
+	__be32			xid = rqstp->rq_xid;
+	u32			proto =  rqstp->rq_prot,
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc;
 	unsigned long		age;
@@ -258,7 +258,7 @@ found_entry:
  * In this case, nfsd_cache_update is called with statp == NULL.
  */
 void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp)
+nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
 	struct svc_cacherep	*rp;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
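
The __force casts in REQHASH() only tell sparse that stripping the byte-order annotation from the XID is intentional; the bucket arithmetic is unchanged. A userspace sketch of that hash, with sample XIDs invented for illustration:

	#include <stdio.h>
	#include <stdint.h>

	#define HASHSIZE	64
	#define REQHASH(xid)	((((xid) >> 24) ^ (xid)) & (HASHSIZE - 1))

	int main(void)
	{
		/* Many clients seed XIDs so most variation sits in one
		 * byte; XORing the top byte into the bottom keeps such
		 * streams from piling into a single bucket. */
		uint32_t xids[] = { 0x01000000, 0x02000000, 0x01000001 };
		unsigned int i;

		for (i = 0; i < sizeof(xids) / sizeof(xids[0]); i++)
			printf("xid %08x -> bucket %u\n", xids[i],
			       (unsigned)REQHASH(xids[i]));
		return 0;
	}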
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c6a477c20ec..39aed901514b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -57,6 +57,7 @@ enum {
 	NFSD_Pool_Threads,
 	NFSD_Versions,
 	NFSD_Ports,
+	NFSD_MaxBlkSize,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -82,6 +83,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
 static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
@@ -100,6 +102,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
 	[NFSD_Ports] = write_ports,
+	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
@@ -523,18 +526,20 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		err = nfsd_create_serv();
 		if (!err) {
 			int proto = 0;
-			err = lockd_up(proto);
-			if (!err) {
-				err = svc_addsock(nfsd_serv, fd, buf, &proto);
-				if (err)
-					lockd_down();
+			err = svc_addsock(nfsd_serv, fd, buf, &proto);
+			if (err >= 0) {
+				err = lockd_up(proto);
+				if (err < 0)
+					svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
 			}
 			/* Decrease the count, but don't shutdown the
 			 * the service
 			 */
+			lock_kernel();
 			nfsd_serv->sv_nrthreads--;
+			unlock_kernel();
 		}
-		return err;
+		return err < 0 ? err : 0;
 	}
 	if (buf[0] == '-') {
 		char *toclose = kstrdup(buf+1, GFP_KERNEL);
@@ -545,12 +550,43 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		if (nfsd_serv)
 			len = svc_sock_names(buf, nfsd_serv, toclose);
 		unlock_kernel();
+		if (len >= 0)
+			lockd_down();
 		kfree(toclose);
 		return len;
 	}
 	return -EINVAL;
 }
 
+int nfsd_max_blksize;
+
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	if (size > 0) {
+		int bsize;
+		int rv = get_int(&mesg, &bsize);
+		if (rv)
+			return rv;
+		/* force bsize into allowed range and
+		 * required alignment.
+		 */
+		if (bsize < 1024)
+			bsize = 1024;
+		if (bsize > NFSSVC_MAXBLKSIZE)
+			bsize = NFSSVC_MAXBLKSIZE;
+		bsize &= ~(1024-1);
+		lock_kernel();
+		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+			unlock_kernel();
+			return -EBUSY;
+		}
+		nfsd_max_blksize = bsize;
+		unlock_kernel();
+	}
+	return sprintf(buf, "%d\n", nfsd_max_blksize);
+}
+
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
@@ -616,6 +652,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
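
write_maxblksize() clamps the requested size into [1024, NFSSVC_MAXBLKSIZE] and then rounds down to a 1024-byte multiple with bsize &= ~(1024-1). A minimal sketch of that clamping (plain C; the 1MB ceiling here is an assumed stand-in for NFSSVC_MAXBLKSIZE):

	#include <stdio.h>

	#define MIN_BLKSIZE	1024
	#define MAX_BLKSIZE	(1024 * 1024)	/* illustrative ceiling */

	static int clamp_blksize(int bsize)
	{
		if (bsize < MIN_BLKSIZE)
			bsize = MIN_BLKSIZE;
		if (bsize > MAX_BLKSIZE)
			bsize = MAX_BLKSIZE;
		/* round down to the required 1K alignment */
		return bsize & ~(MIN_BLKSIZE - 1);
	}

	int main(void)
	{
		int requests[] = { 100, 1500, 4096, 9999999 };
		unsigned int i;

		for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++)
			printf("%d -> %d\n", requests[i],
			       clamp_blksize(requests[i]));
		return 0;
	}

The mask trick works because 1024 is a power of two, so ~(1024-1) clears exactly the low ten bits.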
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 501d83884530..727ab3bd450d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -76,7 +76,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
  * comment in the NFSv3 spec says this is incorrect (implementation notes for
  * the write call).
  */
-static inline int
+static inline __be32
 nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
 {
 	/* Type can be negative when creating hardlinks - not to a dir */
@@ -110,13 +110,13 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
  * This is only called at the start of an nfsproc call, so fhp points to
  * a svc_fh which is all 0 except for the over-the-wire file handle.
  */
-u32
+__be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 {
 	struct knfsd_fh	*fh = &fhp->fh_handle;
 	struct svc_export *exp = NULL;
 	struct dentry	*dentry;
-	u32		error = 0;
+	__be32		error = 0;
 
 	dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
 
@@ -315,7 +315,7 @@ static inline void _fh_update_old(struct dentry *dentry,
 		fh->ofh_dirino = 0;
 }
 
-int
+__be32
 fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
 {
 	/* ref_fh is a reference file handle.
@@ -451,7 +451,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
  * Update file handle information after changing a dentry.
  * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
 */
-int
+__be32
 fh_update(struct svc_fh *fhp)
 {
 	struct dentry *dentry;
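
The payoff of retyping fh_verify(), fh_compose() and fh_update() is only visible under sparse: __be32 is a __bitwise type, so mixing it with host-order integers draws a warning instead of silently compiling. A compilable illustration of the annotation pattern (when sparse is not running, the attributes expand to nothing, which is roughly what the kernel's own headers do; the type and function names here are invented):

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	#ifdef __CHECKER__			/* defined when sparse runs */
	#define __bitwise	__attribute__((bitwise))
	#define __force		__attribute__((force))
	#else
	#define __bitwise
	#define __force
	#endif

	typedef uint32_t __bitwise be32;	/* big-endian on the wire */

	static be32 make_status(uint32_t host_status)
	{
		/* the conversion point is explicit and greppable */
		return (__force be32)htonl(host_status);
	}

	int main(void)
	{
		be32 status = make_status(70);	/* an arbitrary status code */

		/* going back to host order again needs a visible cast */
		printf("host order: %u\n", ntohl((__force uint32_t)status));
		return 0;
	}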
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 06cd0db0f32b..ec983b777680 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -30,22 +30,22 @@ typedef struct svc_buf	svc_buf;
 #define NFSDDBG_FACILITY	NFSDDBG_PROC
 
 
-static int
+static __be32
 nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
 }
 
-static int
-nfsd_return_attrs(int err, struct nfsd_attrstat *resp)
+static __be32
+nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
 				    resp->fh.fh_dentry,
 				    &resp->stat));
 }
-static int
-nfsd_return_dirop(int err, struct nfsd_diropres *resp)
+static __be32
+nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
@@ -56,11 +56,11 @@ nfsd_return_dirop(int err, struct nfsd_diropres *resp)
  * Get a file's attributes
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
 					  struct nfsd_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
@@ -72,11 +72,11 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
  * Set a file's attributes
  * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
 					  struct nfsd_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
 		SVCFH_fmt(&argp->fh),
 		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
@@ -92,11 +92,11 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
  * doesn't exist yet.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 struct nfsd_diropres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: LOOKUP %s %.*s\n",
 		SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -112,11 +112,11 @@ nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
 * Read a symlink.
 */
-static int
+static __be32
 nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
 					   struct nfsd_readlinkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
 
@@ -132,11 +132,11 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
  * Read a portion of a file.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				       struct nfsd_readres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READ %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -146,20 +146,20 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	 * status, 17 words for fattr, and 1 word for the byte count.
 	 */
 
-	if (NFSSVC_MAXBLKSIZE < argp->count) {
+	if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
 		printk(KERN_NOTICE
 			"oversized read request from %u.%u.%u.%u:%d (%d bytes)\n",
 				NIPQUAD(rqstp->rq_addr.sin_addr.s_addr),
 				ntohs(rqstp->rq_addr.sin_port),
 				argp->count);
-		argp->count = NFSSVC_MAXBLKSIZE;
+		argp->count = NFSSVC_MAXBLKSIZE_V2;
 	}
 	svc_reserve(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				  argp->offset,
-				  argp->vec, argp->vlen,
+				  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 
 	if (nfserr) return nfserr;
@@ -172,11 +172,11 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
  * Write data to a file
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 					struct nfsd_attrstat *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 	int	stable = 1;
 
 	dprintk("nfsd: WRITE %s %d bytes at %d\n",
@@ -185,7 +185,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 
 	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &stable);
 	return nfsd_return_attrs(nfserr, resp);
@@ -197,7 +197,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
  * and the actual create() call in compliance with VFS protocols.
 * N.B. After this call _both_ argp->fh and resp->fh need an fh_put
 */
-static int
+static __be32
 nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					 struct nfsd_diropres *resp)
 {
@@ -206,7 +206,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	struct iattr	*attr = &argp->attrs;
 	struct inode	*inode;
 	struct dentry	*dchild;
-	int		nfserr, type, mode;
+	int		type, mode;
+	__be32		nfserr;
 	dev_t		rdev = 0, wanted = new_decode_dev(attr->ia_size);
 
 	dprintk("nfsd: CREATE %s %.*s\n",
@@ -225,7 +226,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	nfserr = nfserr_exist;
 	if (isdotent(argp->name, argp->len))
 		goto done;
-	fh_lock(dirfhp);
+	fh_lock_nested(dirfhp, I_MUTEX_PARENT);
 	dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
 	if (IS_ERR(dchild)) {
 		nfserr = nfserrno(PTR_ERR(dchild));
@@ -348,11 +349,11 @@ done:
 	return nfsd_return_dirop(nfserr, resp);
 }
 
-static int
+static __be32
 nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
 		argp->len, argp->name);
@@ -363,11 +364,11 @@ nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 					 void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RENAME %s %.*s -> \n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
@@ -381,11 +382,11 @@ nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 				void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LINK %s ->\n",
 		SVCFH_fmt(&argp->ffh));
@@ -401,12 +402,12 @@ nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
 					  void *resp)
 {
 	struct svc_fh	newfh;
-	int		nfserr;
+	__be32		nfserr;
 
 	dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
@@ -430,11 +431,11 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
  * Make directory. This operation is not idempotent.
 * N.B. After this call resp->fh needs an fh_put
 */
-static int
+static __be32
 nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					struct nfsd_diropres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -454,11 +455,11 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 /*
 * Remove a directory
 */
-static int
+static __be32
 nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					void *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -470,11 +471,12 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
 * Read a portion of a directory.
 */
-static int
+static __be32
 nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 					  struct nfsd_readdirres *resp)
 {
-	int		nfserr, count;
+	int		count;
+	__be32		nfserr;
 	loff_t		offset;
 
 	dprintk("nfsd: READDIR %s %d bytes at %d\n",
@@ -509,11 +511,11 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 /*
 * Get file system info
 */
-static int
+static __be32
 nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 					  struct nfsd_statfsres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
 
@@ -553,7 +555,7 @@ static struct svc_procedure nfsd_procedures2[18] = {
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(lookup,	 diropargs,	diropres,	fhandle,	RC_NOCACHE, ST+FH+AT),
   PROC(readlink, readlinkargs,	readlinkres,	none,		RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
-  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
+  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4),
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(write,	 writeargs,	attrstat,	fhandle,	RC_REPLBUFF, ST+AT),
   PROC(create,	 createargs,	diropres,	fhandle,	RC_REPLBUFF, ST+FH+AT),
@@ -579,11 +581,11 @@ struct svc_version nfsd_version2 = {
 /*
 * Map errnos to NFS errnos.
 */
-int
+__be32
 nfserrno (int errno)
 {
 	static struct {
-		int	nfserr;
+		__be32	nfserr;
 		int	syserr;
 	} nfs_errtbl[] = {
 		{ nfs_ok, 0 },
@@ -615,11 +617,10 @@ nfserrno (int errno)
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
-		{ -1, -EIO }
 	};
 	int	i;
 
-	for (i = 0; nfs_errtbl[i].nfserr != -1; i++) {
+	for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
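
Dropping the { -1, -EIO } sentinel from nfs_errtbl[] is only safe because the loop now bounds itself with ARRAY_SIZE() and the caller falls through to a default after the table. A reduced sketch of that table-driven mapping (toy status values, not the real network-order nfserr_* constants):

	#include <stdio.h>
	#include <errno.h>

	#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

	/* toy stand-ins for the NFS status codes */
	#define NFS_OK		0
	#define NFSERR_PERM	1
	#define NFSERR_NOENT	2
	#define NFSERR_IO	5

	static int nfs_status(int errno_val)
	{
		static const struct {
			int nfserr;
			int syserr;
		} errtbl[] = {
			{ NFS_OK,	0	},
			{ NFSERR_PERM,	-EPERM	},
			{ NFSERR_NOENT,	-ENOENT	},
		};
		unsigned int i;

		for (i = 0; i < ARRAY_SIZE(errtbl); i++)  /* no sentinel */
			if (errtbl[i].syserr == errno_val)
				return errtbl[i].nfserr;
		return NFSERR_IO;	/* default for unmapped errnos */
	}

	int main(void)
	{
		printf("-EPERM  -> %d\n", nfs_status(-EPERM));
		printf("-ENOSPC -> %d\n", nfs_status(-ENOSPC));
		return 0;
	}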
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 19443056ec30..0aaccb03bf76 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -198,9 +198,26 @@ int nfsd_create_serv(void)
 		unlock_kernel();
 		return 0;
 	}
+	if (nfsd_max_blksize == 0) {
+		/* choose a suitable default */
+		struct sysinfo i;
+		si_meminfo(&i);
+		/* Aim for 1/4096 of memory per thread
+		 * This gives 1MB on 4Gig machines
+		 * But only uses 32K on 128M machines.
+		 * Bottom out at 8K on 32M and smaller.
+		 * Of course, this is only a default.
+		 */
+		nfsd_max_blksize = NFSSVC_MAXBLKSIZE;
+		i.totalram <<= PAGE_SHIFT - 12;
+		while (nfsd_max_blksize > i.totalram &&
+		       nfsd_max_blksize >= 8*1024*2)
+			nfsd_max_blksize /= 2;
+	}
 
 	atomic_set(&nfsd_busy, 0);
-	nfsd_serv = svc_create_pooled(&nfsd_program, NFSD_BUFSIZE,
+	nfsd_serv = svc_create_pooled(&nfsd_program,
+				      nfsd_max_blksize,
 				      nfsd_last_thread,
 				      nfsd, SIG_NOCLEAN, THIS_MODULE);
 	if (nfsd_serv == NULL)
@@ -474,12 +491,12 @@ out:
 }
 
 int
-nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
+nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
 	struct svc_procedure	*proc;
 	kxdrproc_t		xdr;
-	u32			nfserr;
-	u32			*nfserrp;
+	__be32			nfserr;
+	__be32			*nfserrp;
 
 	dprintk("nfsd_dispatch: vers %d proc %d\n",
 				rqstp->rq_vers, rqstp->rq_proc);
@@ -498,7 +515,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 
 	/* Decode arguments */
 	xdr = proc->pc_decode;
-	if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base,
+	if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
 			rqstp->rq_argp)) {
 		dprintk("nfsd: failed to decode arguments!\n");
 		nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
@@ -511,7 +528,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 	 */
 	nfserrp = rqstp->rq_res.head[0].iov_base
 		+ rqstp->rq_res.head[0].iov_len;
-	rqstp->rq_res.head[0].iov_len += sizeof(u32);
+	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
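
The default chosen in nfsd_create_serv() works in units: si_meminfo() reports totalram in pages, and totalram <<= PAGE_SHIFT - 12 turns that page count into RAM/4096 expressed in bytes, so comparing it against the byte-sized block size enforces the "1/4096 of memory" target. A userspace rendering of the same halving loop (the memory sizes are example inputs, and the 1MB start mirrors the patch's NFSSVC_MAXBLKSIZE):

	#include <stdio.h>

	#define MAXBLKSIZE	(1024 * 1024)	/* start at 1MB */

	static unsigned long long default_blksize(unsigned long long mem_bytes)
	{
		unsigned long long blksize = MAXBLKSIZE;
		unsigned long long limit = mem_bytes / 4096; /* 1/4096 of RAM */

		/* halve until within the limit, never dropping below 8K:
		 * the >= 16K guard allows exactly one halving from 16K */
		while (blksize > limit && blksize >= 8 * 1024 * 2)
			blksize /= 2;
		return blksize;
	}

	int main(void)
	{
		unsigned long long mems[] = {
			4ULL << 30,	/* 4GB  -> 1MB */
			128ULL << 20,	/* 128M -> 32K */
			32ULL << 20,	/* 32M  -> 8K  */
		};
		unsigned int i;

		for (i = 0; i < sizeof(mems) / sizeof(mems[0]); i++)
			printf("%12llu bytes RAM -> %llu byte blocks\n",
			       mems[i], default_blksize(mems[i]));
		return 0;
	}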
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 3f14a17eaa6e..56ebb1443e0e 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -37,8 +37,8 @@ static u32 nfs_ftypes[] = {
 /*
 * XDR functions for basic NFS types
 */
-static u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	fh_init(fhp, NFS_FHSIZE);
 	memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
@@ -50,13 +50,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
 	return p + (NFS_FHSIZE>> 2);
@@ -66,8 +66,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
 * no slashes or null bytes.
 */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -82,8 +82,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_pathname(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_pathname(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -98,8 +98,8 @@ decode_pathname(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr(__be32 *p, struct iattr *iap)
 {
 	u32	tmp, tmp1;
 
@@ -151,8 +151,8 @@ decode_sattr(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static __be32 *
+encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	     struct kstat *stat)
 {
 	struct dentry	*dentry = fhp->fh_dentry;
@@ -195,7 +195,7 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct kstat stat;
 	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
@@ -206,13 +206,13 @@ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 * XDR decode functions
 */
 int
-nfssvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -220,7 +220,7 @@ nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
 }
 
 int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -231,7 +231,7 @@ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -242,7 +242,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readargs *args)
 {
 	unsigned int len;
@@ -254,19 +254,18 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 	len = args->count     = ntohl(*p++);
 	p++; /* totalcount - unused */
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 
 	/* set up somewhere to store response.
 	 * We take pages, put them on reslist and include in iovec
 	 */
 	v=0;
 	while (len > 0) {
-		pn=rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -274,7 +273,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_writeargs *args)
 {
 	unsigned int len;
@@ -286,25 +285,25 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	args->offset = ntohl(*p++);	/* offset */
 	p++;				/* totalcount */
 	len = args->len = ntohl(*p++);
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
 		(((void*)p) - rqstp->rq_arg.head[0].iov_base);
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 	v = 0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
-	return args->vec[0].iov_len > 0;
+	return rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -316,7 +315,7 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -329,18 +328,17 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readlinkargs *args)
+nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -352,7 +350,7 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_symlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -365,7 +363,7 @@ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -375,8 +373,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -385,13 +382,13 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 * XDR encode functions
 */
 int
-nfssvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 int
-nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_attrstat *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -399,7 +396,7 @@ nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
@@ -408,7 +405,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readlinkres *resp)
 {
 	*p++ = htonl(resp->len);
@@ -416,7 +413,6 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->len;
 	if (resp->len & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -425,7 +421,7 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readres *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -436,7 +432,6 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->count;
 	if (resp->count & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
@@ -445,7 +440,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirres *resp)
 {
 	xdr_ressize_check(rqstp, p);
@@ -458,12 +453,12 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_statfsres *resp)
 {
 	struct kstatfs	*stat = &resp->stats;
 
-	*p++ = htonl(NFSSVC_MAXBLKSIZE);	/* max transfer size */
+	*p++ = htonl(NFSSVC_MAXBLKSIZE_V2);	/* max transfer size */
 	*p++ = htonl(stat->f_bsize);
 	*p++ = htonl(stat->f_blocks);
 	*p++ = htonl(stat->f_bfree);
@@ -476,7 +471,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 		    int namlen, loff_t offset, ino_t ino, unsigned int d_type)
 {
 	struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
-	u32	*p = cd->buffer;
+	__be32	*p = cd->buffer;
 	int	buflen, slen;
 
 	/*
@@ -502,7 +497,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 	*p++ = htonl((u32) ino);	/* file id */
 	p    = xdr_encode_array(p, name, namlen);/* name length & name */
 	cd->offset = p;			/* remember pointer */
-	*p++ = ~(u32) 0;		/* offset of next entry */
+	*p++ = htonl(~0U);		/* offset of next entry */
 
 	cd->buflen = buflen;
 	cd->buffer = p;
@@ -514,7 +509,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 * XDR release functions
 */
 int
-nfssvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
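
nfssvc_decode_readargs() above now fills rqstp->rq_vec directly, carving the requested length into page-sized segments. The carving itself is simple arithmetic; a standalone sketch under assumed values (the PAGE_SIZE, segment struct and limit are illustrative, not the kernel's types):

	#include <stdio.h>

	#define PAGE_SIZE	4096
	#define MAX_SEGS	8

	struct seg {
		unsigned long	len;	/* stand-in for a kvec's iov_len */
	};

	/* split 'len' bytes across page-sized segments; returns count */
	static int fill_vec(struct seg *vec, long len)
	{
		int v = 0;

		while (len > 0 && v < MAX_SEGS) {
			vec[v].len = len < PAGE_SIZE ? len : PAGE_SIZE;
			len -= vec[v].len;
			v++;
		}
		return v;
	}

	int main(void)
	{
		struct seg vec[MAX_SEGS];
		int n = fill_vec(vec, 10000), i;

		for (i = 0; i < n; i++)
			printf("seg %d: %lu bytes\n", i, vec[i].len);
		return 0;
	}

For 10000 bytes this yields segments of 4096, 4096 and 1808 bytes, which is exactly the shape the decoder hands to nfsd_read().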
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 443ebc52e382..f21e917bb8ed 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -54,6 +54,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
+#include <linux/jhash.h>
 
 #include <asm/uaccess.h>
 
@@ -81,10 +82,19 @@ struct raparms {
 	dev_t			p_dev;
 	int			p_set;
 	struct file_ra_state	p_ra;
+	unsigned int		p_hindex;
 };
 
+struct raparm_hbucket {
+	struct raparms		*pb_head;
+	spinlock_t		pb_lock;
+} ____cacheline_aligned_in_smp;
+
 static struct raparms *		raparml;
-static struct raparms *		raparm_cache;
+#define RAPARM_HASH_BITS	4
+#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
+#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
+static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
 
 /*
 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
@@ -100,7 +110,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	struct dentry *dentry = *dpp;
 	struct vfsmount *mnt = mntget(exp->ex_mnt);
 	struct dentry *mounts = dget(dentry);
-	int err = nfs_ok;
+	int err = 0;
 
 	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
 
@@ -138,14 +148,15 @@ out:
  * clients and is explicitly disallowed for NFSv3
  *      NeilBrown <neilb@cse.unsw.edu.au>
 */
-int
+__be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 					int len, struct svc_fh *resfh)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent;
 	struct dentry		*dentry;
-	int			err;
+	__be32			err;
+	int			host_err;
 
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
@@ -183,7 +194,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 			exp2 = exp_parent(exp->ex_client, mnt, dentry,
 					  &rqstp->rq_chandle);
 			if (IS_ERR(exp2)) {
-				err = PTR_ERR(exp2);
+				host_err = PTR_ERR(exp2);
 				dput(dentry);
 				mntput(mnt);
 				goto out_nfserr;
@@ -200,14 +211,14 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	} else {
 		fh_lock(fhp);
 		dentry = lookup_one_len(name, dparent, len);
-		err = PTR_ERR(dentry);
+		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
 		/*
 		 * check if we have crossed a mount point ...
 		 */
 		if (d_mountpoint(dentry)) {
-			if ((err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
 			}
@@ -226,7 +237,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -234,7 +245,7 @@ out_nfserr:
  * Set various file attributes.
 * N.B. After this call fhp needs an fh_put
 */
-int
+__be32
 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	     int check_guard, time_t guardtime)
 {
@@ -243,7 +254,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	int		accmode = MAY_SATTR;
 	int		ftype = 0;
 	int		imode;
-	int		err;
+	__be32		err;
+	int		host_err;
 	int		size_change = 0;
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -309,19 +321,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * If we are changing the size of the file, then
 		 * we need to break all leases.
 		 */
-		err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
-		if (err == -EWOULDBLOCK)
-			err = -ETIMEDOUT;
-		if (err) /* ENOMEM or EWOULDBLOCK */
+		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+		if (host_err == -EWOULDBLOCK)
+			host_err = -ETIMEDOUT;
+		if (host_err) /* ENOMEM or EWOULDBLOCK */
 			goto out_nfserr;
 
-		err = get_write_access(inode);
-		if (err)
+		host_err = get_write_access(inode);
+		if (host_err)
 			goto out_nfserr;
 
 		size_change = 1;
-		err = locks_verify_truncate(inode, NULL, iap->ia_size);
-		if (err) {
+		host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+		if (host_err) {
 			put_write_access(inode);
 			goto out_nfserr;
 		}
@@ -347,8 +359,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	err = nfserr_notsync;
 	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
 		fh_lock(fhp);
-		err = notify_change(dentry, iap);
-		err = nfserrno(err);
+		host_err = notify_change(dentry, iap);
+		err = nfserrno(host_err);
 		fh_unlock(fhp);
 	}
 	if (size_change)
@@ -360,7 +372,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -410,11 +422,12 @@ out:
 	return error;
 }
 
-int
+__be32
 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
     struct nfs4_acl *acl)
 {
-	int error;
+	__be32 error;
+	int host_error;
 	struct dentry *dentry;
 	struct inode *inode;
 	struct posix_acl *pacl = NULL, *dpacl = NULL;
@@ -430,22 +443,20 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (S_ISDIR(inode->i_mode))
 		flags = NFS4_ACL_DIR;
 
-	error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
-	if (error == -EINVAL) {
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL) {
 		error = nfserr_attrnotsupp;
 		goto out;
-	} else if (error < 0)
+	} else if (host_error < 0)
 		goto out_nfserr;
 
-	if (pacl) {
-		error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
-		if (error < 0)
-			goto out_nfserr;
-	}
+	host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
+	if (host_error < 0)
+		goto out_nfserr;
 
-	if (dpacl) {
-		error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
-		if (error < 0)
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
+		if (host_error < 0)
 			goto out_nfserr;
 	}
 
@@ -456,7 +467,7 @@ out:
 	posix_acl_release(dpacl);
 	return (error);
 out_nfserr:
-	error = nfserrno(error);
+	error = nfserrno(host_error);
 	goto out;
 }
 
@@ -563,14 +574,14 @@ static struct accessmap nfs3_anyaccess[] = {
     {	0,			0				}
 };
 
-int
+__be32
 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
 {
 	struct accessmap	*map;
 	struct svc_export	*export;
 	struct dentry		*dentry;
 	u32			query, result = 0, sresult = 0;
-	unsigned int		error;
+	__be32			error;
 
 	error = fh_verify(rqstp, fhp, 0, MAY_NOP);
 	if (error)
@@ -590,7 +601,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 	query = *access;
 	for (; map->access; map++) {
 		if (map->access & query) {
-			unsigned int err2;
+			__be32 err2;
 
 			sresult |= map->access;
 
@@ -629,13 +640,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
  * The access argument indicates the type of open (read/write/lock)
 * N.B. After this call fhp needs an fh_put
 */
-int
+__be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			int access, struct file **filp)
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
-	int		flags = O_RDONLY|O_LARGEFILE, err;
+	int		flags = O_RDONLY|O_LARGEFILE;
+	__be32		err;
+	int		host_err;
 
 	/*
 	 * If we get here, then the client has already done an "open",
@@ -665,10 +678,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
-	if (err == -EWOULDBLOCK)
-		err = -ETIMEDOUT;
-	if (err) /* NOMEM or WOULDBLOCK */
+	host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+	if (host_err == -EWOULDBLOCK)
+		host_err = -ETIMEDOUT;
+	if (host_err) /* NOMEM or WOULDBLOCK */
672 goto out_nfserr; 685 goto out_nfserr;
673 686
674 if (access & MAY_WRITE) { 687 if (access & MAY_WRITE) {
@@ -681,10 +694,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
681 } 694 }
682 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); 695 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
683 if (IS_ERR(*filp)) 696 if (IS_ERR(*filp))
684 err = PTR_ERR(*filp); 697 host_err = PTR_ERR(*filp);
685out_nfserr: 698out_nfserr:
686 if (err) 699 err = nfserrno(host_err);
687 err = nfserrno(err);
688out: 700out:
689 return err; 701 return err;
690} 702}
@@ -743,16 +755,20 @@ nfsd_sync_dir(struct dentry *dp)
743 * Obtain the readahead parameters for the file 755 * Obtain the readahead parameters for the file
744 * specified by (dev, ino). 756 * specified by (dev, ino).
745 */ 757 */
746static DEFINE_SPINLOCK(ra_lock);
747 758
748static inline struct raparms * 759static inline struct raparms *
749nfsd_get_raparms(dev_t dev, ino_t ino) 760nfsd_get_raparms(dev_t dev, ino_t ino)
750{ 761{
751 struct raparms *ra, **rap, **frap = NULL; 762 struct raparms *ra, **rap, **frap = NULL;
752 int depth = 0; 763 int depth = 0;
764 unsigned int hash;
765 struct raparm_hbucket *rab;
753 766
754 spin_lock(&ra_lock); 767 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
755 for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) { 768 rab = &raparm_hash[hash];
769
770 spin_lock(&rab->pb_lock);
771 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
756 if (ra->p_ino == ino && ra->p_dev == dev) 772 if (ra->p_ino == ino && ra->p_dev == dev)
757 goto found; 773 goto found;
758 depth++; 774 depth++;
@@ -761,7 +777,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
761 } 777 }
762 depth = nfsdstats.ra_size*11/10; 778 depth = nfsdstats.ra_size*11/10;
763 if (!frap) { 779 if (!frap) {
764 spin_unlock(&ra_lock); 780 spin_unlock(&rab->pb_lock);
765 return NULL; 781 return NULL;
766 } 782 }
767 rap = frap; 783 rap = frap;
@@ -769,15 +785,16 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
769 ra->p_dev = dev; 785 ra->p_dev = dev;
770 ra->p_ino = ino; 786 ra->p_ino = ino;
771 ra->p_set = 0; 787 ra->p_set = 0;
788 ra->p_hindex = hash;
772found: 789found:
773 if (rap != &raparm_cache) { 790 if (rap != &rab->pb_head) {
774 *rap = ra->p_next; 791 *rap = ra->p_next;
775 ra->p_next = raparm_cache; 792 ra->p_next = rab->pb_head;
776 raparm_cache = ra; 793 rab->pb_head = ra;
777 } 794 }
778 ra->p_count++; 795 ra->p_count++;
779 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; 796 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
780 spin_unlock(&ra_lock); 797 spin_unlock(&rab->pb_lock);
781 return ra; 798 return ra;
782} 799}
783 800
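nfsd_get_raparms() above trades one global list under ra_lock for RAPARM_HASH_SIZE buckets, each with its own head and spinlock, so lookups for different (dev, ino) pairs no longer serialize on a single lock. A hedged userspace sketch of the lookup side; the bucket count and hash function here are illustrative, not the kernel's jhash_2words().

#include <pthread.h>
#include <stddef.h>

#define RA_HASH_BITS 4
#define RA_HASH_SIZE (1 << RA_HASH_BITS)
#define RA_HASH_MASK (RA_HASH_SIZE - 1)

struct raparm {
	unsigned long p_dev, p_ino;
	unsigned int p_hindex;          /* bucket index, kept for the put side */
	struct raparm *p_next;
};

static struct ra_bucket {
	struct raparm *head;
	pthread_mutex_t lock;           /* one lock per bucket, not one global */
} ra_hash[RA_HASH_SIZE];

static void ra_init(void)
{
	for (int i = 0; i < RA_HASH_SIZE; i++) {
		ra_hash[i].head = NULL;
		pthread_mutex_init(&ra_hash[i].lock, NULL);
	}
}

static struct raparm *ra_lookup(unsigned long dev, unsigned long ino)
{
	unsigned int h = (unsigned int)(dev ^ ino * 2654435761u) & RA_HASH_MASK;
	struct ra_bucket *b = &ra_hash[h];
	struct raparm *ra;

	pthread_mutex_lock(&b->lock);   /* contention is now per bucket */
	for (ra = b->head; ra; ra = ra->p_next)
		if (ra->p_dev == dev && ra->p_ino == ino)
			break;
	pthread_mutex_unlock(&b->lock);
	return ra;
}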
@@ -791,36 +808,41 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
791{ 808{
792 unsigned long count = desc->count; 809 unsigned long count = desc->count;
793 struct svc_rqst *rqstp = desc->arg.data; 810 struct svc_rqst *rqstp = desc->arg.data;
811 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
794 812
795 if (size > count) 813 if (size > count)
796 size = count; 814 size = count;
797 815
798 if (rqstp->rq_res.page_len == 0) { 816 if (rqstp->rq_res.page_len == 0) {
799 get_page(page); 817 get_page(page);
800 rqstp->rq_respages[rqstp->rq_resused++] = page; 818 put_page(*pp);
819 *pp = page;
820 rqstp->rq_resused++;
801 rqstp->rq_res.page_base = offset; 821 rqstp->rq_res.page_base = offset;
802 rqstp->rq_res.page_len = size; 822 rqstp->rq_res.page_len = size;
803 } else if (page != rqstp->rq_respages[rqstp->rq_resused-1]) { 823 } else if (page != pp[-1]) {
804 get_page(page); 824 get_page(page);
805 rqstp->rq_respages[rqstp->rq_resused++] = page; 825 put_page(*pp);
826 *pp = page;
827 rqstp->rq_resused++;
806 rqstp->rq_res.page_len += size; 828 rqstp->rq_res.page_len += size;
807 } else { 829 } else
808 rqstp->rq_res.page_len += size; 830 rqstp->rq_res.page_len += size;
809 }
810 831
811 desc->count = count - size; 832 desc->count = count - size;
812 desc->written += size; 833 desc->written += size;
813 return size; 834 return size;
814} 835}
815 836
816static int 837static __be32
817nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 838nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
818 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 839 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
819{ 840{
820 struct inode *inode; 841 struct inode *inode;
821 struct raparms *ra; 842 struct raparms *ra;
822 mm_segment_t oldfs; 843 mm_segment_t oldfs;
823 int err; 844 __be32 err;
845 int host_err;
824 846
825 err = nfserr_perm; 847 err = nfserr_perm;
826 inode = file->f_dentry->d_inode; 848 inode = file->f_dentry->d_inode;
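In the nfsd_read_actor() hunk above, a response-page slot that may already hold a reserved page is replaced rather than appended: the incoming page is pinned with get_page() before the old occupant is dropped with put_page(). A toy refcount model of that swap, assuming (as the kernel code does for rq_respages) that the slot is never empty:

#include <assert.h>

struct page { int refcount; };

static void get_page(struct page *p) { p->refcount++; }
static void put_page(struct page *p) { assert(p->refcount > 0); p->refcount--; }

static void replace_slot(struct page **slot, struct page *new_page)
{
	get_page(new_page);     /* pin the incoming page first */
	put_page(*slot);        /* then release the page being displaced */
	*slot = new_page;
}

int main(void)
{
	struct page a = { .refcount = 1 }, b = { .refcount = 1 };
	struct page *slot = &a;

	replace_slot(&slot, &b);
	assert(slot == &b && a.refcount == 0 && b.refcount == 2);
	return 0;
}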
@@ -837,32 +859,33 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
837 file->f_ra = ra->p_ra; 859 file->f_ra = ra->p_ra;
838 860
839 if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { 861 if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
840 svc_pushback_unused_pages(rqstp); 862 rqstp->rq_resused = 1;
841 err = file->f_op->sendfile(file, &offset, *count, 863 host_err = file->f_op->sendfile(file, &offset, *count,
842 nfsd_read_actor, rqstp); 864 nfsd_read_actor, rqstp);
843 } else { 865 } else {
844 oldfs = get_fs(); 866 oldfs = get_fs();
845 set_fs(KERNEL_DS); 867 set_fs(KERNEL_DS);
846 err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 868 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
847 set_fs(oldfs); 869 set_fs(oldfs);
848 } 870 }
849 871
850 /* Write back readahead params */ 872 /* Write back readahead params */
851 if (ra) { 873 if (ra) {
852 spin_lock(&ra_lock); 874 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
875 spin_lock(&rab->pb_lock);
853 ra->p_ra = file->f_ra; 876 ra->p_ra = file->f_ra;
854 ra->p_set = 1; 877 ra->p_set = 1;
855 ra->p_count--; 878 ra->p_count--;
856 spin_unlock(&ra_lock); 879 spin_unlock(&rab->pb_lock);
857 } 880 }
858 881
859 if (err >= 0) { 882 if (host_err >= 0) {
860 nfsdstats.io_read += err; 883 nfsdstats.io_read += host_err;
861 *count = err; 884 *count = host_err;
862 err = 0; 885 err = 0;
863 fsnotify_access(file->f_dentry); 886 fsnotify_access(file->f_dentry);
864 } else 887 } else
865 err = nfserrno(err); 888 err = nfserrno(host_err);
866out: 889out:
867 return err; 890 return err;
868} 891}
@@ -877,7 +900,7 @@ static void kill_suid(struct dentry *dentry)
877 mutex_unlock(&dentry->d_inode->i_mutex); 900 mutex_unlock(&dentry->d_inode->i_mutex);
878} 901}
879 902
880static int 903static __be32
881nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 904nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
882 loff_t offset, struct kvec *vec, int vlen, 905 loff_t offset, struct kvec *vec, int vlen,
883 unsigned long cnt, int *stablep) 906 unsigned long cnt, int *stablep)
@@ -886,7 +909,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
886 struct dentry *dentry; 909 struct dentry *dentry;
887 struct inode *inode; 910 struct inode *inode;
888 mm_segment_t oldfs; 911 mm_segment_t oldfs;
889 int err = 0; 912 __be32 err = 0;
913 int host_err;
890 int stable = *stablep; 914 int stable = *stablep;
891 915
892#ifdef MSNFS 916#ifdef MSNFS
@@ -922,18 +946,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
922 946
923 /* Write the data. */ 947 /* Write the data. */
924 oldfs = get_fs(); set_fs(KERNEL_DS); 948 oldfs = get_fs(); set_fs(KERNEL_DS);
925 err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 949 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
926 set_fs(oldfs); 950 set_fs(oldfs);
927 if (err >= 0) { 951 if (host_err >= 0) {
928 nfsdstats.io_write += cnt; 952 nfsdstats.io_write += cnt;
929 fsnotify_modify(file->f_dentry); 953 fsnotify_modify(file->f_dentry);
930 } 954 }
931 955
932 /* clear setuid/setgid flag after write */ 956 /* clear setuid/setgid flag after write */
933 if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 957 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
934 kill_suid(dentry); 958 kill_suid(dentry);
935 959
936 if (err >= 0 && stable) { 960 if (host_err >= 0 && stable) {
937 static ino_t last_ino; 961 static ino_t last_ino;
938 static dev_t last_dev; 962 static dev_t last_dev;
939 963
@@ -959,7 +983,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
959 983
960 if (inode->i_state & I_DIRTY) { 984 if (inode->i_state & I_DIRTY) {
961 dprintk("nfsd: write sync %d\n", current->pid); 985 dprintk("nfsd: write sync %d\n", current->pid);
962 err=nfsd_sync(file); 986 host_err=nfsd_sync(file);
963 } 987 }
964#if 0 988#if 0
965 wake_up(&inode->i_wait); 989 wake_up(&inode->i_wait);
@@ -969,11 +993,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
969 last_dev = inode->i_sb->s_dev; 993 last_dev = inode->i_sb->s_dev;
970 } 994 }
971 995
972 dprintk("nfsd: write complete err=%d\n", err); 996 dprintk("nfsd: write complete host_err=%d\n", host_err);
973 if (err >= 0) 997 if (host_err >= 0)
974 err = 0; 998 err = 0;
975 else 999 else
976 err = nfserrno(err); 1000 err = nfserrno(host_err);
977out: 1001out:
978 return err; 1002 return err;
979} 1003}
@@ -983,12 +1007,12 @@ out:
983 * on entry. On return, *count contains the number of bytes actually read. 1007 * on entry. On return, *count contains the number of bytes actually read.
984 * N.B. After this call fhp needs an fh_put 1008 * N.B. After this call fhp needs an fh_put
985 */ 1009 */
986int 1010__be32
987nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1011nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
988 loff_t offset, struct kvec *vec, int vlen, 1012 loff_t offset, struct kvec *vec, int vlen,
989 unsigned long *count) 1013 unsigned long *count)
990{ 1014{
991 int err; 1015 __be32 err;
992 1016
993 if (file) { 1017 if (file) {
994 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1018 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1012,12 +1036,12 @@ out:
1012 * The stable flag requests synchronous writes. 1036 * The stable flag requests synchronous writes.
1013 * N.B. After this call fhp needs an fh_put 1037 * N.B. After this call fhp needs an fh_put
1014 */ 1038 */
1015int 1039__be32
1016nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1040nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1017 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1041 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
1018 int *stablep) 1042 int *stablep)
1019{ 1043{
1020 int err = 0; 1044 __be32 err = 0;
1021 1045
1022 if (file) { 1046 if (file) {
1023 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1047 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1049,12 +1073,12 @@ out:
1049 * Unfortunately we cannot lock the file to make sure we return full WCC 1073 * Unfortunately we cannot lock the file to make sure we return full WCC
1050 * data to the client, as locking happens lower down in the filesystem. 1074 * data to the client, as locking happens lower down in the filesystem.
1051 */ 1075 */
1052int 1076__be32
1053nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, 1077nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1054 loff_t offset, unsigned long count) 1078 loff_t offset, unsigned long count)
1055{ 1079{
1056 struct file *file; 1080 struct file *file;
1057 int err; 1081 __be32 err;
1058 1082
1059 if ((u64)count > ~(u64)offset) 1083 if ((u64)count > ~(u64)offset)
1060 return nfserr_inval; 1084 return nfserr_inval;
@@ -1082,14 +1106,15 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1082 * 1106 *
1083 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp 1107 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
1084 */ 1108 */
1085int 1109__be32
1086nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1110nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1087 char *fname, int flen, struct iattr *iap, 1111 char *fname, int flen, struct iattr *iap,
1088 int type, dev_t rdev, struct svc_fh *resfhp) 1112 int type, dev_t rdev, struct svc_fh *resfhp)
1089{ 1113{
1090 struct dentry *dentry, *dchild = NULL; 1114 struct dentry *dentry, *dchild = NULL;
1091 struct inode *dirp; 1115 struct inode *dirp;
1092 int err; 1116 __be32 err;
1117 int host_err;
1093 1118
1094 err = nfserr_perm; 1119 err = nfserr_perm;
1095 if (!flen) 1120 if (!flen)
@@ -1116,7 +1141,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1116 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1141 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
1117 fh_lock_nested(fhp, I_MUTEX_PARENT); 1142 fh_lock_nested(fhp, I_MUTEX_PARENT);
1118 dchild = lookup_one_len(fname, dentry, flen); 1143 dchild = lookup_one_len(fname, dentry, flen);
1119 err = PTR_ERR(dchild); 1144 host_err = PTR_ERR(dchild);
1120 if (IS_ERR(dchild)) 1145 if (IS_ERR(dchild))
1121 goto out_nfserr; 1146 goto out_nfserr;
1122 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1147 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
@@ -1155,22 +1180,22 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1155 err = nfserr_perm; 1180 err = nfserr_perm;
1156 switch (type) { 1181 switch (type) {
1157 case S_IFREG: 1182 case S_IFREG:
1158 err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1183 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1159 break; 1184 break;
1160 case S_IFDIR: 1185 case S_IFDIR:
1161 err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1186 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
1162 break; 1187 break;
1163 case S_IFCHR: 1188 case S_IFCHR:
1164 case S_IFBLK: 1189 case S_IFBLK:
1165 case S_IFIFO: 1190 case S_IFIFO:
1166 case S_IFSOCK: 1191 case S_IFSOCK:
1167 err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1192 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1168 break; 1193 break;
1169 default: 1194 default:
1170 printk("nfsd: bad file type %o in nfsd_create\n", type); 1195 printk("nfsd: bad file type %o in nfsd_create\n", type);
1171 err = -EINVAL; 1196 host_err = -EINVAL;
1172 } 1197 }
1173 if (err < 0) 1198 if (host_err < 0)
1174 goto out_nfserr; 1199 goto out_nfserr;
1175 1200
1176 if (EX_ISSYNC(fhp->fh_export)) { 1201 if (EX_ISSYNC(fhp->fh_export)) {
@@ -1185,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1185 * directories via NFS. 1210 * directories via NFS.
1186 */ 1211 */
1187 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1212 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1188 int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1213 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1189 if (err2) 1214 if (err2)
1190 err = err2; 1215 err = err2;
1191 } 1216 }
@@ -1200,7 +1225,7 @@ out:
1200 return err; 1225 return err;
1201 1226
1202out_nfserr: 1227out_nfserr:
1203 err = nfserrno(err); 1228 err = nfserrno(host_err);
1204 goto out; 1229 goto out;
1205} 1230}
1206 1231
@@ -1208,7 +1233,7 @@ out_nfserr:
1208/* 1233/*
1209 * NFSv3 version of nfsd_create 1234 * NFSv3 version of nfsd_create
1210 */ 1235 */
1211int 1236__be32
1212nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1237nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1213 char *fname, int flen, struct iattr *iap, 1238 char *fname, int flen, struct iattr *iap,
1214 struct svc_fh *resfhp, int createmode, u32 *verifier, 1239 struct svc_fh *resfhp, int createmode, u32 *verifier,
@@ -1216,7 +1241,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1216{ 1241{
1217 struct dentry *dentry, *dchild = NULL; 1242 struct dentry *dentry, *dchild = NULL;
1218 struct inode *dirp; 1243 struct inode *dirp;
1219 int err; 1244 __be32 err;
1245 int host_err;
1220 __u32 v_mtime=0, v_atime=0; 1246 __u32 v_mtime=0, v_atime=0;
1221 int v_mode=0; 1247 int v_mode=0;
1222 1248
@@ -1246,7 +1272,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1246 * Compose the response file handle. 1272 * Compose the response file handle.
1247 */ 1273 */
1248 dchild = lookup_one_len(fname, dentry, flen); 1274 dchild = lookup_one_len(fname, dentry, flen);
1249 err = PTR_ERR(dchild); 1275 host_err = PTR_ERR(dchild);
1250 if (IS_ERR(dchild)) 1276 if (IS_ERR(dchild))
1251 goto out_nfserr; 1277 goto out_nfserr;
1252 1278
@@ -1302,8 +1328,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1302 goto out; 1328 goto out;
1303 } 1329 }
1304 1330
1305 err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1331 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1306 if (err < 0) 1332 if (host_err < 0)
1307 goto out_nfserr; 1333 goto out_nfserr;
1308 1334
1309 if (EX_ISSYNC(fhp->fh_export)) { 1335 if (EX_ISSYNC(fhp->fh_export)) {
@@ -1332,7 +1358,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1332 */ 1358 */
1333 set_attr: 1359 set_attr:
1334 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) { 1360 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
1335 int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1361 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1336 if (err2) 1362 if (err2)
1337 err = err2; 1363 err = err2;
1338 } 1364 }
@@ -1350,7 +1376,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1350 return err; 1376 return err;
1351 1377
1352 out_nfserr: 1378 out_nfserr:
1353 err = nfserrno(err); 1379 err = nfserrno(host_err);
1354 goto out; 1380 goto out;
1355} 1381}
1356#endif /* CONFIG_NFSD_V3 */ 1382#endif /* CONFIG_NFSD_V3 */
@@ -1360,13 +1386,14 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1360 * fits into the buffer. On return, it contains the true length. 1386 * fits into the buffer. On return, it contains the true length.
1361 * N.B. After this call fhp needs an fh_put 1387 * N.B. After this call fhp needs an fh_put
1362 */ 1388 */
1363int 1389__be32
1364nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) 1390nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1365{ 1391{
1366 struct dentry *dentry; 1392 struct dentry *dentry;
1367 struct inode *inode; 1393 struct inode *inode;
1368 mm_segment_t oldfs; 1394 mm_segment_t oldfs;
1369 int err; 1395 __be32 err;
1396 int host_err;
1370 1397
1371 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); 1398 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
1372 if (err) 1399 if (err)
@@ -1385,18 +1412,18 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1385 */ 1412 */
1386 1413
1387 oldfs = get_fs(); set_fs(KERNEL_DS); 1414 oldfs = get_fs(); set_fs(KERNEL_DS);
1388 err = inode->i_op->readlink(dentry, buf, *lenp); 1415 host_err = inode->i_op->readlink(dentry, buf, *lenp);
1389 set_fs(oldfs); 1416 set_fs(oldfs);
1390 1417
1391 if (err < 0) 1418 if (host_err < 0)
1392 goto out_nfserr; 1419 goto out_nfserr;
1393 *lenp = err; 1420 *lenp = host_err;
1394 err = 0; 1421 err = 0;
1395out: 1422out:
1396 return err; 1423 return err;
1397 1424
1398out_nfserr: 1425out_nfserr:
1399 err = nfserrno(err); 1426 err = nfserrno(host_err);
1400 goto out; 1427 goto out;
1401} 1428}
1402 1429
@@ -1404,7 +1431,7 @@ out_nfserr:
1404 * Create a symlink and look up its inode 1431 * Create a symlink and look up its inode
1405 * N.B. After this call _both_ fhp and resfhp need an fh_put 1432 * N.B. After this call _both_ fhp and resfhp need an fh_put
1406 */ 1433 */
1407int 1434__be32
1408nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1435nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1409 char *fname, int flen, 1436 char *fname, int flen,
1410 char *path, int plen, 1437 char *path, int plen,
@@ -1412,7 +1439,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1412 struct iattr *iap) 1439 struct iattr *iap)
1413{ 1440{
1414 struct dentry *dentry, *dnew; 1441 struct dentry *dentry, *dnew;
1415 int err, cerr; 1442 __be32 err, cerr;
1443 int host_err;
1416 umode_t mode; 1444 umode_t mode;
1417 1445
1418 err = nfserr_noent; 1446 err = nfserr_noent;
@@ -1428,7 +1456,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1428 fh_lock(fhp); 1456 fh_lock(fhp);
1429 dentry = fhp->fh_dentry; 1457 dentry = fhp->fh_dentry;
1430 dnew = lookup_one_len(fname, dentry, flen); 1458 dnew = lookup_one_len(fname, dentry, flen);
1431 err = PTR_ERR(dnew); 1459 host_err = PTR_ERR(dnew);
1432 if (IS_ERR(dnew)) 1460 if (IS_ERR(dnew))
1433 goto out_nfserr; 1461 goto out_nfserr;
1434 1462
@@ -1440,21 +1468,21 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1440 if (unlikely(path[plen] != 0)) { 1468 if (unlikely(path[plen] != 0)) {
1441 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1469 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1442 if (path_alloced == NULL) 1470 if (path_alloced == NULL)
1443 err = -ENOMEM; 1471 host_err = -ENOMEM;
1444 else { 1472 else {
1445 strncpy(path_alloced, path, plen); 1473 strncpy(path_alloced, path, plen);
1446 path_alloced[plen] = 0; 1474 path_alloced[plen] = 0;
1447 err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); 1475 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
1448 kfree(path_alloced); 1476 kfree(path_alloced);
1449 } 1477 }
1450 } else 1478 } else
1451 err = vfs_symlink(dentry->d_inode, dnew, path, mode); 1479 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
1452 1480
1453 if (!err) 1481 if (!host_err) {
1454 if (EX_ISSYNC(fhp->fh_export)) 1482 if (EX_ISSYNC(fhp->fh_export))
1455 err = nfsd_sync_dir(dentry); 1483 host_err = nfsd_sync_dir(dentry);
1456 if (err) 1484 }
1457 err = nfserrno(err); 1485 err = nfserrno(host_err);
1458 fh_unlock(fhp); 1486 fh_unlock(fhp);
1459 1487
1460 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1488 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
@@ -1464,7 +1492,7 @@ out:
1464 return err; 1492 return err;
1465 1493
1466out_nfserr: 1494out_nfserr:
1467 err = nfserrno(err); 1495 err = nfserrno(host_err);
1468 goto out; 1496 goto out;
1469} 1497}
1470 1498
@@ -1472,13 +1500,14 @@ out_nfserr:
1472 * Create a hardlink 1500 * Create a hardlink
1473 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1501 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1474 */ 1502 */
1475int 1503__be32
1476nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, 1504nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1477 char *name, int len, struct svc_fh *tfhp) 1505 char *name, int len, struct svc_fh *tfhp)
1478{ 1506{
1479 struct dentry *ddir, *dnew, *dold; 1507 struct dentry *ddir, *dnew, *dold;
1480 struct inode *dirp, *dest; 1508 struct inode *dirp, *dest;
1481 int err; 1509 __be32 err;
1510 int host_err;
1482 1511
1483 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); 1512 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
1484 if (err) 1513 if (err)
@@ -1499,24 +1528,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1499 dirp = ddir->d_inode; 1528 dirp = ddir->d_inode;
1500 1529
1501 dnew = lookup_one_len(name, ddir, len); 1530 dnew = lookup_one_len(name, ddir, len);
1502 err = PTR_ERR(dnew); 1531 host_err = PTR_ERR(dnew);
1503 if (IS_ERR(dnew)) 1532 if (IS_ERR(dnew))
1504 goto out_nfserr; 1533 goto out_nfserr;
1505 1534
1506 dold = tfhp->fh_dentry; 1535 dold = tfhp->fh_dentry;
1507 dest = dold->d_inode; 1536 dest = dold->d_inode;
1508 1537
1509 err = vfs_link(dold, dirp, dnew); 1538 host_err = vfs_link(dold, dirp, dnew);
1510 if (!err) { 1539 if (!host_err) {
1511 if (EX_ISSYNC(ffhp->fh_export)) { 1540 if (EX_ISSYNC(ffhp->fh_export)) {
1512 err = nfserrno(nfsd_sync_dir(ddir)); 1541 err = nfserrno(nfsd_sync_dir(ddir));
1513 write_inode_now(dest, 1); 1542 write_inode_now(dest, 1);
1514 } 1543 }
1544 err = 0;
1515 } else { 1545 } else {
1516 if (err == -EXDEV && rqstp->rq_vers == 2) 1546 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1517 err = nfserr_acces; 1547 err = nfserr_acces;
1518 else 1548 else
1519 err = nfserrno(err); 1549 err = nfserrno(host_err);
1520 } 1550 }
1521 1551
1522 dput(dnew); 1552 dput(dnew);
@@ -1526,7 +1556,7 @@ out:
1526 return err; 1556 return err;
1527 1557
1528out_nfserr: 1558out_nfserr:
1529 err = nfserrno(err); 1559 err = nfserrno(host_err);
1530 goto out_unlock; 1560 goto out_unlock;
1531} 1561}
1532 1562
@@ -1534,13 +1564,14 @@ out_nfserr:
1534 * Rename a file 1564 * Rename a file
1535 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1565 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1536 */ 1566 */
1537int 1567__be32
1538nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, 1568nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1539 struct svc_fh *tfhp, char *tname, int tlen) 1569 struct svc_fh *tfhp, char *tname, int tlen)
1540{ 1570{
1541 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; 1571 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
1542 struct inode *fdir, *tdir; 1572 struct inode *fdir, *tdir;
1543 int err; 1573 __be32 err;
1574 int host_err;
1544 1575
1545 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); 1576 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
1546 if (err) 1577 if (err)
@@ -1571,22 +1602,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1571 fill_pre_wcc(tfhp); 1602 fill_pre_wcc(tfhp);
1572 1603
1573 odentry = lookup_one_len(fname, fdentry, flen); 1604 odentry = lookup_one_len(fname, fdentry, flen);
1574 err = PTR_ERR(odentry); 1605 host_err = PTR_ERR(odentry);
1575 if (IS_ERR(odentry)) 1606 if (IS_ERR(odentry))
1576 goto out_nfserr; 1607 goto out_nfserr;
1577 1608
1578 err = -ENOENT; 1609 host_err = -ENOENT;
1579 if (!odentry->d_inode) 1610 if (!odentry->d_inode)
1580 goto out_dput_old; 1611 goto out_dput_old;
1581 err = -EINVAL; 1612 host_err = -EINVAL;
1582 if (odentry == trap) 1613 if (odentry == trap)
1583 goto out_dput_old; 1614 goto out_dput_old;
1584 1615
1585 ndentry = lookup_one_len(tname, tdentry, tlen); 1616 ndentry = lookup_one_len(tname, tdentry, tlen);
1586 err = PTR_ERR(ndentry); 1617 host_err = PTR_ERR(ndentry);
1587 if (IS_ERR(ndentry)) 1618 if (IS_ERR(ndentry))
1588 goto out_dput_old; 1619 goto out_dput_old;
1589 err = -ENOTEMPTY; 1620 host_err = -ENOTEMPTY;
1590 if (ndentry == trap) 1621 if (ndentry == trap)
1591 goto out_dput_new; 1622 goto out_dput_new;
1592 1623
@@ -1594,14 +1625,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1594 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1625 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1595 ((atomic_read(&odentry->d_count) > 1) 1626 ((atomic_read(&odentry->d_count) > 1)
1596 || (atomic_read(&ndentry->d_count) > 1))) { 1627 || (atomic_read(&ndentry->d_count) > 1))) {
1597 err = -EPERM; 1628 host_err = -EPERM;
1598 } else 1629 } else
1599#endif 1630#endif
1600 err = vfs_rename(fdir, odentry, tdir, ndentry); 1631 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1601 if (!err && EX_ISSYNC(tfhp->fh_export)) { 1632 if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
1602 err = nfsd_sync_dir(tdentry); 1633 host_err = nfsd_sync_dir(tdentry);
1603 if (!err) 1634 if (!host_err)
1604 err = nfsd_sync_dir(fdentry); 1635 host_err = nfsd_sync_dir(fdentry);
1605 } 1636 }
1606 1637
1607 out_dput_new: 1638 out_dput_new:
@@ -1609,8 +1640,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1609 out_dput_old: 1640 out_dput_old:
1610 dput(odentry); 1641 dput(odentry);
1611 out_nfserr: 1642 out_nfserr:
1612 if (err) 1643 err = nfserrno(host_err);
1613 err = nfserrno(err);
1614 1644
1615 /* we cannot rely on fh_unlock on the two filehandles, 1645
1616 * as that would do the wrong thing if the two directories 1646 * as that would do the wrong thing if the two directories
@@ -1629,13 +1659,14 @@ out:
1629 * Unlink a file or directory 1659 * Unlink a file or directory
1630 * N.B. After this call fhp needs an fh_put 1660 * N.B. After this call fhp needs an fh_put
1631 */ 1661 */
1632int 1662__be32
1633nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 1663nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1634 char *fname, int flen) 1664 char *fname, int flen)
1635{ 1665{
1636 struct dentry *dentry, *rdentry; 1666 struct dentry *dentry, *rdentry;
1637 struct inode *dirp; 1667 struct inode *dirp;
1638 int err; 1668 __be32 err;
1669 int host_err;
1639 1670
1640 err = nfserr_acces; 1671 err = nfserr_acces;
1641 if (!flen || isdotent(fname, flen)) 1672 if (!flen || isdotent(fname, flen))
@@ -1649,7 +1680,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1649 dirp = dentry->d_inode; 1680 dirp = dentry->d_inode;
1650 1681
1651 rdentry = lookup_one_len(fname, dentry, flen); 1682 rdentry = lookup_one_len(fname, dentry, flen);
1652 err = PTR_ERR(rdentry); 1683 host_err = PTR_ERR(rdentry);
1653 if (IS_ERR(rdentry)) 1684 if (IS_ERR(rdentry))
1654 goto out_nfserr; 1685 goto out_nfserr;
1655 1686
@@ -1666,22 +1697,23 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1666#ifdef MSNFS 1697#ifdef MSNFS
1667 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1698 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1668 (atomic_read(&rdentry->d_count) > 1)) { 1699 (atomic_read(&rdentry->d_count) > 1)) {
1669 err = -EPERM; 1700 host_err = -EPERM;
1670 } else 1701 } else
1671#endif 1702#endif
1672 err = vfs_unlink(dirp, rdentry); 1703 host_err = vfs_unlink(dirp, rdentry);
1673 } else { /* It's RMDIR */ 1704 } else { /* It's RMDIR */
1674 err = vfs_rmdir(dirp, rdentry); 1705 host_err = vfs_rmdir(dirp, rdentry);
1675 } 1706 }
1676 1707
1677 dput(rdentry); 1708 dput(rdentry);
1678 1709
1679 if (err == 0 && 1710 if (host_err)
1680 EX_ISSYNC(fhp->fh_export)) 1711 goto out_nfserr;
1681 err = nfsd_sync_dir(dentry); 1712 if (EX_ISSYNC(fhp->fh_export))
1713 host_err = nfsd_sync_dir(dentry);
1682 1714
1683out_nfserr: 1715out_nfserr:
1684 err = nfserrno(err); 1716 err = nfserrno(host_err);
1685out: 1717out:
1686 return err; 1718 return err;
1687} 1719}
@@ -1690,11 +1722,12 @@ out:
1690 * Read entries from a directory. 1722 * Read entries from a directory.
1691 * The NFSv3/4 verifier we ignore for now. 1723 * The NFSv3/4 verifier we ignore for now.
1692 */ 1724 */
1693int 1725__be32
1694nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 1726nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1695 struct readdir_cd *cdp, encode_dent_fn func) 1727 struct readdir_cd *cdp, encode_dent_fn func)
1696{ 1728{
1697 int err; 1729 __be32 err;
1730 int host_err;
1698 struct file *file; 1731 struct file *file;
1699 loff_t offset = *offsetp; 1732 loff_t offset = *offsetp;
1700 1733
@@ -1716,10 +1749,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1716 1749
1717 do { 1750 do {
1718 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1751 cdp->err = nfserr_eof; /* will be cleared on successful read */
1719 err = vfs_readdir(file, (filldir_t) func, cdp); 1752 host_err = vfs_readdir(file, (filldir_t) func, cdp);
1720 } while (err >=0 && cdp->err == nfs_ok); 1753 } while (host_err >=0 && cdp->err == nfs_ok);
1721 if (err) 1754 if (host_err)
1722 err = nfserrno(err); 1755 err = nfserrno(host_err);
1723 else 1756 else
1724 err = cdp->err; 1757 err = cdp->err;
1725 *offsetp = vfs_llseek(file, 0, 1); 1758 *offsetp = vfs_llseek(file, 0, 1);
@@ -1736,10 +1769,10 @@ out:
1736 * Get file system stats 1769 * Get file system stats
1737 * N.B. After this call fhp needs an fh_put 1770 * N.B. After this call fhp needs an fh_put
1738 */ 1771 */
1739int 1772__be32
1740nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1773nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1741{ 1774{
1742 int err = fh_verify(rqstp, fhp, 0, MAY_NOP); 1775 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
1743 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1776 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1744 err = nfserr_io; 1777 err = nfserr_io;
1745 return err; 1778 return err;
@@ -1748,7 +1781,7 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1748/* 1781/*
1749 * Check for a user's access permissions to this inode. 1782 * Check for a user's access permissions to this inode.
1750 */ 1783 */
1751int 1784__be32
1752nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1785nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1753{ 1786{
1754 struct inode *inode = dentry->d_inode; 1787 struct inode *inode = dentry->d_inode;
@@ -1829,11 +1862,11 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1829void 1862void
1830nfsd_racache_shutdown(void) 1863nfsd_racache_shutdown(void)
1831{ 1864{
1832 if (!raparm_cache) 1865 if (!raparml)
1833 return; 1866 return;
1834 dprintk("nfsd: freeing readahead buffers.\n"); 1867 dprintk("nfsd: freeing readahead buffers.\n");
1835 kfree(raparml); 1868 kfree(raparml);
1836 raparm_cache = raparml = NULL; 1869 raparml = NULL;
1837} 1870}
1838/* 1871/*
1839 * Initialize readahead param cache 1872 * Initialize readahead param cache
@@ -1842,19 +1875,31 @@ int
1842nfsd_racache_init(int cache_size) 1875nfsd_racache_init(int cache_size)
1843{ 1876{
1844 int i; 1877 int i;
1878 int j = 0;
1879 int nperbucket;
1880
1845 1881
1846 if (raparm_cache) 1882 if (raparml)
1847 return 0; 1883 return 0;
1884 if (cache_size < 2*RAPARM_HASH_SIZE)
1885 cache_size = 2*RAPARM_HASH_SIZE;
1848 raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL); 1886 raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL);
1849 1887
1850 if (raparml != NULL) { 1888 if (raparml != NULL) {
1851 dprintk("nfsd: allocating %d readahead buffers.\n", 1889 dprintk("nfsd: allocating %d readahead buffers.\n",
1852 cache_size); 1890 cache_size);
1891 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
1892 raparm_hash[i].pb_head = NULL;
1893 spin_lock_init(&raparm_hash[i].pb_lock);
1894 }
1895 nperbucket = cache_size >> RAPARM_HASH_BITS;
1853 memset(raparml, 0, sizeof(struct raparms) * cache_size); 1896 memset(raparml, 0, sizeof(struct raparms) * cache_size);
1854 for (i = 0; i < cache_size - 1; i++) { 1897 for (i = 0; i < cache_size - 1; i++) {
1855 raparml[i].p_next = raparml + i + 1; 1898 if (i % nperbucket == 0)
1899 raparm_hash[j++].pb_head = raparml + i;
1900 if (i % nperbucket < nperbucket-1)
1901 raparml[i].p_next = raparml + i + 1;
1856 } 1902 }
1857 raparm_cache = raparml;
1858 } else { 1903 } else {
1859 printk(KERN_WARNING 1904 printk(KERN_WARNING
1860 "nfsd: Could not allocate memory read-ahead cache.\n"); 1905 "nfsd: Could not allocate memory read-ahead cache.\n");
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index e1fceb8aa32d..d11753c50bc1 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -152,14 +152,16 @@ static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
152 struct o2nm_node *node, *ret = NULL; 152 struct o2nm_node *node, *ret = NULL;
153 153
154 while (*p) { 154 while (*p) {
155 int cmp;
156
155 parent = *p; 157 parent = *p;
156 node = rb_entry(parent, struct o2nm_node, nd_ip_node); 158 node = rb_entry(parent, struct o2nm_node, nd_ip_node);
157 159
158 if (memcmp(&ip_needle, &node->nd_ipv4_address, 160 cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
159 sizeof(ip_needle)) < 0) 161 sizeof(ip_needle));
162 if (cmp < 0)
160 p = &(*p)->rb_left; 163 p = &(*p)->rb_left;
161 else if (memcmp(&ip_needle, &node->nd_ipv4_address, 164 else if (cmp > 0)
162 sizeof(ip_needle)) > 0)
163 p = &(*p)->rb_right; 165 p = &(*p)->rb_right;
164 else { 166 else {
165 ret = node; 167 ret = node;
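The o2nm change above is a common descent-loop cleanup: evaluate memcmp() once per node and branch on the saved result, instead of running the same comparison twice. The same shape for a generic binary search tree:

#include <stddef.h>
#include <string.h>

struct node {
	unsigned char key[4];
	struct node *left, *right;
};

static struct node *tree_find(struct node *n, const unsigned char needle[4])
{
	while (n) {
		int cmp = memcmp(needle, n->key, 4);    /* evaluated once */

		if (cmp < 0)
			n = n->left;
		else if (cmp > 0)
			n = n->right;
		else
			return n;
	}
	return NULL;
}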
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d9ba0a931a03..1be74c4e7814 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -30,6 +30,7 @@
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/uio.h> 32#include <linux/uio.h>
33#include <linux/sched.h>
33 34
34#define MLOG_MASK_PREFIX ML_INODE 35#define MLOG_MASK_PREFIX ML_INODE
35#include <cluster/masklog.h> 36#include <cluster/masklog.h>
@@ -691,6 +692,12 @@ static int ocfs2_zero_extend(struct inode *inode,
691 } 692 }
692 693
693 start_off += sb->s_blocksize; 694 start_off += sb->s_blocksize;
695
696 /*
697 * Very large extends have the potential to lock up
698 * the cpu for extended periods of time.
699 */
700 cond_resched();
694 } 701 }
695 702
696out: 703out:
@@ -728,31 +735,36 @@ static int ocfs2_extend_file(struct inode *inode,
728 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 735 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
729 OCFS2_I(inode)->ip_clusters; 736 OCFS2_I(inode)->ip_clusters;
730 737
731 if (clusters_to_add) { 738 /*
732 /* 739 * protect the pages that ocfs2_zero_extend is going to be
733 * protect the pages that ocfs2_zero_extend is going to 740 * pulling into the page cache.. we do this before the
734 * be pulling into the page cache.. we do this before the 741 * metadata extend so that we don't get into the situation
735 * metadata extend so that we don't get into the situation 742 * where we've extended the metadata but can't get the data
736 * where we've extended the metadata but can't get the data 743 * lock to zero.
737 * lock to zero. 744 */
738 */ 745 ret = ocfs2_data_lock(inode, 1);
739 ret = ocfs2_data_lock(inode, 1); 746 if (ret < 0) {
740 if (ret < 0) { 747 mlog_errno(ret);
741 mlog_errno(ret); 748 goto out;
742 goto out; 749 }
743 }
744 750
751 if (clusters_to_add) {
745 ret = ocfs2_extend_allocation(inode, clusters_to_add); 752 ret = ocfs2_extend_allocation(inode, clusters_to_add);
746 if (ret < 0) { 753 if (ret < 0) {
747 mlog_errno(ret); 754 mlog_errno(ret);
748 goto out_unlock; 755 goto out_unlock;
749 } 756 }
757 }
750 758
751 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 759 /*
752 if (ret < 0) { 760 * Call this even if we don't add any clusters to the tree. We
753 mlog_errno(ret); 761 * still need to zero the area between the old i_size and the
754 goto out_unlock; 762 * new i_size.
755 } 763 */
764 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
765 if (ret < 0) {
766 mlog_errno(ret);
767 goto out_unlock;
756 } 768 }
757 769
758 if (!tail_to_skip) { 770 if (!tail_to_skip) {
@@ -764,8 +776,7 @@ static int ocfs2_extend_file(struct inode *inode,
764 } 776 }
765 777
766out_unlock: 778out_unlock:
767 if (clusters_to_add) /* this is the only case in which we lock */ 779 ocfs2_data_unlock(inode, 1);
768 ocfs2_data_unlock(inode, 1);
769 780
770out: 781out:
771 return ret; 782 return ret;
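The ocfs2_extend_file() restructuring above replaces "lock only when clusters_to_add" with "lock always, unlock always", so the zero-extend between the old and new i_size is covered even when no clusters are added, and the error paths no longer have to track whether the lock is held. The shape, with hypothetical stand-in functions:

static int lock_data(void)              { return 0; }
static void unlock_data(void)           { }
static int extend_allocation(int n)     { (void)n; return 0; }
static int zero_extend(void)            { return 0; }

static int extend_file(int clusters_to_add)
{
	int ret;

	ret = lock_data();              /* unconditional */
	if (ret < 0)
		return ret;

	if (clusters_to_add) {          /* only the allocation is optional */
		ret = extend_allocation(clusters_to_add);
		if (ret < 0)
			goto out_unlock;
	}

	ret = zero_extend();            /* runs even with zero new clusters */

out_unlock:
	unlock_data();                  /* always pairs with the lock above */
	return ret;
}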
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 259155f0eb2e..a57b751d4f40 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1085,14 +1085,6 @@ static int ocfs2_rename(struct inode *old_dir,
1085 BUG(); 1085 BUG();
1086 } 1086 }
1087 1087
1088 if (atomic_read(&old_dentry->d_count) > 2) {
1089 shrink_dcache_parent(old_dentry);
1090 if (atomic_read(&old_dentry->d_count) > 2) {
1091 status = -EBUSY;
1092 goto bail;
1093 }
1094 }
1095
1096 /* Assume a directory hierarchy thusly: 1088 /* Assume a directory hierarchy thusly:
1097 * a/b/c 1089 * a/b/c
1098 * a/d 1090 * a/d
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e6..76b46ebbb10c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -339,7 +339,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
339 339
340#if BITS_PER_LONG == 32 340#if BITS_PER_LONG == 32
341# if defined(CONFIG_LBD) 341# if defined(CONFIG_LBD)
342 BUG_ON(sizeof(sector_t) != 8); 342 BUILD_BUG_ON(sizeof(sector_t) != 8);
343 pagefactor = PAGE_CACHE_SIZE; 343 pagefactor = PAGE_CACHE_SIZE;
344 bitshift = BITS_PER_LONG; 344 bitshift = BITS_PER_LONG;
345# else 345# else
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 51c6a748df49..6fb4b6150d77 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -376,18 +376,48 @@ static char *make_block_name(struct gendisk *disk)
376 return name; 376 return name;
377} 377}
378 378
379static void disk_sysfs_symlinks(struct gendisk *disk) 379static int disk_sysfs_symlinks(struct gendisk *disk)
380{ 380{
381 struct device *target = get_device(disk->driverfs_dev); 381 struct device *target = get_device(disk->driverfs_dev);
382 int err;
383 char *disk_name = NULL;
384
382 if (target) { 385 if (target) {
383 char *disk_name = make_block_name(disk); 386 disk_name = make_block_name(disk);
384 sysfs_create_link(&disk->kobj,&target->kobj,"device"); 387 if (!disk_name) {
385 if (disk_name) { 388 err = -ENOMEM;
386 sysfs_create_link(&target->kobj,&disk->kobj,disk_name); 389 goto err_out;
387 kfree(disk_name);
388 } 390 }
391
392 err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
393 if (err)
394 goto err_out_disk_name;
395
396 err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
397 if (err)
398 goto err_out_dev_link;
389 } 399 }
390 sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem"); 400
401 err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
402 "subsystem");
403 if (err)
404 goto err_out_disk_name_lnk;
405
406 kfree(disk_name);
407
408 return 0;
409
410err_out_disk_name_lnk:
411 if (target) {
412 sysfs_remove_link(&target->kobj, disk_name);
413err_out_dev_link:
414 sysfs_remove_link(&disk->kobj, "device");
415err_out_disk_name:
416 kfree(disk_name);
417err_out:
418 put_device(target);
419 }
420 return err;
391} 421}
392 422
393/* Not exported, helper to add_disk(). */ 423/* Not exported, helper to add_disk(). */
@@ -406,7 +436,11 @@ void register_disk(struct gendisk *disk)
406 *s = '!'; 436 *s = '!';
407 if ((err = kobject_add(&disk->kobj))) 437 if ((err = kobject_add(&disk->kobj)))
408 return; 438 return;
409 disk_sysfs_symlinks(disk); 439 err = disk_sysfs_symlinks(disk);
440 if (err) {
441 kobject_del(&disk->kobj);
442 return;
443 }
410 disk_sysfs_add_subdirs(disk); 444 disk_sysfs_add_subdirs(disk);
411 445
412 /* No minors to use for partitions */ 446 /* No minors to use for partitions */
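disk_sysfs_symlinks() now reports failures instead of ignoring sysfs_create_link()'s return value, and register_disk() tears down the kobject on error. The unwind idiom it adopts, sketched with hypothetical step/undo pairs: each failure jumps to a label that rolls back exactly the steps already completed, in reverse order.

static int step_a(void)  { return 0; }
static int step_b(void)  { return 0; }
static int step_c(void)  { return 0; }
static void undo_a(void) { }
static void undo_b(void) { }

static int setup(void)
{
	int err;

	err = step_a();
	if (err)
		goto err_out;
	err = step_b();
	if (err)
		goto err_undo_a;
	err = step_c();
	if (err)
		goto err_undo_b;
	return 0;

err_undo_b:
	undo_b();
err_undo_a:
	undo_a();
err_out:
	return err;
}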
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 4f8df71e49d3..8c7af1777819 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -32,13 +32,11 @@
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) (get_unaligned(&p->sys_ind))
35#define NR_SECTS(p) ({ __typeof__(p->nr_sects) __a = \ 35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 get_unaligned(&p->nr_sects); \
37 le32_to_cpu(__a); \ 36 le32_to_cpu(__a); \
38 }) 37 })
39 38
40#define START_SECT(p) ({ __typeof__(p->start_sect) __a = \ 39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \
41 get_unaligned(&p->start_sect); \
42 le32_to_cpu(__a); \ 40 le32_to_cpu(__a); \
43 }) 41 })
44 42
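The msdos.c cleanup above collapses a __typeof__ dance into a single __le32 temporary: read the possibly unaligned on-disk field, then byte-swap it. A portable userspace equivalent of that read, with memcpy standing in for the kernel's get_unaligned() and an illustrative partition-entry layout:

#include <stdint.h>
#include <string.h>

static inline uint32_t get_unaligned_le32(const void *p)
{
	uint8_t b[4];

	memcpy(b, p, 4);        /* safe regardless of alignment */
	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

struct mbr_part {               /* hypothetical packed-layout stand-in */
	unsigned char start_sect[4];
	unsigned char nr_sects[4];
};

#define NR_SECTS(p)   get_unaligned_le32((p)->nr_sects)
#define START_SECT(p) get_unaligned_le32((p)->start_sect)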
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 82da55b5cffe..8df27401d292 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -72,6 +72,7 @@
72#include <linux/audit.h> 72#include <linux/audit.h>
73#include <linux/poll.h> 73#include <linux/poll.h>
74#include <linux/nsproxy.h> 74#include <linux/nsproxy.h>
75#include <linux/oom.h>
75#include "internal.h" 76#include "internal.h"
76 77
77/* NOTE: 78/* NOTE:
@@ -86,7 +87,7 @@
86 87
87 88
88/* Worst case buffer size needed for holding an integer. */ 89/* Worst case buffer size needed for holding an integer. */
89#define PROC_NUMBUF 10 90#define PROC_NUMBUF 13
90 91
91struct pid_entry { 92struct pid_entry {
92 int len; 93 int len;
@@ -689,7 +690,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
689 if (copy_from_user(buffer, buf, count)) 690 if (copy_from_user(buffer, buf, count))
690 return -EFAULT; 691 return -EFAULT;
691 oom_adjust = simple_strtol(buffer, &end, 0); 692 oom_adjust = simple_strtol(buffer, &end, 0);
692 if ((oom_adjust < -16 || oom_adjust > 15) && oom_adjust != OOM_DISABLE) 693 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
694 oom_adjust != OOM_DISABLE)
693 return -EINVAL; 695 return -EINVAL;
694 if (*end == '\n') 696 if (*end == '\n')
695 end++; 697 end++;
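The PROC_NUMBUF bump from 10 to 13 covers the true worst case for a signed 32-bit value: "-2147483648" is 11 characters, plus a trailing newline and the NUL terminator. A quick check of that arithmetic:

#include <limits.h>
#include <stdio.h>

#define PROC_NUMBUF 13

int main(void)
{
	char buf[PROC_NUMBUF];
	int n = snprintf(buf, sizeof(buf), "%d\n", INT_MIN);

	printf("needs %d bytes including NUL\n", n + 1);   /* prints 13 */
	return (n + 1 <= PROC_NUMBUF) ? 0 : 1;
}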
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8d88e58ed5cc..93c43b676e59 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -647,7 +647,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
647 647
648 if (get_user(c, buf)) 648 if (get_user(c, buf))
649 return -EFAULT; 649 return -EFAULT;
650 __handle_sysrq(c, NULL, NULL, 0); 650 __handle_sysrq(c, NULL, 0);
651 } 651 }
652 return count; 652 return count;
653} 653}
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 1bfae42117ca..e3d466a228d4 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1304,8 +1304,8 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1304 1304
1305 bh = sb_bread(sb, block); 1305 bh = sb_bread(sb, block);
1306 if (bh == NULL) 1306 if (bh == NULL)
1307 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%lu) " 1307 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1308 "reading failed", __FUNCTION__, bh->b_blocknr); 1308 "reading failed", __FUNCTION__, block);
1309 else { 1309 else {
1310 if (buffer_locked(bh)) { 1310 if (buffer_locked(bh)) {
1311 PROC_INFO_INC(sb, scan_bitmap.wait); 1311 PROC_INFO_INC(sb, scan_bitmap.wait);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index c093642fb983..b67ce9354048 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -2,7 +2,6 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <linux/config.h>
6#include <linux/time.h> 5#include <linux/time.h>
7#include <linux/reiserfs_fs.h> 6#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h> 7#include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 7e5a2f5ebeb0..9c69bcacad22 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1780,7 +1780,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 err = -EDQUOT; 1780 err = -EDQUOT;
1781 goto out_end_trans; 1781 goto out_end_trans;
1782 } 1782 }
1783 if (!dir || !dir->i_nlink) { 1783 if (!dir->i_nlink) {
1784 err = -EPERM; 1784 err = -EPERM;
1785 goto out_bad_inode; 1785 goto out_bad_inode;
1786 } 1786 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ad8cbc49883a..85ce23268302 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -53,6 +53,7 @@
53#include <linux/workqueue.h> 53#include <linux/workqueue.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/blkdev.h> 55#include <linux/blkdev.h>
56#include <linux/backing-dev.h>
56 57
57/* gets a struct reiserfs_journal_list * from a list head */ 58/* gets a struct reiserfs_journal_list * from a list head */
58#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ 59#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -970,7 +971,7 @@ int reiserfs_async_progress_wait(struct super_block *s)
970 DEFINE_WAIT(wait); 971 DEFINE_WAIT(wait);
971 struct reiserfs_journal *j = SB_JOURNAL(s); 972 struct reiserfs_journal *j = SB_JOURNAL(s);
972 if (atomic_read(&j->j_async_throttle)) 973 if (atomic_read(&j->j_async_throttle))
973 blk_congestion_wait(WRITE, HZ / 10); 974 congestion_wait(WRITE, HZ / 10);
974 return 0; 975 return 0;
975} 976}
976 977
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c89aa2338191..9041802df832 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -430,20 +430,29 @@ int remove_save_link(struct inode *inode, int truncate)
430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); 430 return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
431} 431}
432 432
433static void reiserfs_put_super(struct super_block *s) 433static void reiserfs_kill_sb(struct super_block *s)
434{ 434{
435 struct reiserfs_transaction_handle th; 435 if (REISERFS_SB(s)) {
436 th.t_trans_id = 0; 436 if (REISERFS_SB(s)->xattr_root) {
437 d_invalidate(REISERFS_SB(s)->xattr_root);
438 dput(REISERFS_SB(s)->xattr_root);
439 REISERFS_SB(s)->xattr_root = NULL;
440 }
437 441
438 if (REISERFS_SB(s)->xattr_root) { 442 if (REISERFS_SB(s)->priv_root) {
439 d_invalidate(REISERFS_SB(s)->xattr_root); 443 d_invalidate(REISERFS_SB(s)->priv_root);
440 dput(REISERFS_SB(s)->xattr_root); 444 dput(REISERFS_SB(s)->priv_root);
445 REISERFS_SB(s)->priv_root = NULL;
446 }
441 } 447 }
442 448
443 if (REISERFS_SB(s)->priv_root) { 449 kill_block_super(s);
444 d_invalidate(REISERFS_SB(s)->priv_root); 450}
445 dput(REISERFS_SB(s)->priv_root); 451
446 } 452static void reiserfs_put_super(struct super_block *s)
453{
454 struct reiserfs_transaction_handle th;
455 th.t_trans_id = 0;
447 456
448 /* change file system state to current state if it was mounted with read-write permissions */ 457 /* change file system state to current state if it was mounted with read-write permissions */
449 if (!(s->s_flags & MS_RDONLY)) { 458 if (!(s->s_flags & MS_RDONLY)) {
@@ -2156,7 +2165,7 @@ struct file_system_type reiserfs_fs_type = {
2156 .owner = THIS_MODULE, 2165 .owner = THIS_MODULE,
2157 .name = "reiserfs", 2166 .name = "reiserfs",
2158 .get_sb = get_super_block, 2167 .get_sb = get_super_block,
2159 .kill_sb = kill_block_super, 2168 .kill_sb = reiserfs_kill_sb,
2160 .fs_flags = FS_REQUIRES_DEV, 2169 .fs_flags = FS_REQUIRES_DEV,
2161}; 2170};
2162 2171
diff --git a/fs/splice.c b/fs/splice.c
index 13e92dd19fbb..a567010b62ac 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -607,7 +607,7 @@ find_page:
607 ret = -ENOMEM; 607 ret = -ENOMEM;
608 page = page_cache_alloc_cold(mapping); 608 page = page_cache_alloc_cold(mapping);
609 if (unlikely(!page)) 609 if (unlikely(!page))
610 goto out_nomem; 610 goto out_ret;
611 611
612 /* 612 /*
613 * This will also lock the page 613 * This will also lock the page
@@ -666,7 +666,7 @@ find_page:
666 if (sd->pos + this_len > isize) 666 if (sd->pos + this_len > isize)
667 vmtruncate(mapping->host, isize); 667 vmtruncate(mapping->host, isize);
668 668
669 goto out; 669 goto out_ret;
670 } 670 }
671 671
672 if (buf->page != page) { 672 if (buf->page != page) {
@@ -698,7 +698,7 @@ find_page:
698out: 698out:
699 page_cache_release(page); 699 page_cache_release(page);
700 unlock_page(page); 700 unlock_page(page);
701out_nomem: 701out_ret:
702 return ret; 702 return ret;
703} 703}
704 704
diff --git a/fs/super.c b/fs/super.c
index aec99ddbe53f..47e554c12e76 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -260,17 +260,17 @@ int fsync_super(struct super_block *sb)
260 * that need destruction out of superblock, call generic_shutdown_super() 260 * that need destruction out of superblock, call generic_shutdown_super()
261 * and release aforementioned objects. Note: dentries and inodes _are_ 261 * and release aforementioned objects. Note: dentries and inodes _are_
262 * taken care of and do not need specific handling. 262 * taken care of and do not need specific handling.
263 *
264 * Upon calling this function, the filesystem may no longer alter or
265 * rearrange the set of dentries belonging to this super_block, nor may it
266 * change the attachments of dentries to inodes.
263 */ 267 */
264void generic_shutdown_super(struct super_block *sb) 268void generic_shutdown_super(struct super_block *sb)
265{ 269{
266 struct dentry *root = sb->s_root;
267 struct super_operations *sop = sb->s_op; 270 struct super_operations *sop = sb->s_op;
268 271
269 if (root) { 272 if (sb->s_root) {
270 sb->s_root = NULL; 273 shrink_dcache_for_umount(sb);
271 shrink_dcache_parent(root);
272 shrink_dcache_sb(sb);
273 dput(root);
274 fsync_super(sb); 274 fsync_super(sb);
275 lock_super(sb); 275 lock_super(sb);
276 sb->s_flags &= ~MS_ACTIVE; 276 sb->s_flags &= ~MS_ACTIVE;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 146f1dedec84..298303b5a716 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -483,17 +483,12 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
483 (victim->d_parent->d_inode == dir->d_inode)) { 483 (victim->d_parent->d_inode == dir->d_inode)) {
484 victim->d_inode->i_mtime = CURRENT_TIME; 484 victim->d_inode->i_mtime = CURRENT_TIME;
485 fsnotify_modify(victim); 485 fsnotify_modify(victim);
486
487 /**
488 * Drop reference from initial sysfs_get_dentry().
489 */
490 dput(victim);
491 res = 0; 486 res = 0;
492 } else 487 } else
493 d_drop(victim); 488 d_drop(victim);
494 489
495 /** 490 /**
496 * Drop the reference acquired from sysfs_get_dentry() above. 491 * Drop the reference acquired from lookup_one_len() above.
497 */ 492 */
498 dput(victim); 493 dput(victim);
499 } 494 }
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 350cba5d6803..dc9e7dc07fb7 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -358,16 +358,11 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long blocknr;
 	int size = 0, i;
 
-	if (1024 != sizeof (struct xenix_super_block))
-		panic("Xenix FS: bad superblock size");
-	if (512 != sizeof (struct sysv4_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (512 != sizeof (struct sysv2_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (500 != sizeof (struct coh_super_block))
-		panic("Coherent FS: bad superblock size");
-	if (64 != sizeof (struct sysv_inode))
-		panic("sysv fs: bad inode size");
+	BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
+	BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
+	BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
 
 	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
 	if (!sbi)
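The sysv hunk replaces run-time panic() checks of on-disk structure sizes with compile-time BUILD_BUG_ON() assertions, so a mis-sized structure now breaks the build instead of crashing at mount time. In kernels of this vintage the macro was built on the negative-array-size trick; a self-contained sketch (MY_BUILD_BUG_ON and the example struct are stand-ins, not the kernel's definitions):

    /* If the condition is true, the array size becomes negative and the
     * compiler rejects the translation unit, so layout bugs surface at
     * build time rather than at mount time. */
    #define MY_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

    struct xenix_super_block_example { char data[1024]; };	/* hypothetical stand-in */

    static void check_layout(void)
    {
    	MY_BUILD_BUG_ON(sizeof(struct xenix_super_block_example) != 1024);
    }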
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1d3b5d2070e5..1aea6a4f9a4a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1621,9 +1621,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		goto error_out;
 	}
 
-	if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY)
+	if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY) {
 		printk("UDF-fs: Partition marked readonly; forcing readonly mount\n");
 		sb->s_flags |= MS_RDONLY;
+	}
 
 	if ( udf_find_fileset(sb, &fileset, &rootdir) )
 	{
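The udf hunk fixes a missing-braces bug: without the braces only the printk() was guarded by the if, so MS_RDONLY was ORed into s_flags on every mount, not just on read-only partitions. The pitfall in miniature (standalone C, hypothetical names):

    #include <stdio.h>

    /* Without braces, only the first indented statement is conditional,
     * despite what the indentation suggests. */
    int mount_flags_buggy(int part_is_readonly, int flags)
    {
    	if (part_is_readonly)
    		printf("forcing readonly mount\n");
    		flags |= 1;	/* BUG: runs unconditionally */
    	return flags;
    }

    int mount_flags_fixed(int part_is_readonly, int flags)
    {
    	if (part_is_readonly) {
    		printf("forcing readonly mount\n");
    		flags |= 1;	/* now only on read-only partitions */
    	}
    	return flags;
    }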
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 22f820a9b15c..17437574f79c 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -184,14 +184,13 @@ void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
 dev_t
 ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 {
-	__fs32 fs32;
+	__u32 fs32;
 	dev_t dev;
 
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		fs32 = ufsi->i_u1.i_data[1];
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]);
 	else
-		fs32 = ufsi->i_u1.i_data[0];
-	fs32 = fs32_to_cpu(sb, fs32);
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]);
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
 	case UFS_ST_SUN:
@@ -212,7 +211,7 @@ ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 void
 ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev)
 {
-	__fs32 fs32;
+	__u32 fs32;
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
@@ -227,11 +226,10 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
 		fs32 = old_encode_dev(dev);
 		break;
 	}
-	fs32 = cpu_to_fs32(sb, fs32);
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		ufsi->i_u1.i_data[1] = fs32;
+		ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32);
 	else
-		ufsi->i_u1.i_data[0] = fs32;
+		ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32);
 }
 
 /**
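The ufs hunks restore the endianness discipline that sparse's __fs32 annotation enforces: on-disk values stay in disk byte order, local variables hold host-order __u32, and fs32_to_cpu()/cpu_to_fs32() are applied exactly at the load or store. A userspace analogue of the same discipline, using ntohl()/htonl() as stand-ins for the sb-dependent converters (the struct and field names are illustrative):

    #include <stdint.h>
    #include <arpa/inet.h>	/* ntohl/htonl stand in for fs32_to_cpu/cpu_to_fs32 */

    struct disk_inode {
    	uint32_t dev_be;	/* stored big-endian on disk (assumption) */
    };

    uint32_t get_dev(const struct disk_inode *di)
    {
    	uint32_t dev = ntohl(di->dev_be);	/* convert once, on load */
    	return dev;
    }

    void set_dev(struct disk_inode *di, uint32_t dev)
    {
    	di->dev_be = htonl(dev);		/* convert once, on store */
    }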
diff --git a/fs/xattr.c b/fs/xattr.c
index c32f15b5f60f..395635100f77 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -135,6 +135,26 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 }
 EXPORT_SYMBOL_GPL(vfs_getxattr);
 
+ssize_t
+vfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	ssize_t error;
+
+	error = security_inode_listxattr(d);
+	if (error)
+		return error;
+	error = -EOPNOTSUPP;
+	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+		error = d->d_inode->i_op->listxattr(d, list, size);
+	} else {
+		error = security_inode_listsecurity(d->d_inode, list, size);
+		if (size && error > size)
+			error = -ERANGE;
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_listxattr);
+
 int
 vfs_removexattr(struct dentry *dentry, char *name)
 {
@@ -346,17 +366,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		return -ENOMEM;
 	}
 
-	error = security_inode_listxattr(d);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-		error = d->d_inode->i_op->listxattr(d, klist, size);
-	} else {
-		error = security_inode_listsecurity(d->d_inode, klist, size);
-		if (size && error > size)
-			error = -ERANGE;
-	}
+	error = vfs_listxattr(d, klist, size);
 	if (error > 0) {
 		if (size && copy_to_user(list, klist, error))
 			error = -EFAULT;
@@ -365,7 +375,6 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		   than XATTR_LIST_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-out:
 	kfree(klist);
 	return error;
 }
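The xattr hunks lift the body of listxattr() into a new exported vfs_listxattr(), so in-kernel users get the security_inode_listxattr() hook and the -EOPNOTSUPP/-ERANGE fallback without duplicating the logic. A hedged sketch of how a kernel caller might use the helper — dump_xattr_names() is illustrative, not part of the patch; passing a NULL buffer with size 0 follows the usual xattr convention of returning the required length:

    #include <linux/fs.h>
    #include <linux/xattr.h>
    #include <linux/slab.h>

    static ssize_t dump_xattr_names(struct dentry *dentry)
    {
    	char *names;
    	ssize_t len;

    	len = vfs_listxattr(dentry, NULL, 0);	/* size query */
    	if (len <= 0)
    		return len;

    	names = kmalloc(len, GFP_KERNEL);
    	if (!names)
    		return -ENOMEM;

    	len = vfs_listxattr(dentry, names, len);
    	/* names now holds a NUL-separated list of attribute names */
    	kfree(names);
    	return len;
    }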
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d59737589815..004baf600611 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
 
@@ -53,7 +54,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		printk(KERN_ERR "XFS: possible memory allocation "
 				"deadlock in %s (mode:0x%x)\n",
 				__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
@@ -131,7 +132,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		printk(KERN_ERR "XFS: possible memory allocation "
 				"deadlock in %s (mode:0x%x)\n",
 				__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
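This diff and the following xfs_buf.c diff follow from the same API move: blk_congestion_wait() became congestion_wait() and migrated from the block layer to the backing-dev layer, hence the new <linux/backing-dev.h> includes. The call sites keep XFS's throttled retry loop; a minimal sketch of that pattern (my_alloc_retry() is illustrative, not part of the patch):

    #include <linux/kernel.h>
    #include <linux/fs.h>		/* WRITE */
    #include <linux/slab.h>
    #include <linux/backing-dev.h>	/* congestion_wait() */

    static void *my_alloc_retry(size_t size, gfp_t flags)
    {
    	void *ptr;

    	do {
    		ptr = kmalloc(size, flags);
    		if (ptr)
    			return ptr;
    		/* Back off until write congestion eases, then retry. */
    		congestion_wait(WRITE, HZ / 50);
    	} while (1);
    }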
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9bbadafdcb00..db5f5a3608ca 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -30,6 +30,7 @@
 #include <linux/hash.h>
 #include <linux/kthread.h>
 #include <linux/migrate.h>
+#include <linux/backing-dev.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
@@ -395,7 +396,7 @@ _xfs_buf_lookup_pages(
 
 		XFS_STATS_INC(xb_page_retries);
 		xfsbufd_wakeup(0, gfp_mask);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 		goto retry;
 	}
 